"vscode:/vscode.git/clone" did not exist on "3b079ec3fadfc95240bc1c48ae86de28b72cc9f2"
Unverified Commit d4f6ceb7 authored by Bengt Lüers's avatar Bengt Lüers Committed by GitHub
Browse files

Merge pull request #1 from RadeonOpenCompute/master

Pull upstream
parents be6a8ba0 981810fe
...@@ -27,6 +27,12 @@ cmake_minimum_required(VERSION 2.8.0) ...@@ -27,6 +27,12 @@ cmake_minimum_required(VERSION 2.8.0)
# #
# #
# Specify name of project to build, install and package
set(PROJECT_NAME "rocm-bandwidth-test")
set(TEST_NAME "${PROJECT_NAME}")
project(${PROJECT_NAME})
# Build is not supported on Windows platform # Build is not supported on Windows platform
if(WIN32) if(WIN32)
message("Windows platfom is not supported") message("Windows platfom is not supported")
...@@ -56,7 +62,6 @@ endif() ...@@ -56,7 +62,6 @@ endif()
set(CMAKE_CXX_FLAGS "-std=c++11") set(CMAKE_CXX_FLAGS "-std=c++11")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-dev")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=return-type") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=return-type")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
...@@ -103,61 +108,11 @@ elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86") ...@@ -103,61 +108,11 @@ elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32")
endif() endif()
# Specify name of project to build, install and package
set(PROJECT_NAME "rocm_bandwidth_test")
set(TEST_NAME "${PROJECT_NAME}")
project(${PROJECT_NAME})
# Set project requirements # Set project requirements
set(ROC_THUNK_NAME "hsakmt") set(ROC_THUNK_NAME "hsakmt")
set(CORE_RUNTIME_NAME "hsa-runtime") set(CORE_RUNTIME_NAME "hsa-runtime")
set(CORE_RUNTIME_TARGET "${CORE_RUNTIME_NAME}64") set(CORE_RUNTIME_TARGET "${CORE_RUNTIME_NAME}64")
# Bind default root directory to look for ROCm artifacts
# such as ROCr header and ROCr, ROCt libraries
set(ROCM_ROOT /opt/rocm/ CACHE PATH "Root of ROCm")
if(DEFINED ENV{CMAKE_PREFIX_PATH})
set(CMAKE_PREFIX_PATH $ENV{CMAKE_PREFIX_PATH})
endif()
if(CMAKE_PREFIX_PATH)
set(ROCM_ROOT ${CMAKE_PREFIX_PATH} CACHE PATH "Root of ROCm")
endif()
# Search for ROCr header file
find_path(ROCR_HDR hsa/hsa.h PATHS ${ROCM_ROOT} PATH_SUFFIXES include )
if (NOT ROCR_HDR)
message("Rocr Header hsa/hsa.h not found")
return()
endif()
# Add directories to look for header files to compile
INCLUDE_DIRECTORIES(${ROCR_HDR})
# Search for ROCr library file
find_library(ROCR_LIB ${CORE_RUNTIME_TARGET} PATHS ${ROCM_ROOT} PATH_SUFFIXES lib lib64)
if (NOT ROCR_LIB)
message("Rocr Library ${CORE_RUNTIME_TARGET} not found")
return()
endif()
# Search for ROCr library file
find_library(ROCT_LIB ${ROC_THUNK_NAME} PATHS ${ROCM_ROOT} PATH_SUFFIXES lib lib64)
if (NOT ROCT_LIB)
message("Roct Library ${ROC_THUNK_NAME} not found")
return()
endif()
# Add ROCr library to be used in linking target
add_library(${CORE_RUNTIME_TARGET} SHARED IMPORTED GLOBAL)
set_target_properties(${CORE_RUNTIME_TARGET} PROPERTIES
IMPORTED_LOCATION "${ROCR_LIB}"
INTERFACE_INCLUDE_DIRECTORIES "${ROCR_HDR}")
# Add ROCr library to be used in linking target
add_library(${ROC_THUNK_NAME} SHARED IMPORTED GLOBAL)
set_target_properties(${ROC_THUNK_NAME} PROPERTIES
IMPORTED_LOCATION "${ROCT_LIB}")
# Add cmake_modules to default module path if it is not # Add cmake_modules to default module path if it is not
# already set and include utils from cmake modules # already set and include utils from cmake modules
if(NOT DEFINED CMAKE_MODULE_PATH) if(NOT DEFINED CMAKE_MODULE_PATH)
...@@ -165,12 +120,29 @@ if(NOT DEFINED CMAKE_MODULE_PATH) ...@@ -165,12 +120,29 @@ if(NOT DEFINED CMAKE_MODULE_PATH)
endif() endif()
include(utils) include(utils)
# find_package(hsa-runtime64) is optional because it can fail when building
# against an older HSA runtime. When find_package fails, fall back to the old
# method of using find_path/find_library to locate the headers and libraries
# required for building RBT
find_package(hsa-runtime64
PATHS /opt/rocm )
if(${hsa-runtime64_FOUND})
message("hsa-runtime64 found @ ${hsa-runtime64_DIR} " )
else()
message("hsa-runtime64 NOT found Resolving to OLD Way" )
find_path(ROCR_HDR hsa.h PATHS "/opt/rocm" PATH_SUFFIXES include/hsa REQUIRED )
INCLUDE_DIRECTORIES(${ROCR_HDR})
# Search for ROCr library file
find_library(ROCR_LIB ${CORE_RUNTIME_TARGET} PATHS "/opt/rocm" PATH_SUFFIXES lib lib64 REQUIRED)
# Search for ROCt library file
find_library(ROCT_LIB ${ROC_THUNK_NAME} PATHS "/opt/rocm" PATH_SUFFIXES lib lib64 REQUIRED)
endif()
# #
# Set the package version for the test. It is critical that this # Set the package version for the test. It is critical that this
# value track what is used in the test source. The code from utils # value track what is used in the test source. The code from utils
# module will parse the string into major, minor and patch sub-fields # module will parse the string into major, minor and patch sub-fields
# #
get_version("1.0.0") get_version()
# Bind the Major, Minor and Patch values # Bind the Major, Minor and Patch values
set(BUILD_VERSION_MAJOR ${VERSION_MAJOR}) set(BUILD_VERSION_MAJOR ${VERSION_MAJOR})
...@@ -192,9 +164,18 @@ aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} Src) ...@@ -192,9 +164,18 @@ aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} Src)
# Build and link the test program # Build and link the test program
add_executable(${TEST_NAME} ${Src}) add_executable(${TEST_NAME} ${Src})
target_link_libraries(${TEST_NAME} ${ROC_THUNK_NAME}) if(${hsa-runtime64_FOUND})
target_link_libraries(${TEST_NAME} ${CORE_RUNTIME_TARGET}) target_link_libraries(${TEST_NAME} PRIVATE hsa-runtime64::hsa-runtime64)
target_link_libraries(${TEST_NAME} c stdc++ dl pthread rt) else()
target_link_libraries(${TEST_NAME} PRIVATE ${ROCR_LIB} ${ROCT_LIB} )
endif()
target_link_libraries(${TEST_NAME} PRIVATE c stdc++ dl pthread rt)
# Update linker flags to include RPATH
# Add --enable-new-dtags to generate DT_RUNPATH
if( DEFINED ENV{ROCM_RPATH})
set ( CMAKE_EXE_LINKER_FLAGS "-Wl,--enable-new-dtags -Wl,--rpath,$ENV{ROCM_RPATH}" )
endif()
# Add install directives for rocm_bandwidth_test # Add install directives for rocm_bandwidth_test
install(TARGETS ${TEST_NAME} RUNTIME DESTINATION bin) install(TARGETS ${TEST_NAME} RUNTIME DESTINATION bin)
...@@ -209,8 +190,12 @@ set(CPACK_PACKAGE_CONTACT "Advanced Micro Devices Inc.") ...@@ -209,8 +190,12 @@ set(CPACK_PACKAGE_CONTACT "Advanced Micro Devices Inc.")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Test to measure PciE bandwidth on ROCm platforms") set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Test to measure PciE bandwidth on ROCm platforms")
# Debian package specific variables # Debian package specific variables
set(CPACK_DEBIAN_PACKAGE_DEPENDS "libstdc++6, hsa-rocr-dev" )
set(CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/rocm_bandwidth_test") set(CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/rocm_bandwidth_test")
# RPM package specific variables
set(CPACK_RPM_PACKAGE_DEPENDS "libstdc++6, hsa-rocr-dev" )
# RPM package specific variables # RPM package specific variables
if(DEFINED CPACK_PACKAGING_INSTALL_PREFIX) if(DEFINED CPACK_PACKAGING_INSTALL_PREFIX)
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "${CPACK_PACKAGING_INSTALL_PREFIX} ${CPACK_PACKAGING_INSTALL_PREFIX}/bin") set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "${CPACK_PACKAGING_INSTALL_PREFIX} ${CPACK_PACKAGING_INSTALL_PREFIX}/bin")
......
File mode changed from 100644 to 100755
...@@ -31,16 +31,23 @@ The following simply lists the steps to build RocBandwidthTest ...@@ -31,16 +31,23 @@ The following simply lists the steps to build RocBandwidthTest
The argument for cmake should be the root folder of RocBandwidthTest The argument for cmake should be the root folder of RocBandwidthTest
test suite test suite
// Assume that the ROC Runtime libraries and headers are located in the following paths:
// libraries : _ABSOLUTE_PATH_TO_ROCR_LIBS_/lib
// headers : _ABSOLUTE_PATH_TO_ROCR_LIBS_/include/hsa
// Note : Both the include and lib folders are under a common path (_ABSOLUTE_PATH_TO_ROCR_LIBS_)
// Builds Debug version // Builds Debug version
// Assumes pwd is .../roc_bandwidth_test/build // Assumes pwd is _ABSOLUTE_PATH_TO_RBT/roc_bandwidth_test/build
e.g. cmake -DROCR_LIB_DIR="Path of ROC Runtime Library Files" \
-DROCR_INC_DIR="Path of ROC Runtime Header Files" \ e.g. cmake -DCMAKE_BUILD_TYPE="Debug" \
-DCMAKE_BUILD_TYPE:STRING=Debug .. -DCMAKE_MODULE_PATH="_ABSOLUTE_PATH_TO_RBT/roc_bandwidth_test/cmake_modules" \
-DCMAKE_PREFIX_PATH="_ABSOLUTE_PATH_TO_ROCR_LIBS_"
..
// Builds Release version - default // Builds Release version - default
// Assumes pwd is .../roc_bandwidth_test/build // Assumes pwd is _ABSOLUTE_PATH_TO_RBT/roc_bandwidth_test/build
e.g. cmake -DROCR_LIB_DIR="Path of ROC Runtime Library Files" \ e.g. cmake -DCMAKE_MODULE_PATH="_ABSOLUTE_PATH_TO_RBT/roc_bandwidth_test/cmake_modules" \
-DROCR_INC_DIR="Path of ROC Runtime Header Files" \ -DCMAKE_PREFIX_PATH="_ABSOLUTE_PATH_TO_ROCR_LIBS_"
.. ..
--- Invoke the native build rules generated by cmake to build the various --- Invoke the native build rules generated by cmake to build the various
......
File mode changed from 100644 to 100755
...@@ -43,7 +43,7 @@ ...@@ -43,7 +43,7 @@
#ifndef ROC_BANDWIDTH_TEST_BASE_H_ #ifndef ROC_BANDWIDTH_TEST_BASE_H_
#define ROC_BANDWIDTH_TEST_BASE_H_ #define ROC_BANDWIDTH_TEST_BASE_H_
#include "hsa/hsa.h" #include "hsa.h"
#include <iostream> #include <iostream>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -56,7 +56,7 @@ class BaseTest { ...@@ -56,7 +56,7 @@ class BaseTest {
public: public:
BaseTest(size_t num = 3); BaseTest(size_t num_iter = 4);
virtual ~BaseTest(); virtual ~BaseTest();
...@@ -74,8 +74,8 @@ class BaseTest { ...@@ -74,8 +74,8 @@ class BaseTest {
virtual void Display() const = 0; virtual void Display() const = 0;
// @Brief: Set number of iterations to run // @Brief: Set number of iterations to run
void set_num_iteration(size_t num) { void set_num_iteration(size_t num_iter) {
num_iteration_ = num; num_iteration_ = num_iter;
return; return;
} }
......
...@@ -45,39 +45,67 @@ ...@@ -45,39 +45,67 @@
## the major, minor and patch variables. ## the major, minor and patch variables.
function( parse_version VERSION_STRING ) function( parse_version VERSION_STRING )
# Get index of '-' character in input string
string ( FIND ${VERSION_STRING} "-" STRING_INDEX ) string ( FIND ${VERSION_STRING} "-" STRING_INDEX )
# If there is string after '-' character, capture
# it in COMMIT_INFO string
if ( ${STRING_INDEX} GREATER -1 ) if ( ${STRING_INDEX} GREATER -1 )
math ( EXPR STRING_INDEX "${STRING_INDEX} + 1" ) math ( EXPR STRING_INDEX "${STRING_INDEX} + 1" )
string ( SUBSTRING ${VERSION_STRING} ${STRING_INDEX} -1 VERSION_BUILD ) string ( SUBSTRING ${VERSION_STRING} ${STRING_INDEX} -1 COMMIT_INFO )
endif () endif ()
string ( REGEX MATCHALL "[0123456789]+" VERSIONS ${VERSION_STRING} ) # Parse string into tokens that consist of only numerical
list ( LENGTH VERSIONS VERSION_COUNT ) # substrings and obtain it as a list
string ( REGEX MATCHALL "[0123456789]+" TOKENS ${VERSION_STRING} )
list ( LENGTH TOKENS TOKEN_COUNT )
if ( ${VERSION_COUNT} GREATER 0) # Get Major Id of the version
list ( GET VERSIONS 0 MAJOR ) if ( ${TOKEN_COUNT} GREATER 0)
list ( GET TOKENS 0 MAJOR )
set ( VERSION_MAJOR ${MAJOR} PARENT_SCOPE ) set ( VERSION_MAJOR ${MAJOR} PARENT_SCOPE )
set ( TEMP_VERSION_STRING "${MAJOR}" )
endif () endif ()
if ( ${VERSION_COUNT} GREATER 1 ) # Get Minor Id of the version
list ( GET VERSIONS 1 MINOR ) if ( ${TOKEN_COUNT} GREATER 1 )
list ( GET TOKENS 1 MINOR )
set ( VERSION_MINOR ${MINOR} PARENT_SCOPE ) set ( VERSION_MINOR ${MINOR} PARENT_SCOPE )
set ( TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${MINOR}" )
endif () endif ()
if ( ${VERSION_COUNT} GREATER 2 ) # Get Patch Id of the version
list ( GET VERSIONS 2 PATCH ) if ( ${TOKEN_COUNT} GREATER 2 )
list ( GET TOKENS 2 PATCH )
set ( VERSION_PATCH ${PATCH} PARENT_SCOPE ) set ( VERSION_PATCH ${PATCH} PARENT_SCOPE )
set ( TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${PATCH}" )
endif () endif ()
if ( DEFINED VERSION_BUILD ) # Return if commit info is not present
set ( VERSION_BUILD "${VERSION_BUILD}" PARENT_SCOPE ) if ( NOT DEFINED COMMIT_INFO )
return()
endif()
# Parse Commit string if present into number of
# commits and hash of last commit
string ( FIND ${COMMIT_INFO} "-" STRING_INDEX )
if ( ${STRING_INDEX} GREATER -1 )
math ( EXPR STRING_INDEX "${STRING_INDEX} + 1" )
string ( SUBSTRING ${COMMIT_INFO} ${STRING_INDEX} -1 COMMIT_HASH )
endif () endif ()
set ( VERSION_STRING "${TEMP_VERSION_STRING}" PARENT_SCOPE ) string ( REGEX MATCHALL "[0123456789]+" TOKENS ${COMMIT_INFO} )
list ( LENGTH TOKENS TOKEN_COUNT )
if ( ${TOKEN_COUNT} GREATER 0)
list ( GET TOKENS 0 COMMIT_CNT )
endif ()
# Add Build Info from Jenkins
set ( ROCM_BUILD_ID "DevBld" CACHE STRING "Local Build Id" FORCE )
if(DEFINED ENV{ROCM_BUILD_ID})
set ( ROCM_BUILD_ID $ENV{ROCM_BUILD_ID} CACHE STRING "Jenkins Build Id" FORCE )
endif()
# Update Version Patch to include Number of Commits and hash of HEAD
set ( VERSION_PATCH "${PATCH}.${COMMIT_CNT}-${ROCM_BUILD_ID}-${COMMIT_HASH}" PARENT_SCOPE )
endfunction () endfunction ()
...@@ -85,31 +113,27 @@ endfunction () ...@@ -85,31 +113,27 @@ endfunction ()
## using versioning tags and git describe. ## using versioning tags and git describe.
## Passes back a packaging version string ## Passes back a packaging version string
## and a library version string. ## and a library version string.
function ( get_version DEFAULT_VERSION_STRING ) function ( get_version )
parse_version ( ${DEFAULT_VERSION_STRING} )
# Bind the program git that will be
# used to query its tag that describes
find_program ( GIT NAMES git ) find_program ( GIT NAMES git )
if ( GIT ) if ( GIT )
execute_process ( COMMAND "git describe --dirty --long --match [0-9]* 2> /dev/null" execute_process ( COMMAND git describe --long --match [0-9]*
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE GIT_TAG_STRING OUTPUT_VARIABLE GIT_TAG_STRING
OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE RESULT ) RESULT_VARIABLE RESULT )
if ( ${RESULT} EQUAL 0 ) if ( ${RESULT} EQUAL 0 )
parse_version ( ${GIT_TAG_STRING} ) parse_version ( ${GIT_TAG_STRING} )
endif () endif ()
endif () endif ()
set( VERSION_STRING "${VERSION_STRING}" PARENT_SCOPE ) # Propagate values bound to parent scope
set( VERSION_MAJOR "${VERSION_MAJOR}" PARENT_SCOPE ) set( VERSION_MAJOR "${VERSION_MAJOR}" PARENT_SCOPE )
set( VERSION_MINOR "${VERSION_MINOR}" PARENT_SCOPE ) set( VERSION_MINOR "${VERSION_MINOR}" PARENT_SCOPE )
set( VERSION_PATCH "${VERSION_PATCH}" PARENT_SCOPE ) set( VERSION_PATCH "${VERSION_PATCH}" PARENT_SCOPE )
set( VERSION_BUILD "${VERSION_BUILD}" PARENT_SCOPE )
endfunction() endfunction()
File mode changed from 100644 to 100755
...@@ -48,8 +48,8 @@ ...@@ -48,8 +48,8 @@
#include <vector> #include <vector>
#include <cmath> #include <cmath>
#include <stdio.h> #include <stdio.h>
#include "hsa/hsa.h" #include "hsa.h"
#include "hsa/hsa_ext_amd.h" #include "hsa_ext_amd.h"
using namespace std; using namespace std;
......
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
#include "hsatimer.hpp"
#define NANOSECONDS_PER_SECOND 1000000000
// Construct an empty timer collection and cache the result of
// MeasureTSCFreqHz() in freq_in_100mhz, so the TSC calibration is
// performed once per PerfTimer object.
PerfTimer::PerfTimer()
    : freq_in_100mhz(MeasureTSCFreqHz()) {
}
// Free every Timer object held in the timer list and leave the
// list empty.
PerfTimer::~PerfTimer() {
  // Walk from the last entry to the first, the same order in which
  // entries are released when popping from the back.
  for (size_t remaining = _timers.size(); remaining > 0; --remaining) {
    delete _timers[remaining - 1];
  }
  _timers.clear();
}
// Create a new timer instance and return its index
int PerfTimer::CreateTimer() {
Timer *newTimer = new Timer;
newTimer->_start = 0.0;
newTimer->_clocks = 0.0;
#ifdef __linux__
newTimer->_freq = NANOSECONDS_PER_SECOND;
#endif
// Save the timer object in timer list
_timers.push_back(newTimer);
return (int)(_timers.size() - 1);
}
int PerfTimer::StartTimer(int index) {
if (index >= (int)_timers.size()) {
Error("Cannot reset timer. Invalid handle.");
return HSA_FAILURE;
}
#ifdef __linux__
// General Linux timing method
#ifndef _AMD
struct timespec s;
clock_gettime(CLOCK_MONOTONIC, &s);
_timers[index]->_start =
(long long)s.tv_sec * NANOSECONDS_PER_SECOND + (long long)s.tv_nsec;
// AMD Linux timing method
#else
unsigned int unused;
_timers[index]->_start = __rdtscp(&unused);
#endif
#endif
return HSA_SUCCESS;
}
int PerfTimer::StopTimer(int index) {
long long n = 0;
if (index >= (int)_timers.size()) {
Error("Cannot reset timer. Invalid handle.");
return HSA_FAILURE;
}
#ifdef __linux__
// General Linux timing method
#ifndef _AMD
struct timespec s;
clock_gettime(CLOCK_MONOTONIC, &s);
n = (long long)s.tv_sec * NANOSECONDS_PER_SECOND + (long long)s.tv_nsec;
// AMD Linux timing
#else
unsigned int unused;
n = __rdtscp(&unused);
#endif
#endif
n -= _timers[index]->_start;
_timers[index]->_start = 0;
#ifndef _AMD
_timers[index]->_clocks += n;
#endif
#ifdef __linux__
//_timers[index]->_clocks += 10 * n /freq_in_100mhz; // unit is ns
_timers[index]->_clocks += 1.0E-6 * 10 * n / freq_in_100mhz; // convert to ms
// cout << "_AMD is enabled!!!" << endl;
#endif
return HSA_SUCCESS;
}
void PerfTimer::Error(string str) { cout << str << endl; }
double PerfTimer::ReadTimer(int index) {
if (index >= (int)_timers.size()) {
Error("Cannot read timer. Invalid handle.");
return HSA_FAILURE;
}
double reading = double(_timers[index]->_clocks);
reading = double(reading / _timers[index]->_freq);
return reading;
}
// Clear the accumulated clocks and the start point of a timer.
//
// index: value previously returned by CreateTimer(). When it does
// not refer to an existing timer the problem is reported through
// Error() and the process exits with status 1.
void PerfTimer::ResetTimer(int index) {
  // Check that the index lies inside the timer list. Negative values
  // must be rejected too: used as a subscript they would be converted
  // to a very large unsigned position.
  if (index < 0 || index >= (int)_timers.size()) {
    Error("Invalid index value\n");
    exit(1);
  }

  _timers[index]->_clocks = 0.0;
  _timers[index]->_start = 0.0;
}
// Return a coarse timestamp in microseconds.
//
// On Linux the value is read from CLOCK_MONOTONIC_RAW. On other
// platforms it is read with gettimeofday(), so that every build
// returns a value instead of running off the end of the function.
// The gettimeofday() value is wall-clock time and may be affected
// by clock adjustments.
uint64_t PerfTimer::CoarseTimestampUs() {
#ifdef __linux__
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
  return uint64_t(ts.tv_sec) * 1000000 + ts.tv_nsec / 1000;
#else
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return uint64_t(tv.tv_sec) * 1000000 + tv.tv_usec;
#endif
}
// Estimate the time-stamp counter frequency by spinning until the
// counter has advanced by one billion ticks and comparing that tick
// count with the elapsed coarse time.
//
// Returns ticks-per-nanosecond multiplied by ten and rounded to the
// nearest integer, i.e. the frequency expressed in units of 100 MHz
// rather than in Hz as the name suggests.
//
// NOTE(review): the final division has the elapsed coarse interval
// as divisor; a zero interval is not handled here.
uint64_t PerfTimer::MeasureTSCFreqHz() {
  // Number of counter ticks to spin for
  const uint64_t kCalibrationTicks = 1000000000;
  unsigned int aux;

  const uint64_t startUs = CoarseTimestampUs();
  const uint64_t firstTick = __rdtscp(&aux);

  // Busy-wait until the counter has advanced far enough
  uint64_t lastTick = __rdtscp(&aux);
  while (lastTick - firstTick < kCalibrationTicks) {
    lastTick = __rdtscp(&aux);
  }
  const uint64_t stopUs = CoarseTimestampUs();

  // Adding half of the divisor before dividing rounds to nearest
  const uint64_t elapsedNs = (stopUs - startUs) * 1000;
  const uint64_t elapsedTicks = lastTick - firstTick;
  return (elapsedTicks * 10 + (elapsedNs / 2)) / elapsedNs;
}
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
#ifndef ROC_BANDWIDTH_TEST_MYTIME_H_
#define ROC_BANDWIDTH_TEST_MYTIME_H_
// Uses either the AMD timer or the general Linux timer, selected by the
// user at compile time through a compilation flag. The Windows platform
// is not currently supported
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <x86intrin.h>
#include <string.h>
#include <iostream>
#include <vector>
#include <string>
using namespace std;
#include <sys/time.h>
#define HSA_FAILURE 1
#define HSA_SUCCESS 0
// Collection of timers addressed by integer index.
//
// A timer is created with CreateTimer(), driven with StartTimer() and
// StopTimer(), queried with ReadTimer() and cleared with ResetTimer().
// The object owns the Timer records it allocates and releases them in
// its destructor, so copying is disabled: a copy would share the same
// records and release them a second time.
class PerfTimer {
 private:
  // Bookkeeping record for one timer
  struct Timer {
    std::string name;   // Name of the timer object
    long long _freq;    // Frequency value that readings are divided by
    long long _clocks;  // Clocks accumulated over completed start/stop pairs
    long long _start;   // Start point captured by StartTimer()
  };

  std::vector<Timer*> _timers;  // Timer records, indexed by timer index
  double freq_in_100mhz;        // Value returned by MeasureTSCFreqHz()

  // Not copyable: declared but intentionally left without a definition
  PerfTimer(const PerfTimer&);
  PerfTimer& operator=(const PerfTimer&);

  // AMD timing method
  uint64_t CoarseTimestampUs();
  uint64_t MeasureTSCFreqHz();

 public:
  PerfTimer();
  ~PerfTimer();

  // Create a timer and return its index
  int CreateTimer();

  // Start, stop or clear the timer at the given index
  int StartTimer(int index);
  int StopTimer(int index);
  void ResetTimer(int index);

  // Retrieve the time accumulated by the timer at the given index
  double ReadTimer(int index);

  // Write into a file
  // NOTE(review): only this declaration is visible; confirm that a
  // definition exists before calling it.
  double WriteTimer(int index);

  // Print a diagnostic message
  void Error(std::string str);
};
#endif // ROC_BANDWIDTH_TEST_MYTIME_H_
...@@ -42,7 +42,6 @@ ...@@ -42,7 +42,6 @@
#include <unistd.h> #include <unistd.h>
#include <iostream> #include <iostream>
#include "hsatimer.hpp"
#include "rocm_bandwidth_test.hpp" #include "rocm_bandwidth_test.hpp"
using namespace std; using namespace std;
......
File mode changed from 100644 to 100755
...@@ -49,8 +49,14 @@ ...@@ -49,8 +49,14 @@
#include <unistd.h> #include <unistd.h>
#include <cctype> #include <cctype>
#include <cmath> #include <cmath>
#include <cstring>
#include <sstream> #include <sstream>
#include <limits> #include <limits>
#include <chrono>
#include <thread>
// Initialize the variable used to capture validation failure
const double RocmBandwidthTest::VALIDATE_COPY_OP_FAILURE = std::numeric_limits<double>::max();
// The values are in megabytes at allocation time // The values are in megabytes at allocation time
const size_t RocmBandwidthTest::SIZE_LIST[] = { 1 * 1024, const size_t RocmBandwidthTest::SIZE_LIST[] = { 1 * 1024,
...@@ -74,7 +80,7 @@ const size_t RocmBandwidthTest::LATENCY_SIZE_LIST[] = { 1, ...@@ -74,7 +80,7 @@ const size_t RocmBandwidthTest::LATENCY_SIZE_LIST[] = { 1,
256 * 1024, 512 * 1024 }; 256 * 1024, 512 * 1024 };
uint32_t RocmBandwidthTest::GetIterationNum() { uint32_t RocmBandwidthTest::GetIterationNum() {
return (validate_) ? 1 : (num_iteration_ * 1.2 + 1); return (validate_) ? 1 : (num_iteration_ + 1);
} }
void RocmBandwidthTest::AcquireAccess(hsa_agent_t agent, void* ptr) { void RocmBandwidthTest::AcquireAccess(hsa_agent_t agent, void* ptr) {
...@@ -92,7 +98,9 @@ void RocmBandwidthTest::AcquirePoolAcceses(uint32_t src_dev_idx, ...@@ -92,7 +98,9 @@ void RocmBandwidthTest::AcquirePoolAcceses(uint32_t src_dev_idx,
hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_; hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_;
if (src_dev_type == HSA_DEVICE_TYPE_GPU) { if (src_dev_type == HSA_DEVICE_TYPE_GPU) {
AcquireAccess(src_agent, dst); AcquireAccess(src_agent, dst);
} else if (dst_dev_type == HSA_DEVICE_TYPE_GPU) { }
if (dst_dev_type == HSA_DEVICE_TYPE_GPU) {
AcquireAccess(dst_agent, src); AcquireAccess(dst_agent, src);
} }
...@@ -115,20 +123,21 @@ void RocmBandwidthTest::InitializeSrcBuffer(size_t size, void* buf_cpy, ...@@ -115,20 +123,21 @@ void RocmBandwidthTest::InitializeSrcBuffer(size_t size, void* buf_cpy,
ErrorCheck(err_); ErrorCheck(err_);
} }
// If Copy device is a Gpu setup buffer access // If copying agent is a CPU, use memcpy to initialize copy buffer
hsa_device_type_t cpy_dev_type = agent_list_[cpy_dev_idx].device_type_; hsa_device_type_t cpy_dev_type = agent_list_[cpy_dev_idx].device_type_;
if (cpy_dev_type == HSA_DEVICE_TYPE_GPU) { if (cpy_dev_type == HSA_DEVICE_TYPE_CPU) {
std::memcpy(buf_cpy, init_src_, size);
return;
}
// Copying device is a Gpu, setup buffer access
// before copying initialization buffer
AcquireAccess(cpy_agent, init_src_); AcquireAccess(cpy_agent, init_src_);
hsa_signal_store_relaxed(init_signal_, 1); hsa_signal_store_relaxed(init_signal_, 1);
copy_buffer(buf_cpy, cpy_agent, copy_buffer(buf_cpy, cpy_agent,
init_src_, cpu_agent_, init_src_, cpu_agent_,
size, init_signal_); size, init_signal_);
return; return;
}
// Copy initialization buffer into copy buffer
memcpy(buf_cpy, init_src_, size);
return;
} }
bool RocmBandwidthTest::ValidateDstBuffer(size_t max_size, size_t curr_size, void* buf_cpy, bool RocmBandwidthTest::ValidateDstBuffer(size_t max_size, size_t curr_size, void* buf_cpy,
...@@ -141,7 +150,7 @@ bool RocmBandwidthTest::ValidateDstBuffer(size_t max_size, size_t curr_size, voi ...@@ -141,7 +150,7 @@ bool RocmBandwidthTest::ValidateDstBuffer(size_t max_size, size_t curr_size, voi
} }
// If Copy device is a Gpu setup buffer access // If Copy device is a Gpu setup buffer access
memset(validate_dst_, ~(0x23), curr_size); std::memset(validate_dst_, ~(0x23), curr_size);
hsa_device_type_t cpy_dev_type = agent_list_[cpy_dev_idx].device_type_; hsa_device_type_t cpy_dev_type = agent_list_[cpy_dev_idx].device_type_;
if (cpy_dev_type == HSA_DEVICE_TYPE_GPU) { if (cpy_dev_type == HSA_DEVICE_TYPE_GPU) {
AcquireAccess(cpy_agent, validate_dst_); AcquireAccess(cpy_agent, validate_dst_);
...@@ -153,11 +162,11 @@ bool RocmBandwidthTest::ValidateDstBuffer(size_t max_size, size_t curr_size, voi ...@@ -153,11 +162,11 @@ bool RocmBandwidthTest::ValidateDstBuffer(size_t max_size, size_t curr_size, voi
// Copying device is a CPU, copy dst buffer // Copying device is a CPU, copy dst buffer
// into validation buffer // into validation buffer
memcpy(validate_dst_, buf_cpy, curr_size); std::memcpy(validate_dst_, buf_cpy, curr_size);
} }
// Copy initialization buffer into copy buffer // Compare initialization buffer with validation buffer
err_ = (hsa_status_t)memcmp(init_src_, validate_dst_, curr_size); err_ = (hsa_status_t)std::memcmp(init_src_, validate_dst_, curr_size);
if (err_ != HSA_STATUS_SUCCESS) { if (err_ != HSA_STATUS_SUCCESS) {
exit_value_ = err_; exit_value_ = err_;
} }
...@@ -581,12 +590,17 @@ void RocmBandwidthTest::RunCopyBenchmark(async_trans_t& trans) { ...@@ -581,12 +590,17 @@ void RocmBandwidthTest::RunCopyBenchmark(async_trans_t& trans) {
hsa_signal_store_relaxed(signal_start_bidir, 1); hsa_signal_store_relaxed(signal_start_bidir, 1);
} }
// Create a timer object and reset signals // Temporary code for testing
PerfTimer timer; if (sleep_time_ > 0) {
uint32_t index = timer.CreateTimer(); std::this_thread::sleep_for(sleep_usecs_);
}
// Create a timer object and start it
if (print_cpu_time_) {
cpu_start_ = std::chrono::steady_clock::now();
}
// Start the timer and launch forward copy operation // Launch the copy operation
timer.StartTimer(index);
if (bidir == false) { if (bidir == false) {
err_ = hsa_amd_memory_async_copy(buf_dst_fwd, dst_agent_fwd, err_ = hsa_amd_memory_async_copy(buf_dst_fwd, dst_agent_fwd,
buf_src_fwd, src_agent_fwd, buf_src_fwd, src_agent_fwd,
...@@ -615,11 +629,13 @@ void RocmBandwidthTest::RunCopyBenchmark(async_trans_t& trans) { ...@@ -615,11 +629,13 @@ void RocmBandwidthTest::RunCopyBenchmark(async_trans_t& trans) {
WaitForCopyCompletion(signal_list); WaitForCopyCompletion(signal_list);
// Stop the timer object // Stop the timer object and extract time taken
timer.StopTimer(index); if (print_cpu_time_) {
cpu_end_ = std::chrono::steady_clock::now();
// Push the time taken for copy into a vector of copy times cpu_cp_time_ = cpu_end_ - cpu_start_;
cpu_time.push_back(timer.ReadTimer(index)); uint64_t cpu_temp = cpu_cp_time_.count();
cpu_time.push_back(cpu_temp);
}
// Collect time from the signal(s) // Collect time from the signal(s)
if (print_cpu_time_ == false) { if (print_cpu_time_ == false) {
...@@ -635,16 +651,25 @@ void RocmBandwidthTest::RunCopyBenchmark(async_trans_t& trans) { ...@@ -635,16 +651,25 @@ void RocmBandwidthTest::RunCopyBenchmark(async_trans_t& trans) {
} }
} }
// Get Cpu min copy time // Collecting Cpu time. Capture verify failures if any
trans.cpu_min_time_.push_back(GetMinTime(cpu_time)); // Get min and mean copy times and collect them into Cpu
// Get Cpu mean copy time and store to the array // time list
trans.cpu_avg_time_.push_back(GetMeanTime(cpu_time)); double min_time = 0;
double mean_time = 0;
if (print_cpu_time_) {
min_time = (verify) ? GetMinTime(cpu_time) : VALIDATE_COPY_OP_FAILURE;
mean_time = (verify) ? GetMeanTime(cpu_time) : VALIDATE_COPY_OP_FAILURE;
trans.cpu_min_time_.push_back(min_time);
trans.cpu_avg_time_.push_back(mean_time);
}
// Collecting Gpu time. Capture verify failures if any
// Get min and mean copy times and collect them into Gpu
// time list
if (print_cpu_time_ == false) { if (print_cpu_time_ == false) {
if (trans.copy.uses_gpu_) { if (trans.copy.uses_gpu_) {
// Get Gpu min and mean copy times min_time = (verify) ? GetMinTime(gpu_time) : VALIDATE_COPY_OP_FAILURE;
double min_time = (verify) ? GetMinTime(gpu_time) : std::numeric_limits<double>::max(); mean_time = (verify) ? GetMeanTime(gpu_time) : VALIDATE_COPY_OP_FAILURE;
double mean_time = (verify) ? GetMeanTime(gpu_time) : std::numeric_limits<double>::max();
trans.gpu_min_time_.push_back(min_time); trans.gpu_min_time_.push_back(min_time);
trans.gpu_avg_time_.push_back(mean_time); trans.gpu_avg_time_.push_back(mean_time);
} }
...@@ -652,7 +677,9 @@ void RocmBandwidthTest::RunCopyBenchmark(async_trans_t& trans) { ...@@ -652,7 +677,9 @@ void RocmBandwidthTest::RunCopyBenchmark(async_trans_t& trans) {
verify = true; verify = true;
// Clear the stack of cpu times // Clear the stack of cpu times
if (print_cpu_time_) {
cpu_time.clear(); cpu_time.clear();
}
gpu_time.clear(); gpu_time.clear();
} }
...@@ -788,19 +815,37 @@ RocmBandwidthTest::RocmBandwidthTest(int argc, char** argv) : BaseTest() { ...@@ -788,19 +815,37 @@ RocmBandwidthTest::RocmBandwidthTest(int argc, char** argv) : BaseTest() {
// Initialize version of the test // Initialize version of the test
version_.major_id = 2; version_.major_id = 2;
version_.minor_id = 3; version_.minor_id = 5;
version_.step_id = 2; version_.step_id = 1;
version_.reserved = 0; version_.reserved = 0;
// Test impact of sleep, temp code
sleep_time_ = 0;
bw_sleep_time_ = getenv("ROCM_BW_SLEEP_TIME");
if (bw_sleep_time_ != NULL) {
sleep_time_ = atoi(bw_sleep_time_);
if ((sleep_time_ < 0) || (sleep_time_ > 400000)) {
std::cout << "Unit of sleep time is defined as 10 microseconds" << std::endl;
std::cout << "An input value of 10 implies sleep time of 100 microseconds" << std::endl;
std::cout << "Value of ROCM_BW_SLEEP_TIME must be between [1, 400000]" << sleep_time_ << std::endl;
exit(1);
}
sleep_time_ *= 10;
std::chrono::microseconds temp(sleep_time_);
sleep_usecs_ = temp;
}
bw_iter_cnt_ = getenv("ROCM_BW_ITER_CNT"); bw_iter_cnt_ = getenv("ROCM_BW_ITER_CNT");
bw_default_run_ = getenv("ROCM_BW_DEFAULT_RUN"); bw_default_run_ = getenv("ROCM_BW_DEFAULT_RUN");
bw_blocking_run_ = getenv("ROCR_BW_RUN_BLOCKING"); bw_blocking_run_ = getenv("ROCR_BW_RUN_BLOCKING");
skip_fine_grain_ = getenv("ROCM_SKIP_FINE_GRAINED_POOL"); skip_cpu_fine_grain_ = getenv("ROCM_SKIP_CPU_FINE_GRAINED_POOL");
skip_gpu_coarse_grain_ = getenv("ROCM_SKIP_GPU_COARSE_GRAINED_POOL");
if (bw_iter_cnt_ != NULL) { if (bw_iter_cnt_ != NULL) {
int32_t num = atoi(bw_iter_cnt_); int32_t num = atoi(bw_iter_cnt_);
if (num < 0) { if (num < 0) {
std::cout << "Value of ROCM_BW_ITER_CNT can't be negative: " << num << std::endl; std::cout << "Value of ROCM_BW_ITER_CNT can't be negative: " << num << std::endl;
exit(1);
} }
set_num_iteration(num); set_num_iteration(num);
} }
......
...@@ -43,12 +43,12 @@ ...@@ -43,12 +43,12 @@
#ifndef __ROC_BANDWIDTH_TEST_H__ #ifndef __ROC_BANDWIDTH_TEST_H__
#define __ROC_BANDWIDTH_TEST_H__ #define __ROC_BANDWIDTH_TEST_H__
#include "hsa/hsa.h" #include "hsa.h"
#include "base_test.hpp" #include "base_test.hpp"
#include "hsatimer.hpp"
#include "common.hpp" #include "common.hpp"
#include <vector> #include <vector>
#include <chrono>
using namespace std; using namespace std;
...@@ -168,13 +168,14 @@ typedef enum Request_Type { ...@@ -168,13 +168,14 @@ typedef enum Request_Type {
REQ_WRITE = 2, REQ_WRITE = 2,
REQ_VERSION = 3, REQ_VERSION = 3,
REQ_TOPOLOGY = 4, REQ_TOPOLOGY = 4,
REQ_COPY_BIDIR = 5, REQ_LIST_DEVS = 5,
REQ_COPY_UNIDIR = 6, REQ_COPY_BIDIR = 6,
REQ_COPY_ALL_BIDIR = 7, REQ_COPY_UNIDIR = 7,
REQ_COPY_ALL_UNIDIR = 8, REQ_COPY_ALL_BIDIR = 8,
REQ_CONCURRENT_COPY_BIDIR = 9, REQ_COPY_ALL_UNIDIR = 9,
REQ_CONCURRENT_COPY_UNIDIR = 10, REQ_CONCURRENT_COPY_BIDIR = 10,
REQ_INVALID = 11, REQ_CONCURRENT_COPY_UNIDIR = 11,
REQ_INVALID = 12,
} Request_Type; } Request_Type;
...@@ -446,6 +447,7 @@ class RocmBandwidthTest : public BaseTest { ...@@ -446,6 +447,7 @@ class RocmBandwidthTest : public BaseTest {
uint32_t req_write_; uint32_t req_write_;
uint32_t req_version_; uint32_t req_version_;
uint32_t req_topology_; uint32_t req_topology_;
uint32_t req_list_devs_;
uint32_t req_copy_bidir_; uint32_t req_copy_bidir_;
uint32_t req_copy_unidir_; uint32_t req_copy_unidir_;
uint32_t req_copy_all_bidir_; uint32_t req_copy_all_bidir_;
...@@ -473,6 +475,9 @@ class RocmBandwidthTest : public BaseTest { ...@@ -473,6 +475,9 @@ class RocmBandwidthTest : public BaseTest {
static const uint32_t LINK_PROP_WEIGHT = 0x02; static const uint32_t LINK_PROP_WEIGHT = 0x02;
static const uint32_t LINK_PROP_ACCESS = 0x03; static const uint32_t LINK_PROP_ACCESS = 0x03;
// Encodes validation failure
static const double VALIDATE_COPY_OP_FAILURE;
// List used to store transactions per user request // List used to store transactions per user request
vector<async_trans_t> trans_list_; vector<async_trans_t> trans_list_;
...@@ -488,7 +493,8 @@ class RocmBandwidthTest : public BaseTest { ...@@ -488,7 +493,8 @@ class RocmBandwidthTest : public BaseTest {
// Env key to determine if Fine-grained or // Env key to determine if Fine-grained or
// Coarse-grained pool should be filtered out // Coarse-grained pool should be filtered out
char* skip_fine_grain_; char* skip_cpu_fine_grain_;
char* skip_gpu_coarse_grain_;
// Env key to determine if the run should block // Env key to determine if the run should block
// or actively wait on completion signal // or actively wait on completion signal
...@@ -499,10 +505,12 @@ class RocmBandwidthTest : public BaseTest { ...@@ -499,10 +505,12 @@ class RocmBandwidthTest : public BaseTest {
// Env key to specify iteration count // Env key to specify iteration count
char* bw_iter_cnt_; char* bw_iter_cnt_;
char* bw_sleep_time_;
// Variable to store argument number uint32_t sleep_time_;
std::chrono::nanoseconds cpu_cp_time_;
// Variable to store argument number std::chrono::microseconds sleep_usecs_;
std::chrono::time_point<std::chrono::steady_clock> cpu_end_;
std::chrono::time_point<std::chrono::steady_clock> cpu_start_;
// Variable to store argument number // Variable to store argument number
uint32_t usr_argc_; uint32_t usr_argc_;
......
File mode changed from 100644 to 100755
...@@ -97,7 +97,7 @@ static bool ParseOptionValue(char* value, vector<size_t>&value_list) { ...@@ -97,7 +97,7 @@ static bool ParseOptionValue(char* value, vector<size_t>&value_list) {
// Read the option value // Read the option value
stream >> token; stream >> token;
if (stream.fail()) { if (stream.fail()) {
exit(-1); return false;
} }
// Update output list with values // Update output list with values
...@@ -147,9 +147,7 @@ void RocmBandwidthTest::ValidateCopyUnidirFlags(uint32_t copy_mask, ...@@ -147,9 +147,7 @@ void RocmBandwidthTest::ValidateCopyUnidirFlags(uint32_t copy_mask,
// It is illegal to specify Latency and another // It is illegal to specify Latency and another
// secondary flag that affects a copy operation // secondary flag that affects a copy operation
if ((copy_ctrl_mask & DEV_COPY_LATENCY) && if ((copy_ctrl_mask & DEV_COPY_LATENCY) &&
((copy_ctrl_mask & USR_BUFFER_INIT) || (copy_ctrl_mask & VALIDATE_COPY_OP)) {
(copy_ctrl_mask & CPU_VISIBLE_TIME) ||
(copy_ctrl_mask & VALIDATE_COPY_OP))) {
PrintHelpScreen(); PrintHelpScreen();
exit(0); exit(0);
} }
...@@ -224,6 +222,12 @@ void RocmBandwidthTest::ValidateInputFlags(uint32_t pf_cnt, ...@@ -224,6 +222,12 @@ void RocmBandwidthTest::ValidateInputFlags(uint32_t pf_cnt,
return; return;
} }
// Input is requesting to print list of devices
// rocm_bandwidth_test -e
if (req_list_devs_ == REQ_LIST_DEVS) {
return;
}
// Input is for bidirectional bandwidth for some devices // Input is for bidirectional bandwidth for some devices
// rocm_bandwidth_test -b // rocm_bandwidth_test -b
if (req_copy_bidir_ == REQ_COPY_BIDIR) { if (req_copy_bidir_ == REQ_COPY_BIDIR) {
...@@ -338,7 +342,7 @@ void RocmBandwidthTest::ParseArguments() { ...@@ -338,7 +342,7 @@ void RocmBandwidthTest::ParseArguments() {
int opt; int opt;
bool status; bool status;
while ((opt = getopt(usr_argc_, usr_argv_, "hqtclvaAb:i:s:d:r:w:m:k:K:")) != -1) { while ((opt = getopt(usr_argc_, usr_argv_, "hqteclvaAb:i:s:d:r:w:m:k:K:")) != -1) {
switch (opt) { switch (opt) {
// Print help screen // Print help screen
...@@ -352,6 +356,12 @@ void RocmBandwidthTest::ParseArguments() { ...@@ -352,6 +356,12 @@ void RocmBandwidthTest::ParseArguments() {
req_version_ = REQ_VERSION; req_version_ = REQ_VERSION;
break; break;
// Print list of devices
case 'e':
num_primary_flags++;
req_list_devs_ = REQ_LIST_DEVS;
break;
// Print system topology // Print system topology
case 't': case 't':
num_primary_flags++; num_primary_flags++;
...@@ -426,6 +436,7 @@ void RocmBandwidthTest::ParseArguments() { ...@@ -426,6 +436,7 @@ void RocmBandwidthTest::ParseArguments() {
status = ParseOptionValue(optarg, size_list_); status = ParseOptionValue(optarg, size_list_);
if (status == false) { if (status == false) {
print_help = true; print_help = true;
break;
} }
copy_ctrl_mask |= USR_BUFFER_SIZE; copy_ctrl_mask |= USR_BUFFER_SIZE;
break; break;
...@@ -510,7 +521,14 @@ void RocmBandwidthTest::ParseArguments() { ...@@ -510,7 +521,14 @@ void RocmBandwidthTest::ParseArguments() {
// Discover the topology of RocR agent in system // Discover the topology of RocR agent in system
DiscoverTopology(); DiscoverTopology();
// Print system topology if user option has "-t" // Print list of devices if user option is "-e"
if (req_list_devs_ == REQ_LIST_DEVS) {
PrintVersion();
PrintTopology();
exit(0);
}
// Print system topology if user option is "-t"
if (req_topology_ == REQ_TOPOLOGY) { if (req_topology_ == REQ_TOPOLOGY) {
PrintVersion(); PrintVersion();
PrintTopology(); PrintTopology();
......
...@@ -59,6 +59,7 @@ void RocmBandwidthTest::PrintHelpScreen() { ...@@ -59,6 +59,7 @@ void RocmBandwidthTest::PrintHelpScreen() {
std::cout << "\t -v Run the test in validation mode" << std::endl; std::cout << "\t -v Run the test in validation mode" << std::endl;
std::cout << "\t -l Run test to collect Latency data" << std::endl; std::cout << "\t -l Run test to collect Latency data" << std::endl;
std::cout << "\t -c Time the operation using CPU Timers" << std::endl; std::cout << "\t -c Time the operation using CPU Timers" << std::endl;
std::cout << "\t -e Prints the list of ROCm devices enabled on platform" << std::endl;
std::cout << "\t -i Initialize copy buffer with specified 'long double' pattern" << std::endl; std::cout << "\t -i Initialize copy buffer with specified 'long double' pattern" << std::endl;
std::cout << "\t -t Prints system topology and allocatable memory info" << std::endl; std::cout << "\t -t Prints system topology and allocatable memory info" << std::endl;
std::cout << "\t -m List of buffer sizes to use, specified in Megabytes" << std::endl; std::cout << "\t -m List of buffer sizes to use, specified in Megabytes" << std::endl;
...@@ -70,13 +71,10 @@ void RocmBandwidthTest::PrintHelpScreen() { ...@@ -70,13 +71,10 @@ void RocmBandwidthTest::PrintHelpScreen() {
std::cout << std::endl; std::cout << std::endl;
std::cout << "\t NOTE: Mixing following options is illegal/unsupported" << std::endl; std::cout << "\t NOTE: Mixing following options is illegal/unsupported" << std::endl;
std::cout << "\t\t Case 1: rocm_bandwidth_test -a or -A with -c" << std::endl; std::cout << "\t\t Case 1: rocm_bandwidth_test -a with {lm}{1,}" << std::endl;
std::cout << "\t\t Case 2: rocm_bandwidth_test -b or -A with -m" << std::endl; std::cout << "\t\t Case 2: rocm_bandwidth_test -b with {clv}{1,}" << std::endl;
std::cout << "\t\t Case 3: rocm_bandwidth_test -b or -A with -l" << std::endl; std::cout << "\t\t Case 3: rocm_bandwidth_test -A with {clmv}{1,}" << std::endl;
std::cout << "\t\t Case 4: rocm_bandwidth_test -b or -A with -v" << std::endl; std::cout << "\t\t Case 4: rocm_bandwidth_test -s x -d y with {lmv}{2,}" << std::endl;
std::cout << "\t\t Case 5: rocm_bandwidth_test -a or -s x -d y with -l and -c" << std::endl;
std::cout << "\t\t Case 6: rocm_bandwidth_test -a or -s x -d y with -l and -m" << std::endl;
std::cout << "\t\t Case 7: rocm_bandwidth_test -a or -s x -d y with -l and -v" << std::endl;
std::cout << std::endl; std::cout << std::endl;
std::cout << std::endl; std::cout << std::endl;
...@@ -152,11 +150,19 @@ void RocmBandwidthTest::PrintTopology() { ...@@ -152,11 +150,19 @@ void RocmBandwidthTest::PrintTopology() {
if (HSA_DEVICE_TYPE_CPU == node.agent.device_type_) { if (HSA_DEVICE_TYPE_CPU == node.agent.device_type_) {
std::cout << " Device Type: CPU" << std::endl; std::cout << " Device Type: CPU" << std::endl;
std::cout.width(format);
std::cout << "";
std::cout.width(format);
std::cout << " Device Name: " << node.agent.name_ << std::endl;
} else if (HSA_DEVICE_TYPE_GPU == node.agent.device_type_) { } else if (HSA_DEVICE_TYPE_GPU == node.agent.device_type_) {
std::cout << " Device Type: GPU" << std::endl; std::cout << " Device Type: GPU" << std::endl;
std::cout.width(format); std::cout.width(format);
std::cout << ""; std::cout << "";
std::cout.width(format); std::cout.width(format);
std::cout << " Device Name: " << node.agent.name_ << std::endl;
std::cout.width(format);
std::cout << "";
std::cout.width(format);
std::cout << " Device BDF: " << node.agent.bdf_id_ << std::endl; std::cout << " Device BDF: " << node.agent.bdf_id_ << std::endl;
} }
...@@ -204,6 +210,7 @@ std::string GetValueAsString(uint32_t key, uint32_t value) { ...@@ -204,6 +210,7 @@ std::string GetValueAsString(uint32_t key, uint32_t value) {
} }
std::cout << "An illegal key to get value for" << std::endl; std::cout << "An illegal key to get value for" << std::endl;
assert(false); assert(false);
return "";
} }
void RocmBandwidthTest::PrintLinkPropsMatrix(uint32_t key) const { void RocmBandwidthTest::PrintLinkPropsMatrix(uint32_t key) const {
......
...@@ -61,7 +61,7 @@ static void printRecord(size_t size, double avg_time, ...@@ -61,7 +61,7 @@ static void printRecord(size_t size, double avg_time,
} }
uint32_t format = 15; uint32_t format = 15;
std::cout.precision(6); std::cout.precision(3);
std::cout << std::fixed; std::cout << std::fixed;
std::cout.width(format); std::cout.width(format);
std::cout << size_str.str(); std::cout << size_str.str();
...@@ -134,10 +134,9 @@ double RocmBandwidthTest::GetMeanTime(std::vector<double>& vec) { ...@@ -134,10 +134,9 @@ double RocmBandwidthTest::GetMeanTime(std::vector<double>& vec) {
return vec.at(0); return vec.at(0);
} }
// Number of elements is ONE plus number of iterations
std::sort(vec.begin(), vec.end()); std::sort(vec.begin(), vec.end());
vec.erase(vec.begin()); vec.erase(vec.end() - 1);
vec.erase(vec.begin(), vec.begin() + num_iteration_ * 0.1);
vec.erase(vec.begin() + num_iteration_, vec.end());
double mean = 0.0; double mean = 0.0;
int num = vec.size(); int num = vec.size();
...@@ -287,7 +286,7 @@ void RocmBandwidthTest::PrintPerfMatrix(bool validate, bool peak, double* perf_m ...@@ -287,7 +286,7 @@ void RocmBandwidthTest::PrintPerfMatrix(bool validate, bool peak, double* perf_m
std::cout << std::endl; std::cout << std::endl;
std::cout << std::endl; std::cout << std::endl;
std::cout.precision(6); std::cout.precision(3);
std::cout << std::fixed; std::cout << std::fixed;
std::cout.width(format); std::cout.width(format);
...@@ -318,7 +317,7 @@ void RocmBandwidthTest::PrintPerfMatrix(bool validate, bool peak, double* perf_m ...@@ -318,7 +317,7 @@ void RocmBandwidthTest::PrintPerfMatrix(bool validate, bool peak, double* perf_m
if (validate) { if (validate) {
if (value == 0) { if (value == 0) {
std::cout << "N/A"; std::cout << "N/A";
} else if (value < 1) { } else if (value == VALIDATE_COPY_OP_FAILURE) {
std::cout << "FAIL"; std::cout << "FAIL";
} else { } else {
std::cout << "PASS"; std::cout << "PASS";
......
...@@ -46,6 +46,7 @@ ...@@ -46,6 +46,7 @@
#include <iomanip> #include <iomanip>
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <cstring>
// @brief: Helper method to iterate through the memory pools of // @brief: Helper method to iterate through the memory pools of
// an agent and discover its properties // an agent and discover its properties
...@@ -106,10 +107,11 @@ hsa_status_t MemPoolInfo(hsa_amd_memory_pool_t pool, void* data) { ...@@ -106,10 +107,11 @@ hsa_status_t MemPoolInfo(hsa_amd_memory_pool_t pool, void* data) {
} }
// Consult user request and add either fine-grained or // Consult user request and add either fine-grained or
// coarse-grained memory pools if agent is CPU // coarse-grained memory pools if agent is CPU. Default
// is to skip coarse-grained memory pools
agent_info_t& agent_info = asyncDrvr->agent_list_.back(); agent_info_t& agent_info = asyncDrvr->agent_list_.back();
if (agent_info.device_type_ == HSA_DEVICE_TYPE_CPU) { if (agent_info.device_type_ == HSA_DEVICE_TYPE_CPU) {
if (asyncDrvr->skip_fine_grain_ != NULL) { if (asyncDrvr->skip_cpu_fine_grain_ != NULL) {
if (is_fine_grained == true) { if (is_fine_grained == true) {
return HSA_STATUS_SUCCESS; return HSA_STATUS_SUCCESS;
} }
...@@ -119,9 +121,21 @@ hsa_status_t MemPoolInfo(hsa_amd_memory_pool_t pool, void* data) { ...@@ -119,9 +121,21 @@ hsa_status_t MemPoolInfo(hsa_amd_memory_pool_t pool, void* data) {
} }
} }
} }
// hsa_device_type_t device_type;
// status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type); // Consult user request and add either fine-grained or
// ErrorCheck(status); // coarse-grained memory pools if agent is GPU. Default
// is to skip fine-grained memory pools
if (agent_info.device_type_ == HSA_DEVICE_TYPE_GPU) {
if (asyncDrvr->skip_gpu_coarse_grain_ != NULL) {
if (is_fine_grained == false) {
return HSA_STATUS_SUCCESS;
}
} else {
if (is_fine_grained == true) {
return HSA_STATUS_SUCCESS;
}
}
}
// Create an instance of agent_pool_info and add it to the list // Create an instance of agent_pool_info and add it to the list
pool_info_t pool_info(agent, asyncDrvr->agent_index_, pool, pool_info_t pool_info(agent, asyncDrvr->agent_index_, pool,
...@@ -145,7 +159,7 @@ void PopulateBDF(uint32_t bdf_id, agent_info_t *agent_info) { ...@@ -145,7 +159,7 @@ void PopulateBDF(uint32_t bdf_id, agent_info_t *agent_info) {
std::stringstream stream; std::stringstream stream;
stream << std::setfill('0') << std::setw(sizeof(uint8_t) * 2); stream << std::setfill('0') << std::setw(sizeof(uint8_t) * 2);
stream << std::hex << +bus_id << ":" << +dev_id << "." << +func_id; stream << std::hex << +bus_id << ":" << +dev_id << "." << +func_id;
strcpy(agent_info->bdf_id_, (stream.str()).c_str()); std::strcpy(agent_info->bdf_id_, (stream.str()).c_str());
} }
// @brief: Helper method to iterate through the agents of // @brief: Helper method to iterate through the agents of
...@@ -321,7 +335,7 @@ void RocmBandwidthTest::BindLinkProps(uint32_t idx1, uint32_t idx2) { ...@@ -321,7 +335,7 @@ void RocmBandwidthTest::BindLinkProps(uint32_t idx1, uint32_t idx2) {
hsa_amd_memory_pool_link_info_t *link_info; hsa_amd_memory_pool_link_info_t *link_info;
uint32_t link_info_sz = hops * sizeof(hsa_amd_memory_pool_link_info_t); uint32_t link_info_sz = hops * sizeof(hsa_amd_memory_pool_link_info_t);
link_info = (hsa_amd_memory_pool_link_info_t *)malloc(link_info_sz); link_info = (hsa_amd_memory_pool_link_info_t *)malloc(link_info_sz);
memset(link_info, 0, (hops * sizeof(hsa_amd_memory_pool_link_info_t))); std::memset(link_info, 0, (hops * sizeof(hsa_amd_memory_pool_link_info_t)));
err_ = hsa_amd_agent_memory_pool_get_info(agent1, pool, err_ = hsa_amd_agent_memory_pool_get_info(agent1, pool,
HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO, link_info); HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO, link_info);
......
...@@ -130,7 +130,7 @@ bool RocmBandwidthTest::FilterCpuPool(uint32_t req_type, ...@@ -130,7 +130,7 @@ bool RocmBandwidthTest::FilterCpuPool(uint32_t req_type,
// If env to skip fine grain is NULL it means // If env to skip fine grain is NULL it means
// we should filter out coarse-grain pools // we should filter out coarse-grain pools
if (skip_fine_grain_ == NULL) { if (skip_cpu_fine_grain_ == NULL) {
return (fine_grained == false); return (fine_grained == false);
} }
...@@ -418,25 +418,49 @@ void RocmBandwidthTest::ComputeCopyTime(async_trans_t& trans) { ...@@ -418,25 +418,49 @@ void RocmBandwidthTest::ComputeCopyTime(async_trans_t& trans) {
data_size += data_size; data_size += data_size;
} }
// Copy operation does not involve a Gpu device // Get time taken by copy operation. Adjust time from nanoseconds
// Divide bandwidth with 10^9 to get size in GigaBytes (10^9) // to units of seconds
if (trans.copy.uses_gpu_ != true) { if ((print_cpu_time_) ||
(trans.copy.uses_gpu_ != true)) {
avg_time = trans.cpu_avg_time_[idx]; avg_time = trans.cpu_avg_time_[idx];
min_time = trans.cpu_min_time_[idx]; min_time = trans.cpu_min_time_[idx];
avg_bandwidth = (double)data_size / avg_time / 1000 / 1000 / 1000; avg_time = avg_time / 1000 / 1000 / 1000;
peak_bandwidth = (double)data_size / min_time / 1000 / 1000 / 1000; min_time = min_time / 1000 / 1000 / 1000;
} else { } else {
if (print_cpu_time_ == false) { avg_time = trans.gpu_avg_time_[idx];
avg_time = trans.gpu_avg_time_[idx] / sys_freq; min_time = trans.gpu_min_time_[idx];
min_time = trans.gpu_min_time_[idx] / sys_freq; }
// Determine if there was a validation failure
// @note: Value is set to VALIDATE_COPY_OP_FAILURE
// if copy transaction was validated and it failed
hsa_status_t verify_status = HSA_STATUS_ERROR;
if ((avg_time != VALIDATE_COPY_OP_FAILURE) &&
(min_time != VALIDATE_COPY_OP_FAILURE)) {
verify_status = HSA_STATUS_SUCCESS;
}
// Adjust Gpu time if there is no validation error
if ((trans.copy.uses_gpu_) &&
(print_cpu_time_ == false) &&
(verify_status == HSA_STATUS_SUCCESS)) {
avg_time = avg_time / sys_freq;
min_time = min_time / sys_freq;
}
// Compute bandwidth - divide by 10^9 (not 1024^3)
// to express the size in GigaBytes
// @note: For validation failures bandwidth
// is encoded by VALIDATE_COPY_OP_FAILURE
if (verify_status != HSA_STATUS_SUCCESS) {
avg_bandwidth = VALIDATE_COPY_OP_FAILURE;
peak_bandwidth = VALIDATE_COPY_OP_FAILURE;
} else { } else {
avg_time = trans.cpu_avg_time_[idx];
min_time = trans.cpu_min_time_[idx];
}
avg_bandwidth = (double)data_size / avg_time / 1000 / 1000 / 1000; avg_bandwidth = (double)data_size / avg_time / 1000 / 1000 / 1000;
peak_bandwidth = (double)data_size / min_time / 1000 / 1000 / 1000; peak_bandwidth = (double)data_size / min_time / 1000 / 1000 / 1000;
} }
// Update computed bandwidth for the transaction
trans.min_time_.push_back(min_time); trans.min_time_.push_back(min_time);
trans.avg_time_.push_back(avg_time); trans.avg_time_.push_back(avg_time);
trans.avg_bandwidth_.push_back(avg_bandwidth); trans.avg_bandwidth_.push_back(avg_bandwidth);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment