Unverified Commit 9f68d14d authored by gilbertlee-amd, committed by GitHub

Fixing compile error on CentOS and multi-stream GFX mode (#140)

parent 02ce785c
@@ -3,6 +3,11 @@
 Documentation for TransferBench is available at
 [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
 
+## v1.55
+### Fixed
+- Fixed missing header error when compiling on CentOS
+- Fixed issues when using multi-stream mode for GFX executor
+
 ## v1.54
 ### Modified
 - Refactored TransferBench into a header-only library combined with a thin client to facilitate the
...
@@ -7,7 +7,7 @@ else()
 endif()
 
 cmake_minimum_required(VERSION 3.5)
-project(TransferBench VERSION 1.54.0 LANGUAGES CXX)
+project(TransferBench VERSION 1.55.0 LANGUAGES CXX)
 
 # Default GPU architectures to build
 #==================================================================================================
...
@@ -18,7 +18,7 @@ endif
 
 CXXFLAGS = -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
 NVFLAGS = -x cu -lnuma -arch=native
-COMMON_FLAGS = -O3 --std=c++20 -I./src/header -I./src/client -I./src/client/Presets
+COMMON_FLAGS = -g -O3 --std=c++20 -I./src/header -I./src/client -I./src/client/Presets
 LDFLAGS += -lpthread
 
 all: $(EXE)
...
@@ -23,7 +23,7 @@ THE SOFTWARE.
 #pragma once
 
 // TransferBench client version
-#define CLIENT_VERSION "1.54.00"
+#define CLIENT_VERSION "1.55.00"
 
 #include "TransferBench.hpp"
 #include "EnvVars.hpp"
...
@@ -30,6 +30,7 @@ THE SOFTWARE.
 #include <sstream>
 #include <stdarg.h>
 #include <thread>
+#include <unistd.h>
 #include <vector>
 
 #if defined(__NVCC__)
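
Note (not part of the diff): the CentOS build break is consistent with a source file that uses a POSIX call declared in <unistd.h> while relying on another standard header to pull it in transitively; CentOS's older glibc/libstdc++ does not provide that transitive include. A minimal sketch of the failure mode, assuming the offending call is something like usleep() (illustrative only, not the exact TransferBench call site):

// Sketch (assumption): usleep() is declared in <unistd.h>. Without the explicit
// include this compiles on toolchains whose other headers drag <unistd.h> in,
// but fails on ones (such as CentOS's) that do not.
#include <unistd.h>

int main()
{
  usleep(1000);   // sleep for 1 millisecond; needs <unistd.h> for its declaration
  return 0;
}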
@@ -1416,7 +1417,6 @@ namespace {
     int const numStreamsToUse = (exeDevice.exeType == EXE_GPU_DMA ||
                                  (exeDevice.exeType == EXE_GPU_GFX && cfg.gfx.useMultiStream))
                                 ? exeInfo.resources.size() : 1;
     exeInfo.streams.resize(numStreamsToUse);
 
     // Create streams
@@ -1833,6 +1833,8 @@ namespace {
   // Execute a single GPU Transfer (when using 1 stream per Transfer)
   static ErrResult ExecuteGpuTransfer(int const iteration,
                                       hipStream_t const stream,
+                                      hipEvent_t const startEvent,
+                                      hipEvent_t const stopEvent,
                                       int const xccDim,
                                       ConfigOptions const& cfg,
                                       TransferResources& resources)
@@ -1844,33 +1846,41 @@ namespace {
     dim3 const blockSize(cfg.gfx.blockSize, 1);
 
 #if defined(__NVCC__)
+    if (startEvent != NULL)
+      ERR_CHECK(hipEventRecord(startEvent, stream));
     GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1]
       <<<gridSize, blockSize, 0, stream>>>
       (resources.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations);
+    if (stopEvent != NULL)
+      ERR_CHECK(hipEventRecord(stopEvent, stream));
 #else
     hipExtLaunchKernelGGL(GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1],
-                          gridSize, blockSize, 0, stream,
-                          NULL, NULL,
+                          gridSize, blockSize, 0, stream, startEvent, stopEvent,
                           0, resources.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations);
 #endif
     ERR_CHECK(hipStreamSynchronize(stream));
 
     auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
-    double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
+    double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
 
     if (iteration >= 0) {
+      double deltaMsec = cpuDeltaMsec;
+      if (startEvent != NULL) {
+        float gpuDeltaMsec;
+        ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
+        deltaMsec = gpuDeltaMsec;
+      }
       resources.totalDurationMsec += deltaMsec;
       if (cfg.general.recordPerIteration) {
        resources.perIterMsec.push_back(deltaMsec);
-#if !defined(__NVCC__)
        std::set<std::pair<int,int>> CUs;
        for (int i = 0; i < numSubExecs; i++) {
          CUs.insert(std::make_pair(resources.subExecParamGpuPtr[i].xccId,
                                    GetId(resources.subExecParamGpuPtr[i].hwId)));
        }
        resources.perIterCUs.push_back(CUs);
-#endif
       }
     }
     return ERR_NONE;
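
Note (not part of the diff): the startEvent/stopEvent pair threaded through ExecuteGpuTransfer follows the standard HIP event-timing pattern, so each stream's work is measured on the GPU rather than from the host clock. A minimal, self-contained sketch of that pattern (the function name and the omitted kernel are illustrative, not TransferBench APIs):

#include <hip/hip_runtime.h>

// Illustrative sketch: time work submitted to one stream using HIP events.
float TimeStreamMsec(hipStream_t stream)
{
  hipEvent_t start, stop;
  hipEventCreate(&start);
  hipEventCreate(&stop);

  hipEventRecord(start, stream);            // timestamp enqueued before the work
  // ... enqueue kernel(s) on 'stream' here (omitted) ...
  hipEventRecord(stop, stream);             // timestamp enqueued after the work

  hipStreamSynchronize(stream);             // wait until the stream (and both events) complete
  float msec = 0.0f;
  hipEventElapsedTime(&msec, start, stop);  // GPU-measured duration in milliseconds

  hipEventDestroy(start);
  hipEventDestroy(stop);
  return msec;
}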
@@ -1895,6 +1905,8 @@ namespace {
                                         ExecuteGpuTransfer,
                                         iteration,
                                         exeInfo.streams[i],
+                                        cfg.gfx.useHipEvents ? exeInfo.startEvents[i] : NULL,
+                                        cfg.gfx.useHipEvents ? exeInfo.stopEvents[i] : NULL,
                                         xccDim,
                                         std::cref(cfg),
                                         std::ref(exeInfo.resources[i])));
@@ -1931,7 +1943,7 @@ namespace {
     double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
 
     if (iteration >= 0) {
-      if (cfg.gfx.useHipEvents) {
+      if (cfg.gfx.useHipEvents && !cfg.gfx.useMultiStream) {
        float gpuDeltaMsec;
        ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, exeInfo.startEvents[0], exeInfo.stopEvents[0]));
        exeInfo.totalDurationMsec += gpuDeltaMsec;
@@ -1940,34 +1952,31 @@ namespace {
       }
 
       // Determine timing for each of the individual transfers that were part of this launch
+      if (!cfg.gfx.useMultiStream) {
        for (int i = 0; i < exeInfo.resources.size(); i++) {
          TransferResources& resources = exeInfo.resources[i];
          long long minStartCycle = std::numeric_limits<long long>::max();
          long long maxStopCycle = std::numeric_limits<long long>::min();
          std::set<std::pair<int, int>> CUs;
 
          for (auto subExecIdx : resources.subExecIdx) {
            minStartCycle = std::min(minStartCycle, exeInfo.subExecParamGpu[subExecIdx].startCycle);
            maxStopCycle = std::max(maxStopCycle, exeInfo.subExecParamGpu[subExecIdx].stopCycle);
            if (cfg.general.recordPerIteration) {
-#if !defined(__NVCC__)
              CUs.insert(std::make_pair(exeInfo.subExecParamGpu[subExecIdx].xccId,
                                        GetId(exeInfo.subExecParamGpu[subExecIdx].hwId)));
-#endif
            }
          }
          double deltaMsec = (maxStopCycle - minStartCycle) / (double)(exeInfo.wallClockRate);
          resources.totalDurationMsec += deltaMsec;
          if (cfg.general.recordPerIteration) {
            resources.perIterMsec.push_back(deltaMsec);
-#if !defined(__NVCC__)
            resources.perIterCUs.push_back(CUs);
-#endif
          }
        }
+      }
     }
     return ERR_NONE;
   }
...
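
Note (not part of the diff): the guards added above reflect that in multi-stream mode each Transfer runs on its own stream, so a single event pair recorded on exeInfo.streams[0] (or one start/stop cycle window spanning all sub-executors) no longer brackets all of the submitted work; per-Transfer timing instead comes from the events passed into ExecuteGpuTransfer. For illustration only, one hedged way to time a whole multi-stream launch is to bracket each stream separately and report the slowest one:

#include <hip/hip_runtime.h>
#include <algorithm>
#include <vector>

// Illustrative sketch (not TransferBench code): record one event pair per
// stream and take the maximum elapsed time as the duration of the launch.
float TimeMultiStreamMsec(std::vector<hipStream_t> const& streams)
{
  size_t const n = streams.size();
  std::vector<hipEvent_t> starts(n), stops(n);

  for (size_t i = 0; i < n; ++i) {
    hipEventCreate(&starts[i]);
    hipEventCreate(&stops[i]);
    hipEventRecord(starts[i], streams[i]);  // before this stream's work
    // ... enqueue this stream's kernel(s) here (omitted) ...
    hipEventRecord(stops[i], streams[i]);   // after this stream's work
  }

  float worstMsec = 0.0f;
  for (size_t i = 0; i < n; ++i) {
    hipEventSynchronize(stops[i]);          // wait for this stream's stop event
    float msec = 0.0f;
    hipEventElapsedTime(&msec, starts[i], stops[i]);
    worstMsec = std::max(worstMsec, msec);
    hipEventDestroy(starts[i]);
    hipEventDestroy(stops[i]);
  }
  return worstMsec;
}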