Unverified Commit 3f8d00df authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

v1.65 Adding warp-subexecutor support via GFX_SE_TYPE (#212)


Co-authored-by: Weile <weile.wei@amd.com>
parent 42b7d85b
...@@ -3,6 +3,12 @@ ...@@ -3,6 +3,12 @@
Documentation for TransferBench is available at Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench). [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.65.00
### Added
- Added warp-level dispatch support via GFX_SE_TYPE environment variable
- GFX_SE_TYPE=0 (default): Threadblock-level dispatch, each subexecutor is a threadblock
- GFX_SE_TYPE=1: Warp-level dispatch, each subexecutor is a single warp
## v1.64.00 ## v1.64.00
### Added ### Added
- Added BLOCKSIZES to a2asweep preset to allow also sweeping over threadblock sizes - Added BLOCKSIZES to a2asweep preset to allow also sweeping over threadblock sizes
......
...@@ -9,7 +9,7 @@ if (NOT CMAKE_TOOLCHAIN_FILE) ...@@ -9,7 +9,7 @@ if (NOT CMAKE_TOOLCHAIN_FILE)
message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}") message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}")
endif() endif()
set(VERSION_STRING "1.64.00") set(VERSION_STRING "1.65.00")
project(TransferBench VERSION ${VERSION_STRING} LANGUAGES CXX) project(TransferBench VERSION ${VERSION_STRING} LANGUAGES CXX)
## Load CMake modules ## Load CMake modules
......
...@@ -89,6 +89,7 @@ public: ...@@ -89,6 +89,7 @@ public:
int gfxBlockSize; // Size of each threadblock (must be multiple of 64) int gfxBlockSize; // Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask; // Bit-vector representing the CU mask vector<uint32_t> cuMask; // Bit-vector representing the CU mask
vector<vector<int>> prefXccTable; // Specifies XCC to use for given exe->dst pair vector<vector<int>> prefXccTable; // Specifies XCC to use for given exe->dst pair
int gfxSeType; // GFX subexecutor type (0=threadblock, 1=warp)
int gfxTemporal; // Non-temporal load/store mode (0=none, 1=load, 2=store, 3=both) int gfxTemporal; // Non-temporal load/store mode (0=none, 1=load, 2=store, 3=both)
int gfxUnroll; // GFX-kernel unroll factor int gfxUnroll; // GFX-kernel unroll factor
int useHipEvents; // Use HIP events for timing GFX/DMA Executor int useHipEvents; // Use HIP events for timing GFX/DMA Executor
...@@ -141,6 +142,7 @@ public: ...@@ -141,6 +142,7 @@ public:
fillCompress = GetEnvVarArray("FILL_COMPRESS" , {}); fillCompress = GetEnvVarArray("FILL_COMPRESS" , {});
gfxBlockOrder = GetEnvVar("GFX_BLOCK_ORDER" , 0); gfxBlockOrder = GetEnvVar("GFX_BLOCK_ORDER" , 0);
gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256); gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256);
gfxSeType = GetEnvVar("GFX_SE_TYPE" , 0);
gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 1); gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 1);
gfxTemporal = GetEnvVar("GFX_TEMPORAL" , 0); gfxTemporal = GetEnvVar("GFX_TEMPORAL" , 0);
gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll); gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll);
...@@ -320,6 +322,7 @@ public: ...@@ -320,6 +322,7 @@ public:
printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n"); printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n");
printf(" GFX_BLOCK_ORDER - How blocks for transfers are ordered. 0=sequential, 1=interleaved\n"); printf(" GFX_BLOCK_ORDER - How blocks for transfers are ordered. 0=sequential, 1=interleaved\n");
printf(" GFX_BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n"); printf(" GFX_BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n");
printf(" GFX_SE_TYPE - SubExecutor granularity type (0=threadblock, 1=warp)\n");
printf(" GFX_TEMPORAL - Use of non-temporal loads or stores (0=none 1=loads 2=stores 3=both)\n"); printf(" GFX_TEMPORAL - Use of non-temporal loads or stores (0=none 1=loads 2=stores 3=both)\n");
printf(" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d\n", TransferBench::GetIntAttribute(ATR_GFX_MAX_UNROLL)); printf(" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d\n", TransferBench::GetIntAttribute(ATR_GFX_MAX_UNROLL));
printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays\n"); printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays\n");
...@@ -411,6 +414,8 @@ public: ...@@ -411,6 +414,8 @@ public:
"Thread block ordering: %s", gfxBlockOrder == 0 ? "Sequential" : "Interleaved"); "Thread block ordering: %s", gfxBlockOrder == 0 ? "Sequential" : "Interleaved");
Print("GFX_BLOCK_SIZE", gfxBlockSize, Print("GFX_BLOCK_SIZE", gfxBlockSize,
"Threadblock size of %d", gfxBlockSize); "Threadblock size of %d", gfxBlockSize);
Print("GFX_SE_TYPE", gfxSeType,
"SubExecutor granularity: %s", gfxSeType == 0 ? "Threadblock" : "Warp");
Print("GFX_SINGLE_TEAM", gfxSingleTeam, Print("GFX_SINGLE_TEAM", gfxSingleTeam,
"%s", (gfxSingleTeam ? "Combining CUs to work across entire data array" : "%s", (gfxSingleTeam ? "Combining CUs to work across entire data array" :
"Each CUs operates on its own disjoint subarray")); "Each CUs operates on its own disjoint subarray"));
...@@ -619,6 +624,7 @@ public: ...@@ -619,6 +624,7 @@ public:
cfg.gfx.blockSize = gfxBlockSize; cfg.gfx.blockSize = gfxBlockSize;
cfg.gfx.cuMask = cuMask; cfg.gfx.cuMask = cuMask;
cfg.gfx.prefXccTable = prefXccTable; cfg.gfx.prefXccTable = prefXccTable;
cfg.gfx.seType = gfxSeType;
cfg.gfx.unrollFactor = gfxUnroll; cfg.gfx.unrollFactor = gfxUnroll;
cfg.gfx.temporalMode = gfxTemporal; cfg.gfx.temporalMode = gfxTemporal;
cfg.gfx.useHipEvents = useHipEvents; cfg.gfx.useHipEvents = useHipEvents;
......
...@@ -66,7 +66,7 @@ namespace TransferBench ...@@ -66,7 +66,7 @@ namespace TransferBench
using std::set; using std::set;
using std::vector; using std::vector;
constexpr char VERSION[] = "1.64"; constexpr char VERSION[] = "1.65";
/** /**
* Enumeration of supported Executor types * Enumeration of supported Executor types
...@@ -180,6 +180,7 @@ namespace TransferBench ...@@ -180,6 +180,7 @@ namespace TransferBench
int blockSize = 256; ///< Size of each threadblock (must be multiple of 64) int blockSize = 256; ///< Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask = {}; ///< Bit-vector representing the CU mask vector<uint32_t> cuMask = {}; ///< Bit-vector representing the CU mask
vector<vector<int>> prefXccTable = {}; ///< 2D table with preferred XCD to use for a specific [src][dst] GPU device vector<vector<int>> prefXccTable = {}; ///< 2D table with preferred XCD to use for a specific [src][dst] GPU device
int seType = 0; ///< SubExecutor granularity type (0=threadblock, 1=warp)
int temporalMode = 0; ///< Non-temporal load/store mode 0=none, 1=load, 2=store, 3=both int temporalMode = 0; ///< Non-temporal load/store mode 0=none, 1=load, 2=store, 3=both
int unrollFactor = 4; ///< GFX-kernel unroll factor int unrollFactor = 4; ///< GFX-kernel unroll factor
int useHipEvents = 1; ///< Use HIP events for timing GFX Executor int useHipEvents = 1; ///< Use HIP events for timing GFX Executor
...@@ -459,6 +460,7 @@ namespace TransferBench ...@@ -459,6 +460,7 @@ namespace TransferBench
// Enumerations // Enumerations
#define hipDeviceAttributeClockRate cudaDevAttrClockRate #define hipDeviceAttributeClockRate cudaDevAttrClockRate
#define hipDeviceAttributeMultiprocessorCount cudaDevAttrMultiProcessorCount #define hipDeviceAttributeMultiprocessorCount cudaDevAttrMultiProcessorCount
#define hipDeviceAttributeWarpSize cudaDevAttrWarpSize
#define hipErrorPeerAccessAlreadyEnabled cudaErrorPeerAccessAlreadyEnabled #define hipErrorPeerAccessAlreadyEnabled cudaErrorPeerAccessAlreadyEnabled
#define hipFuncCachePreferShared cudaFuncCachePreferShared #define hipFuncCachePreferShared cudaFuncCachePreferShared
#define hipMemcpyDefault cudaMemcpyDefault #define hipMemcpyDefault cudaMemcpyDefault
...@@ -608,6 +610,38 @@ namespace { ...@@ -608,6 +610,38 @@ namespace {
int constexpr MEMSET_CHAR = 75; // Value to memset (char) int constexpr MEMSET_CHAR = 75; // Value to memset (char)
float constexpr MEMSET_VAL = 13323083.0f; // Value to memset (double) float constexpr MEMSET_VAL = 13323083.0f; // Value to memset (double)
// Returns the wavefront (warp) size of the given GPU device.
//
// Queries the HIP/CUDA runtime for the device's warp size. If the query fails
// (or reports a non-positive size), appends a warning to 'errors' (when
// provided) and falls back to the compile-time default: 32 on NVIDIA,
// 64 on AMD.
//
// errors   - optional list to append a warning to on query failure
// deviceId - device to query (defaults to device 0 for backward compatibility)
int GetWarpSize(std::vector<ErrResult>* errors = nullptr, int deviceId = 0) {
  int warpSize = 0;
  hipError_t err = hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, deviceId);
  // Guard warpSize > 0 as well: a zero warp size would cause a division by
  // zero in callers that compute warps-per-block
  if (err == hipSuccess && warpSize > 0) {
    return warpSize;
  }

  // Query failed, report error and fall back to compile-time default
  if (errors) {
    errors->push_back({ERR_WARN,
                       "Failed to query device warp size (hipDeviceGetAttribute error: %d). "
                       "Falling back to compile-time default", err});
  }
#if defined(__NVCC__)
  return 32;
#else
  return 64;
#endif
}
// Calculate grid Y dimension based on SE_TYPE (subexecutor granularity).
//
// seType      - 0: threadblock-level dispatch (one threadblock per subexecutor)
//               1: warp-level dispatch (warps packed into threadblocks)
// blockSize   - # of threads per threadblock
// numSubExecs - total number of subexecutors requested
//
// Returns the number of threadblocks (grid Y dimension) needed to host
// 'numSubExecs' subexecutors.
int CalculateGridY(int seType, int blockSize, int numSubExecs) {
  // Warp-level: each subexecutor is a warp, pack warps into threadblocks
  if (seType == 1) {
    int warpsPerBlock = blockSize / GetWarpSize();
    // Guard against blockSize smaller than the warp size, which would
    // otherwise cause a division by zero below
    if (warpsPerBlock < 1) warpsPerBlock = 1;
    // Round up so a partially-filled final threadblock is still launched
    return (numSubExecs + warpsPerBlock - 1) / warpsPerBlock;
  }
  // Default: Threadblock-level, each subexecutor is a threadblock
  return numSubExecs;
}
// Parsing-related functions // Parsing-related functions
//======================================================================================== //========================================================================================
...@@ -1303,11 +1337,17 @@ namespace { ...@@ -1303,11 +1337,17 @@ namespace {
{ {
// Check total number of subexecutors requested // Check total number of subexecutors requested
int numGpuSubExec = GetNumSubExecutors(exeDevice); int numGpuSubExec = GetNumSubExecutors(exeDevice);
// For warp-level dispatch, multiply by warps per threadblock
if (cfg.gfx.seType == 1) {
int warpsPerBlock = cfg.gfx.blockSize / GetWarpSize(&errors);
numGpuSubExec *= warpsPerBlock;
}
if (totalSubExecs[exeDevice] > numGpuSubExec) if (totalSubExecs[exeDevice] > numGpuSubExec)
errors.push_back({ERR_WARN, errors.push_back({ERR_WARN,
"GPU %d requests %d total CUs however only %d available. " "GPU %d requests %d total %s however only %d available. "
"Serialization will occur", "Serialization will occur",
exeDevice.exeIndex, totalSubExecs[exeDevice], numGpuSubExec}); exeDevice.exeIndex, totalSubExecs[exeDevice],
cfg.gfx.seType == 0 ? "CUs" : "warps", numGpuSubExec});
// Check that if executor subindices are used, all Transfers specify executor subindices // Check that if executor subindices are used, all Transfers specify executor subindices
if (useSubIndexCount[exeDevice] > 0 && useSubIndexCount[exeDevice] != transferCount[exeDevice]) { if (useSubIndexCount[exeDevice] > 0 && useSubIndexCount[exeDevice] != transferCount[exeDevice]) {
errors.push_back({ERR_FATAL, errors.push_back({ERR_FATAL,
...@@ -2977,12 +3017,29 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -2977,12 +3017,29 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Kernel for GFX execution // Kernel for GFX execution
template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE> template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE>
__global__ void __launch_bounds__(BLOCKSIZE) __global__ void __launch_bounds__(BLOCKSIZE)
GpuReduceKernel(SubExecParam* params, int waveOrder, int numSubIterations) GpuReduceKernel(SubExecParam* params, int seType, int waveOrder, int numSubIterations)
{ {
int64_t startCycle; int64_t startCycle;
if (threadIdx.x == 0) startCycle = GetTimestamp(); // For warp-level, each warp's first thread records timing; for threadblock-level, only first thread of block
bool shouldRecordTiming = (seType == 1) ? (threadIdx.x % warpSize == 0) : (threadIdx.x == 0);
if (shouldRecordTiming) startCycle = GetTimestamp();
// seType: 0=threadblock, 1=warp
int subExecIdx;
if (seType == 0) {
// Threadblock-level: each threadblock is a subexecutor
subExecIdx = blockIdx.y;
} else {
// Warp-level: each warp is a subexecutor
int warpIdx = threadIdx.x / warpSize;
int warpsPerBlock = BLOCKSIZE / warpSize;
subExecIdx = blockIdx.y * warpsPerBlock + warpIdx;
}
SubExecParam& p = params[subExecIdx];
SubExecParam& p = params[blockIdx.y]; // For warp-level dispatch, inactive warps should return early
if (seType == 1 && p.N == 0) return;
// Filter by XCC // Filter by XCC
#if !defined(__NVCC__) #if !defined(__NVCC__)
...@@ -3002,8 +3059,16 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3002,8 +3059,16 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Operate on wavefront granularity // Operate on wavefront granularity
int32_t const nTeams = p.teamSize; // Number of threadblocks working together on this subarray int32_t const nTeams = p.teamSize; // Number of threadblocks working together on this subarray
int32_t const teamIdx = p.teamIdx; // Index of this threadblock within the team int32_t const teamIdx = p.teamIdx; // Index of this threadblock within the team
int32_t const nWaves = BLOCKSIZE / warpSize; // Number of wavefronts within this threadblock int32_t nWaves, waveIdx;
int32_t const waveIdx = threadIdx.x / warpSize; // Index of this wavefront within the threadblock if (seType == 0) {
// Threadblock-level: all wavefronts in block work together
nWaves = BLOCKSIZE / warpSize; // Number of wavefronts within this threadblock
waveIdx = threadIdx.x / warpSize; // Index of this wavefront within the threadblock
} else {
// Warp-level: each warp works independently
nWaves = 1;
waveIdx = 0;
}
int32_t const tIdx = threadIdx.x % warpSize; // Thread index within wavefront int32_t const tIdx = threadIdx.x % warpSize; // Thread index within wavefront
size_t const numPackedFloat = p.N / (sizeof(PACKED_FLOAT)/sizeof(float)); size_t const numPackedFloat = p.N / (sizeof(PACKED_FLOAT)/sizeof(float));
...@@ -3106,8 +3171,15 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3106,8 +3171,15 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
} }
// Wait for all threads to finish // Wait for all threads to finish
if (seType == 1) {
// For warp-level, sync within warp only
__syncwarp();
} else {
// For threadblock-level, sync all threads
__syncthreads(); __syncthreads();
if (threadIdx.x == 0) { }
if (shouldRecordTiming) {
__threadfence_system(); __threadfence_system();
p.stopCycle = GetTimestamp(); p.stopCycle = GetTimestamp();
p.startCycle = startCycle; p.startCycle = startCycle;
...@@ -3137,8 +3209,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3137,8 +3209,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 7), \ GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 7), \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 8)} GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 8)}
// Table of all GPU Reduction kernel functions (templated blocksize / unroll / dword size) // Table of all GPU Reduction kernel functions (templated blocksize / unroll / dword size / temporal)
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int); typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int, int);
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL][3][4] = GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL][3][4] =
{ {
GPU_KERNEL_UNROLL_DECL(64), GPU_KERNEL_UNROLL_DECL(64),
...@@ -3161,6 +3233,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3161,6 +3233,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
#undef GPU_KERNEL_UNROLL_DECL #undef GPU_KERNEL_UNROLL_DECL
#undef GPU_KERNEL_DWORD_DECL #undef GPU_KERNEL_DWORD_DECL
#undef GPU_KERNEL_TEMPORAL_DECL #undef GPU_KERNEL_TEMPORAL_DECL
#undef GPU_KERNEL_SE_TYPE_DECL
// Execute a single GPU Transfer (when using 1 stream per Transfer) // Execute a single GPU Transfer (when using 1 stream per Transfer)
static ErrResult ExecuteGpuTransfer(int const iteration, static ErrResult ExecuteGpuTransfer(int const iteration,
...@@ -3174,7 +3247,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3174,7 +3247,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
auto cpuStart = std::chrono::high_resolution_clock::now(); auto cpuStart = std::chrono::high_resolution_clock::now();
int numSubExecs = rss.subExecParamCpu.size(); int numSubExecs = rss.subExecParamCpu.size();
dim3 const gridSize(xccDim, numSubExecs, 1); int gridY = CalculateGridY(cfg.gfx.seType, cfg.gfx.blockSize, numSubExecs);
dim3 const gridSize(xccDim, gridY, 1);
dim3 const blockSize(cfg.gfx.blockSize, 1); dim3 const blockSize(cfg.gfx.blockSize, 1);
int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 : int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 :
...@@ -3185,12 +3259,12 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3185,12 +3259,12 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
#if defined(__NVCC__) #if defined(__NVCC__)
if (startEvent != NULL) if (startEvent != NULL)
ERR_CHECK(hipEventRecord(startEvent, stream)); ERR_CHECK(hipEventRecord(startEvent, stream));
gpuKernel<<<gridSize, blockSize, 0, stream>>>(rss.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations); gpuKernel<<<gridSize, blockSize, 0, stream>>>(rss.subExecParamGpuPtr, cfg.gfx.seType, cfg.gfx.waveOrder, cfg.general.numSubIterations);
if (stopEvent != NULL) if (stopEvent != NULL)
ERR_CHECK(hipEventRecord(stopEvent, stream)); ERR_CHECK(hipEventRecord(stopEvent, stream));
#else #else
hipExtLaunchKernelGGL(gpuKernel, gridSize, blockSize, 0, stream, startEvent, stopEvent, hipExtLaunchKernelGGL(gpuKernel, gridSize, blockSize, 0, stream, startEvent, stopEvent,
0, rss.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations); 0, rss.subExecParamGpuPtr, cfg.gfx.seType, cfg.gfx.waveOrder, cfg.general.numSubIterations);
#endif #endif
ERR_CHECK(hipStreamSynchronize(stream)); ERR_CHECK(hipStreamSynchronize(stream));
...@@ -3249,7 +3323,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3249,7 +3323,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
} else { } else {
// Combine all the Transfers into a single kernel launch // Combine all the Transfers into a single kernel launch
int numSubExecs = exeInfo.totalSubExecs; int numSubExecs = exeInfo.totalSubExecs;
dim3 const gridSize(xccDim, numSubExecs, 1); int gridY = CalculateGridY(cfg.gfx.seType, cfg.gfx.blockSize, numSubExecs);
dim3 const gridSize(xccDim, gridY, 1);
dim3 const blockSize(cfg.gfx.blockSize, 1); dim3 const blockSize(cfg.gfx.blockSize, 1);
hipStream_t stream = exeInfo.streams[0]; hipStream_t stream = exeInfo.streams[0];
...@@ -3261,14 +3336,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3261,14 +3336,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
#if defined(__NVCC__) #if defined(__NVCC__)
if (cfg.gfx.useHipEvents) if (cfg.gfx.useHipEvents)
ERR_CHECK(hipEventRecord(exeInfo.startEvents[0], stream)); ERR_CHECK(hipEventRecord(exeInfo.startEvents[0], stream));
gpuKernel<<<gridSize, blockSize, 0 , stream>>>(exeInfo.subExecParamGpu, cfg.gfx.waveOrder, cfg.general.numSubIterations); gpuKernel<<<gridSize, blockSize, 0 , stream>>>(exeInfo.subExecParamGpu, cfg.gfx.seType, cfg.gfx.waveOrder, cfg.general.numSubIterations);
if (cfg.gfx.useHipEvents) if (cfg.gfx.useHipEvents)
ERR_CHECK(hipEventRecord(exeInfo.stopEvents[0], stream)); ERR_CHECK(hipEventRecord(exeInfo.stopEvents[0], stream));
#else #else
hipExtLaunchKernelGGL(gpuKernel, gridSize, blockSize, 0, stream, hipExtLaunchKernelGGL(gpuKernel, gridSize, blockSize, 0, stream,
cfg.gfx.useHipEvents ? exeInfo.startEvents[0] : NULL, cfg.gfx.useHipEvents ? exeInfo.startEvents[0] : NULL,
cfg.gfx.useHipEvents ? exeInfo.stopEvents[0] : NULL, 0, cfg.gfx.useHipEvents ? exeInfo.stopEvents[0] : NULL, 0,
exeInfo.subExecParamGpu, cfg.gfx.waveOrder, cfg.general.numSubIterations); exeInfo.subExecParamGpu, cfg.gfx.seType, cfg.gfx.waveOrder, cfg.general.numSubIterations);
#endif #endif
ERR_CHECK(hipStreamSynchronize(stream)); ERR_CHECK(hipStreamSynchronize(stream));
} }
...@@ -4026,6 +4101,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -4026,6 +4101,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
#undef hipDeviceAttributeClockRate #undef hipDeviceAttributeClockRate
#undef hipDeviceAttributeMaxSharedMemoryPerMultiprocessor #undef hipDeviceAttributeMaxSharedMemoryPerMultiprocessor
#undef hipDeviceAttributeMultiprocessorCount #undef hipDeviceAttributeMultiprocessorCount
#undef hipDeviceAttributeWarpSize
#undef hipErrorPeerAccessAlreadyEnabled #undef hipErrorPeerAccessAlreadyEnabled
#undef hipFuncCachePreferShared #undef hipFuncCachePreferShared
#undef hipMemcpyDefault #undef hipMemcpyDefault
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment