Unverified commit 47480a59, authored by gilbertlee-amd and committed by GitHub

v1.47 Fixing CUDA compilation (#83)

parent e9f51f2b
@@ -3,6 +3,11 @@
 Documentation for TransferBench is available at
 [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
 
+## v1.47
+### Fixes
+* Fixing CUDA support
+
 ## v1.46
 ### Fixes
...
@@ -13,7 +13,7 @@ else
 endif
 
 CXXFLAGS = -O3 -Iinclude -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
-NVFLAGS = -O3 -g -Iinclude -x cu -lnuma -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_75,code=sm_75
+NVFLAGS = -O3 -Iinclude -x cu -lnuma -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_75,code=sm_75
 LDFLAGS += -lpthread
 
 all: $(EXE)
...
@@ -1459,16 +1459,17 @@ void RunTransfer(EnvVars const& ev, int const iteration,
   // Otherwise, just launch the threadblocks associated with this single Transfer
   int const numBlocksToRun = ev.useSingleStream ? exeInfo.totalSubExecs : transfer->numSubExecs;
   int const numXCCs = (ev.useXccFilter ? ev.xccIdsPerDevice[exeIndex].size() : 1);
+  dim3 const gridSize(numXCCs, numBlocksToRun, 1);
+  dim3 const blockSize(ev.gfxBlockSize, 1, 1);
 
 #if defined(__NVCC__)
   HIP_CALL(hipEventRecord(startEvent, stream));
-  GpuKernelTable[ev.gfxBlockSize/warpSize - 1][ev.gfxUnroll - 1]
-    <<<numBlocksToRun, ev.gfxBlockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr, ev.waveOrder);
+  GpuKernelTable[ev.gfxBlockSize/64 - 1][ev.gfxUnroll - 1]
+    <<<gridSize, blockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr, ev.gfxWaveOrder);
   HIP_CALL(hipEventRecord(stopEvent, stream));
 #else
-  hipExtLaunchKernelGGL(GpuKernelTable[ev.gfxBlockSize/warpSize - 1][ev.gfxUnroll - 1],
-                        dim3(numXCCs, numBlocksToRun, 1),
-                        dim3(ev.gfxBlockSize, 1, 1),
+  hipExtLaunchKernelGGL(GpuKernelTable[ev.gfxBlockSize/64 - 1][ev.gfxUnroll - 1],
+                        gridSize, blockSize,
                         ev.sharedMemBytes, stream,
                         startEvent, stopEvent,
                         0, transfer->subExecParamGpuPtr, ev.gfxWaveOrder);
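
The CUDA path above now launches with the same 2D grid as the HIP path: blockIdx.y selects the sub-executor and blockIdx.x the XCC slot (a single value on NVIDIA), with hipEventRecord calls bracketing the triple-chevron launch, whereas hipExtLaunchKernelGGL accepts the start/stop events as arguments. Below is a minimal, self-contained sketch of that CUDA-side pattern; the kernel and variable names are illustrative, not TransferBench's.

```cuda
// Sketch only: a 2D grid timed with events on a stream, mirroring the
// NVCC branch above. DummyKernel and its arguments are made up.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void DummyKernel(int* out)
{
  // blockIdx.y plays the role of the sub-executor index; blockIdx.x would be
  // the XCC slot on AMD hardware and is fixed to a single value here.
  if (threadIdx.x == 0) out[blockIdx.y] = (int)blockIdx.y;
}

int main()
{
  int const numBlocksToRun = 4;
  dim3 const gridSize(1, numBlocksToRun, 1);  // (numXCCs, numBlocksToRun, 1)
  dim3 const blockSize(256, 1, 1);            // (gfxBlockSize, 1, 1)

  int* out;
  cudaMalloc(&out, numBlocksToRun * sizeof(int));

  cudaStream_t stream;
  cudaEvent_t startEvent, stopEvent;
  cudaStreamCreate(&stream);
  cudaEventCreate(&startEvent);
  cudaEventCreate(&stopEvent);

  // Same shape as the NVCC branch: record, launch with <<<grid, block>>>, record.
  cudaEventRecord(startEvent, stream);
  DummyKernel<<<gridSize, blockSize, 0, stream>>>(out);
  cudaEventRecord(stopEvent, stream);
  cudaEventSynchronize(stopEvent);

  float ms = 0.0f;
  cudaEventElapsedTime(&ms, startEvent, stopEvent);
  printf("Kernel time: %.3f ms\n", ms);

  cudaEventDestroy(startEvent);
  cudaEventDestroy(stopEvent);
  cudaStreamDestroy(stream);
  cudaFree(out);
  return 0;
}
```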
@@ -1994,9 +1995,9 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
       if (ev.a2aDirect)
       {
-#if !defined(__NVCC__)
         if (i == j) continue;
+#if !defined(__NVCC__)
         uint32_t linkType, hopCount;
         HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(i, false),
                                               RemappedIndex(j, false),
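
hipExtGetLinkTypeAndHopCount is a ROCm-specific extension with no CUDA counterpart, which is why the topology query stays behind #if !defined(__NVCC__) while the i == j skip now runs in both builds. Purely as an illustration (not what TransferBench does under CUDA), a rough "direct link" signal is available from the peer-access API:

```cuda
// Illustration only -- not TransferBench code. Under CUDA there is no
// hipExtGetLinkTypeAndHopCount, but cudaDeviceCanAccessPeer gives a coarse
// "direct peer link" signal between two devices.
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
  int numGpus = 0;
  cudaGetDeviceCount(&numGpus);

  for (int i = 0; i < numGpus; i++)
    for (int j = 0; j < numGpus; j++)
    {
      if (i == j) continue;
      int canAccess = 0;
      cudaDeviceCanAccessPeer(&canAccess, i, j);
      printf("GPU %d -> GPU %d: peer access %s\n",
             i, j, canAccess ? "available" : "unavailable");
    }
  return 0;
}
```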
@@ -2460,7 +2461,6 @@ void RunSchmooBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int
 void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
 {
-  char memType = ev.useFineGrain ? 'F' : 'G';
   printf("Bytes to write: %lu from GPU %d using %d CUs [Sweeping %d to %d parallel writes]\n", numBytesPerTransfer, srcIdx, numSubExecs, minGpus, maxGpus);
   char sep = (ev.outputToCsv ? ',' : ' ');
...
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #include "Compatibility.hpp"
 #include "Kernels.hpp"
 
-#define TB_VERSION "1.46"
+#define TB_VERSION "1.47"
 
 extern char const MemTypeStr[];
 extern char const ExeTypeStr[];
@@ -530,8 +530,7 @@ public:
     for (int i = 0; i < numDetectedGpus; i++)
     {
 #if defined(__NVCC__)
-      // NOTE: wallClock doesn't exist in CUDA. This may need to be adjusted / run with fixed clocks
-      wallClockPerDeviceMhz[i] = 1410000;
+      wallClockPerDeviceMhz[i] = 1000000;
 #else
       hipDeviceProp_t prop;
       HIP_CALL(hipGetDeviceProperties(&prop, i));
...
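
Under CUDA the new GetTimestamp() helper (added in Kernels.hpp below) reads %globaltimer, a nanosecond-resolution counter, so the hard-coded 1000000 reads naturally as a clock rate in kHz (1 GHz); the previous 1410000 corresponds to a fixed 1.41 GHz clock in the same units. Assuming kHz, converting timestamp deltas to milliseconds is a single division, sketched here with made-up names:

```cuda
// Sketch with assumed units: treat the stored rate as kHz, so the number of
// ticks per millisecond equals the rate itself. TicksToMs is a made-up helper.
#include <cstdint>
#include <cstdio>

static double TicksToMs(int64_t startTick, int64_t stopTick, int clockRateKhz)
{
  // kHz == ticks per millisecond, so the conversion is a single division.
  return (stopTick - startTick) / static_cast<double>(clockRateKhz);
}

int main()
{
  int const cudaGlobalTimerKhz = 1000000;   // 1 GHz -> 1 tick per nanosecond
  int64_t const start = 0;
  int64_t const stop  = 2500000;            // 2.5e6 ticks later
  printf("Elapsed: %.3f ms\n", TicksToMs(start, stop, cudaGlobalTimerKhz)); // 2.500 ms
  return 0;
}
```

On the HIP side the rate presumably comes from the device properties queried in the #else branch rather than from a constant.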
@@ -29,6 +29,10 @@ THE SOFTWARE.
 #define MEMSET_VAL 13323083.0f
 
+#if defined(__NVCC__)
+#define warpSize 32
+#endif
+
 #define MAX_WAVEGROUPS MAX_BLOCKSIZE / warpSize
 #define MAX_UNROLL 8
 #define NUM_WAVEORDERS 6
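
The NVCC-only #define above works around the fact that CUDA's warpSize is a built-in device-side variable, not an integral constant, and is not defined in host code at all; pinning it to 32 (the warp width on current NVIDIA GPUs) lets expressions such as MAX_WAVEGROUPS remain compile-time constants. A small sketch of why that matters, using made-up names:

```cuda
// Why a pinned warp size helps under nvcc: compile-time expressions (array
// bounds, #define'd limits) cannot use the built-in warpSize variable, but
// they can use a fixed constant. All names here are illustrative.
#include <cuda_runtime.h>

#define SKETCH_BLOCKSIZE 512
#define SKETCH_WARPSIZE  32   // stand-in for the pinned warpSize

__global__ void PerWaveKernel(int* out)
{
  // SKETCH_BLOCKSIZE / SKETCH_WARPSIZE must be an integral constant to be a
  // valid shared-memory array bound; the built-in warpSize variable is not.
  __shared__ int perWave[SKETCH_BLOCKSIZE / SKETCH_WARPSIZE];

  int const wave = threadIdx.x / SKETCH_WARPSIZE;
  if (threadIdx.x % SKETCH_WARPSIZE == 0) perWave[wave] = wave;
  __syncthreads();
  if (threadIdx.x == 0) out[0] = perWave[SKETCH_BLOCKSIZE / SKETCH_WARPSIZE - 1];
}

int main()
{
  int* out;
  cudaMalloc(&out, sizeof(int));
  PerWaveKernel<<<1, SKETCH_BLOCKSIZE>>>(out);
  cudaDeviceSynchronize();
  cudaFree(out);
  return 0;
}
```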
@@ -44,7 +48,7 @@ struct SubExecParam
   int numDsts;              // Number of destination arrays
   float* src[MAX_SRCS];     // Source array pointers
   float* dst[MAX_DSTS];     // Destination array pointers
-  uint32_t preferredXccId;  // XCC ID to execute on
+  int32_t preferredXccId;   // XCC ID to execute on
 
   // Prepared
   int teamSize;             // Index of this sub executor amongst team
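
preferredXccId uses -1 as a "no preference" sentinel (see the filter in GpuReduceKernel below), so a signed field expresses that sentinel directly instead of relying on unsigned wrap-around. A tiny illustration with an assumed, simplified struct:

```cuda
#include <cstdint>
#include <cstdio>

// Assumed, simplified stand-in for the real SubExecParam.
struct SketchParam
{
  int32_t preferredXccId;   // -1 == run on any XCC
};

int main()
{
  SketchParam p{-1};
  // With int32_t the sentinel test reads exactly as intended; with uint32_t
  // the same test only holds because -1 wraps to 0xFFFFFFFF.
  printf("XCC filter %s\n", (p.preferredXccId != -1) ? "enabled" : "disabled");
  return 0;
}
```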
@@ -133,6 +137,17 @@ PrepSrcDataKernel(float* ptr, size_t N, int srcBufferIdx)
   }
 }
 
+__device__ int64_t GetTimestamp()
+{
+#if defined(__NVCC__)
+  int64_t result;
+  asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(result));
+  return result;
+#else
+  return wall_clock64();
+#endif
+}
+
 // Helper function for memset
 template <typename T> __device__ __forceinline__ T MemsetVal();
 template <> __device__ __forceinline__ float MemsetVal(){ return MEMSET_VAL; };
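
A self-contained sketch of the CUDA branch of GetTimestamp() in use: a block stamps its own start and stop times from %globaltimer, which PTX defines as a 64-bit nanosecond timer, and the host converts the delta to milliseconds. Kernel and variable names below are illustrative:

```cuda
// Sketch only: device-side timestamps read via the same inline PTX as the
// GetTimestamp() helper above, reported by the host after synchronization.
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

__device__ int64_t ReadGlobalTimer()
{
  int64_t t;
  asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(t));
  return t;
}

__global__ void TimedCopyKernel(float const* src, float* dst, size_t n,
                                int64_t* startStop)
{
  if (threadIdx.x == 0) startStop[0] = ReadGlobalTimer();
  __syncthreads();

  // Simple strided copy performed by a single block.
  for (size_t i = threadIdx.x; i < n; i += blockDim.x)
    dst[i] = src[i];

  __syncthreads();
  if (threadIdx.x == 0) startStop[1] = ReadGlobalTimer();
}

int main()
{
  size_t const n = 1 << 20;
  float *src, *dst;
  int64_t* startStop;
  cudaMalloc(&src, n * sizeof(float));
  cudaMalloc(&dst, n * sizeof(float));
  cudaMallocManaged(&startStop, 2 * sizeof(int64_t));

  TimedCopyKernel<<<1, 256>>>(src, dst, n, startStop);
  cudaDeviceSynchronize();

  // %globaltimer ticks are nanoseconds, i.e. a 1000000 kHz clock.
  printf("Block 0 copy time: %.3f ms\n", (startStop[1] - startStop[0]) / 1.0e6);

  cudaFree(src);
  cudaFree(dst);
  cudaFree(startStop);
  return 0;
}
```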
@@ -143,14 +158,16 @@ __global__ void __launch_bounds__(BLOCKSIZE)
 GpuReduceKernel(SubExecParam* params, int waveOrder)
 {
   int64_t startCycle;
-  if (threadIdx.x == 0) startCycle = wall_clock64();
+  if (threadIdx.x == 0) startCycle = GetTimestamp();
 
   SubExecParam& p = params[blockIdx.y];
 
   // (Experimental) Filter by XCC if desired
+#if !defined(__NVCC__)
   int32_t xccId;
   GetXccId(xccId);
   if (p.preferredXccId != -1 && xccId != p.preferredXccId) return;
+#endif
 
   // Collect data information
   int32_t const numSrcs = p.numSrcs;
@@ -168,7 +185,6 @@ __global__ void __launch_bounds__(BLOCKSIZE)
   int32_t const tIdx = threadIdx.x % warpSize; // Thread index within wavefront
 
   size_t const numFloat4 = p.N / 4;
-  int32_t const nFlt4PerWave = warpSize * 4;
 
   int32_t teamStride, waveStride, unrlStride, teamStride2, waveStride2;
   switch (waveOrder)
@@ -266,7 +282,7 @@ __global__ void __launch_bounds__(BLOCKSIZE)
   if (threadIdx.x == 0)
   {
     __threadfence_system();
-    p.stopCycle = wall_clock64();
+    p.stopCycle = GetTimestamp();
     p.startCycle = startCycle;
     p.xccId = xccId;
     __trace_hwreg();
...