Unverified commit 47480a59, authored by gilbertlee-amd and committed via GitHub
Browse files

v1.47 Fixing CUDA compilation (#83)

parent e9f51f2b
......@@ -3,6 +3,11 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.47
### Fixes
* Fixing CUDA support
## v1.46
### Fixes
......
......@@ -13,7 +13,7 @@ else
endif
CXXFLAGS = -O3 -Iinclude -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
NVFLAGS = -O3 -g -Iinclude -x cu -lnuma -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_75,code=sm_75
NVFLAGS = -O3 -Iinclude -x cu -lnuma -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_75,code=sm_75
LDFLAGS += -lpthread
all: $(EXE)
......
......@@ -1459,16 +1459,17 @@ void RunTransfer(EnvVars const& ev, int const iteration,
// Otherwise, just launch the threadblocks associated with this single Transfer
int const numBlocksToRun = ev.useSingleStream ? exeInfo.totalSubExecs : transfer->numSubExecs;
int const numXCCs = (ev.useXccFilter ? ev.xccIdsPerDevice[exeIndex].size() : 1);
dim3 const gridSize(numXCCs, numBlocksToRun, 1);
dim3 const blockSize(ev.gfxBlockSize, 1, 1);
#if defined(__NVCC__)
HIP_CALL(hipEventRecord(startEvent, stream));
GpuKernelTable[ev.gfxBlockSize/warpSize - 1][ev.gfxUnroll - 1]
<<<numBlocksToRun, ev.gfxBlockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr, ev.waveOrder);
GpuKernelTable[ev.gfxBlockSize/64 - 1][ev.gfxUnroll - 1]
<<<gridSize, blockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr, ev.gfxWaveOrder);
HIP_CALL(hipEventRecord(stopEvent, stream));
#else
hipExtLaunchKernelGGL(GpuKernelTable[ev.gfxBlockSize/warpSize - 1][ev.gfxUnroll - 1],
dim3(numXCCs, numBlocksToRun, 1),
dim3(ev.gfxBlockSize, 1, 1),
hipExtLaunchKernelGGL(GpuKernelTable[ev.gfxBlockSize/64 - 1][ev.gfxUnroll - 1],
gridSize, blockSize,
ev.sharedMemBytes, stream,
startEvent, stopEvent,
0, transfer->subExecParamGpuPtr, ev.gfxWaveOrder);
......@@ -1994,9 +1995,9 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
if (ev.a2aDirect)
{
#if !defined(__NVCC__)
if (i == j) continue;
#if !defined(__NVCC__)
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(i, false),
RemappedIndex(j, false),
......@@ -2460,7 +2461,6 @@ void RunSchmooBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int
void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
{
char memType = ev.useFineGrain ? 'F' : 'G';
printf("Bytes to write: %lu from GPU %d using %d CUs [Sweeping %d to %d parallel writes]\n", numBytesPerTransfer, srcIdx, numSubExecs, minGpus, maxGpus);
char sep = (ev.outputToCsv ? ',' : ' ');
......
......@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.46"
#define TB_VERSION "1.47"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
......@@ -530,8 +530,7 @@ public:
for (int i = 0; i < numDetectedGpus; i++)
{
#if defined(__NVCC__)
// NOTE: wallClock doesn't exist in CUDA. This may need to be adjusted / run with fixed clocks
wallClockPerDeviceMhz[i] = 1410000;
wallClockPerDeviceMhz[i] = 1000000;
#else
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, i));
......
......@@ -29,6 +29,10 @@ THE SOFTWARE.
#define MEMSET_VAL 13323083.0f
#if defined(__NVCC__)
#define warpSize 32
#endif
#define MAX_WAVEGROUPS MAX_BLOCKSIZE / warpSize
#define MAX_UNROLL 8
#define NUM_WAVEORDERS 6
......@@ -44,7 +48,7 @@ struct SubExecParam
int numDsts; // Number of destination arrays
float* src[MAX_SRCS]; // Source array pointers
float* dst[MAX_DSTS]; // Destination array pointers
uint32_t preferredXccId; // XCC ID to execute on
int32_t preferredXccId; // XCC ID to execute on
// Prepared
int teamSize; // Index of this sub executor amongst team
......@@ -133,6 +137,17 @@ PrepSrcDataKernel(float* ptr, size_t N, int srcBufferIdx)
}
}
// Returns a device-side timestamp used for kernel start/stop timing.
// NVCC (CUDA) path: wall_clock64() does not exist in CUDA, so read the PTX
// %globaltimer special register via inline asm (a 64-bit global timer;
// nanosecond-resolution per the PTX ISA — confirm against target arch).
// HIP (AMD) path: use wall_clock64(), the constant-rate wall-clock counter.
__device__ int64_t GetTimestamp()
{
#if defined(__NVCC__)
  int64_t result;
  // PTX: move the 64-bit %globaltimer value into result.
  asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(result));
  return result;
#else
  return wall_clock64();
#endif
}
// Helper returning the per-type fill value used by the memset kernels.
// Declared only; each supported element type supplies an explicit
// specialization (further specializations may follow outside this hunk).
template <typename T> __device__ __forceinline__ T MemsetVal();
// float specialization: fills with MEMSET_VAL (13323083.0f, defined above).
template <> __device__ __forceinline__ float MemsetVal(){ return MEMSET_VAL; };
......@@ -143,14 +158,16 @@ __global__ void __launch_bounds__(BLOCKSIZE)
GpuReduceKernel(SubExecParam* params, int waveOrder)
{
int64_t startCycle;
if (threadIdx.x == 0) startCycle = wall_clock64();
if (threadIdx.x == 0) startCycle = GetTimestamp();
SubExecParam& p = params[blockIdx.y];
// (Experimental) Filter by XCC if desired
#if !defined(__NVCC__)
int32_t xccId;
GetXccId(xccId);
if (p.preferredXccId != -1 && xccId != p.preferredXccId) return;
#endif
// Collect data information
int32_t const numSrcs = p.numSrcs;
......@@ -168,7 +185,6 @@ __global__ void __launch_bounds__(BLOCKSIZE)
int32_t const tIdx = threadIdx.x % warpSize; // Thread index within wavefront
size_t const numFloat4 = p.N / 4;
int32_t const nFlt4PerWave = warpSize * 4;
int32_t teamStride, waveStride, unrlStride, teamStride2, waveStride2;
switch (waveOrder)
......@@ -266,7 +282,7 @@ __global__ void __launch_bounds__(BLOCKSIZE)
if (threadIdx.x == 0)
{
__threadfence_system();
p.stopCycle = wall_clock64();
p.stopCycle = GetTimestamp();
p.startCycle = startCycle;
p.xccId = xccId;
__trace_hwreg();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment