Unverified Commit cc0e9cb4 authored by gilbertlee-amd, committed by GitHub

TransferBench v1.11 (#9)

* Adding MIMO support, DMA executor, Null memory type
parent 3b47b874
# Changelog for TransferBench
## v1.11
### Added
- New multi-input / multi-output (MIMO) support. Transfers can now reduce (element-wise sum) multiple input memory arrays
  and write the sums to multiple outputs
- New GPU-DMA executor 'D' (uses hipMemcpy for SDMA copies). Previously this was enabled via USE_HIP_CALL, which applied
  to all GPU executors globally; the new executor allows GPU-GFX kernels to run in parallel with GPU-DMA copies.
- GPU-DMA executor can only be used for single-input/single-output Transfers
- GPU-DMA executor can only be associated with one SubExecutor
- Added new "Null" memory type 'N', which represents empty memory. This allows for read-only or write-only Transfers
- Added new GPU_KERNEL environment variable that allows for switching between various GPU-GFX reduction kernels
### Optimized
- Slightly improved GPU-GFX kernel performance based on hardware architecture when running with fewer CUs
### Changed
- Updated the example.cfg file to cover the new features
- Updated output to support MIMO
- Renamed CUs / CPU threads to SubExecutors for consistency
- Sweep Preset:
- Default sweep preset executors now include DMA
- P2P Benchmarks:
- Now only works via "p2p". Removed "p2p_rr", "g2g" and "g2g_rr".
- Setting NUM_CPU_DEVICES=0 can be used to only benchmark GPU devices (like "g2g")
- New environment variable USE_REMOTE_READ replaces "_rr" presets
- New environment variable USE_GPU_DMA=1 replaces USE_HIP_CALL=1 for benchmarking with GPU-DMA Executor
- Number of GPU SubExecutors for benchmark can be specified via NUM_GPU_SE
- Defaults to all CUs for GPU-GFX, 1 for GPU-DMA
- Number of CPU SubExecutors for benchmark can be specified via NUM_CPU_SE
- Pseudo-random input pattern has been slightly adjusted so that each input array within the same Transfer gets a different pattern
### Removed
- USE_HIP_CALL has been removed. Use GPU-DMA executor 'D' or set USE_GPU_DMA=1 for P2P benchmark presets
- Currently, a warning is issued and the program terminates if USE_HIP_CALL is set to 1
- Removed NUM_CPU_PER_TRANSFER - The number of CPU SubExecutors will be whatever is specified for the Transfer
- Removed USE_MEMSET environment variable. This can now be done via a Transfer using the null memory type
## v1.10
### Fixed
- Fix incorrect bandwidth calculation when using single stream mode and per-Transfer data sizes
......
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......@@ -22,66 +22,145 @@ THE SOFTWARE.
#pragma once
#define PackedFloat_t float4
#define WARP_SIZE 64
#define BLOCKSIZE 256
#define FLOATS_PER_PACK (sizeof(PackedFloat_t) / sizeof(float))
#define MEMSET_CHAR 75
#define MEMSET_VAL 13323083.0f
// GPU copy kernel
__global__ void __launch_bounds__(BLOCKSIZE)
GpuCopyKernel(BlockParam* blockParams)
// Each subExecutor is provided with subarrays to work on
#define MAX_SRCS 16
#define MAX_DSTS 16
struct SubExecParam
{
#define PackedFloat_t float4
#define FLOATS_PER_PACK (sizeof(PackedFloat_t) / sizeof(float))
size_t N; // Number of floats this subExecutor works on
int numSrcs; // Number of source arrays
int numDsts; // Number of destination arrays
float* src[MAX_SRCS]; // Source array pointers
float* dst[MAX_DSTS]; // Destination array pointers
long long startCycle; // Start timestamp for in-kernel timing (GPU-GFX executor)
long long stopCycle; // Stop timestamp for in-kernel timing (GPU-GFX executor)
};
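// ---------------------------------------------------------------------------
// Illustrative sketch (not part of TransferBench): how a host-side caller
// might fill one SubExecParam describing a single-source, single-destination
// copy of 'count' floats starting at element 'begin'.  The helper name and
// arguments are hypothetical; the real per-SubExecutor partitioning is done
// elsewhere in TransferBench.
// ---------------------------------------------------------------------------
static inline SubExecParam MakeSingleCopyParamSketch(float* src, float* dst,
                                                     size_t begin, size_t count)
{
  SubExecParam p;
  p.N          = count;        // Number of floats this subExecutor handles
  p.numSrcs    = 1;            // Single input array
  p.numDsts    = 1;            // Single output array
  p.src[0]     = src + begin;  // Subarray this subExecutor reads from
  p.dst[0]     = dst + begin;  // Subarray this subExecutor writes to
  p.startCycle = 0;            // Timestamps are filled in by the GPU-GFX kernel
  p.stopCycle  = 0;
  return p;
}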
// Collect the arguments for this threadblock
int Nrem = blockParams[blockIdx.x].N;
float const* src = blockParams[blockIdx.x].src;
float* dst = blockParams[blockIdx.x].dst;
if (threadIdx.x == 0) blockParams[blockIdx.x].startCycle = __builtin_amdgcn_s_memrealtime();
void CpuReduceKernel(SubExecParam const& p)
{
int const& numSrcs = p.numSrcs;
int const& numDsts = p.numDsts;
if (numSrcs == 0)
{
for (int i = 0; i < numDsts; ++i)
memset((float* __restrict__)p.dst[i], MEMSET_CHAR, p.N * sizeof(float));
}
else if (numSrcs == 1)
{
float const* __restrict__ src = p.src[0];
for (int i = 0; i < numDsts; ++i)
{
memcpy((float* __restrict__)p.dst[i], src, p.N * sizeof(float));
}
}
else
{
for (int j = 0; j < p.N; j++)
{
float sum = p.src[0][j];
for (int i = 1; i < numSrcs; i++) sum += p.src[i][j];
for (int i = 0; i < numDsts; i++) p.dst[i][j] = sum;
}
}
}
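// ---------------------------------------------------------------------------
// Illustrative sketch (not TransferBench's actual dispatch code): a CPU
// executor is assumed to run CpuReduceKernel once per SubExecutor, one CPU
// thread per SubExecParam.  Assumes <thread> and <vector> are available.
// ---------------------------------------------------------------------------
static inline void RunCpuSubExecutorsSketch(std::vector<SubExecParam> const& params)
{
  std::vector<std::thread> threads;
  threads.reserve(params.size());
  for (size_t i = 0; i < params.size(); ++i)
    threads.emplace_back([&params, i] { CpuReduceKernel(params[i]); }); // one thread per SubExecutor
  for (std::thread& t : threads)
    t.join();                                                           // wait for all SubExecutors
}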
// Helper function for memset
template <typename T> __device__ __forceinline__ T MemsetVal();
template <> __device__ __forceinline__ float MemsetVal(){ return MEMSET_VAL; };
template <> __device__ __forceinline__ float4 MemsetVal(){ return make_float4(MEMSET_VAL, MEMSET_VAL, MEMSET_VAL, MEMSET_VAL); }
// GPU reduce kernel 0: 3 loops: unrolled float4s, single float4s, single floats
template <int LOOP1_UNROLL>
__global__ void __launch_bounds__(BLOCKSIZE)
GpuReduceKernel(SubExecParam* params)
{
int64_t startCycle = __builtin_amdgcn_s_memrealtime();
// Operate on wavefront granularity
int numWaves = BLOCKSIZE / WARP_SIZE; // Number of wavefronts per threadblock
int waveId = threadIdx.x / WARP_SIZE; // Wavefront number
int threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront
SubExecParam& p = params[blockIdx.x];
int const numSrcs = p.numSrcs;
int const numDsts = p.numDsts;
int const numWaves = BLOCKSIZE / WARP_SIZE; // Number of wavefronts per threadblock
int const waveId = threadIdx.x / WARP_SIZE; // Wavefront number
int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront
#define LOOP1_UNROLL 8
// 1st loop - each wavefront operates on LOOP1_UNROLL x FLOATS_PER_PACK per thread per iteration
// Determine the number of packed floats processed by the first loop
int const loop1Npack = (Nrem / (FLOATS_PER_PACK * LOOP1_UNROLL * WARP_SIZE)) * (LOOP1_UNROLL * WARP_SIZE);
int const loop1Nelem = loop1Npack * FLOATS_PER_PACK;
int const loop1Inc = BLOCKSIZE * LOOP1_UNROLL;
int loop1Offset = waveId * LOOP1_UNROLL * WARP_SIZE + threadId;
size_t Nrem = p.N;
size_t const loop1Npack = (Nrem / (FLOATS_PER_PACK * LOOP1_UNROLL * WARP_SIZE)) * (LOOP1_UNROLL * WARP_SIZE);
size_t const loop1Nelem = loop1Npack * FLOATS_PER_PACK;
size_t const loop1Inc = BLOCKSIZE * LOOP1_UNROLL;
size_t loop1Offset = waveId * LOOP1_UNROLL * WARP_SIZE + threadId;
PackedFloat_t const* packedSrc = (PackedFloat_t const*)(src) + loop1Offset;
PackedFloat_t* packedDst = (PackedFloat_t *)(dst) + loop1Offset;
while (loop1Offset < loop1Npack)
{
PackedFloat_t vals[LOOP1_UNROLL];
#pragma unroll
for (int u = 0; u < LOOP1_UNROLL; ++u)
vals[u] = *(packedSrc + u * WARP_SIZE);
PackedFloat_t vals[LOOP1_UNROLL] = {};
if (numSrcs == 0)
{
#pragma unroll
for (int u = 0; u < LOOP1_UNROLL; ++u) vals[u] = MemsetVal<float4>();
}
else
{
for (int i = 0; i < numSrcs; ++i)
{
PackedFloat_t const* __restrict__ packedSrc = (PackedFloat_t const*)(p.src[i]) + loop1Offset;
#pragma unroll
for (int u = 0; u < LOOP1_UNROLL; ++u)
*(packedDst + u * WARP_SIZE) = vals[u];
vals[u] += *(packedSrc + u * WARP_SIZE);
}
}
packedSrc += loop1Inc;
packedDst += loop1Inc;
for (int i = 0; i < numDsts; ++i)
{
PackedFloat_t* __restrict__ packedDst = (PackedFloat_t*)(p.dst[i]) + loop1Offset;
#pragma unroll
for (int u = 0; u < LOOP1_UNROLL; ++u) *(packedDst + u * WARP_SIZE) = vals[u];
}
loop1Offset += loop1Inc;
}
Nrem -= loop1Nelem;
if (Nrem > 0)
{
// 2nd loop - Each thread operates on FLOATS_PER_PACK per iteration
int const loop2Npack = Nrem / FLOATS_PER_PACK;
int const loop2Nelem = loop2Npack * FLOATS_PER_PACK;
int const loop2Inc = BLOCKSIZE;
int loop2Offset = threadIdx.x;
// NOTE: int32_t suffices here because the remaining element count is small
int32_t const loop2Npack = Nrem / FLOATS_PER_PACK;
int32_t const loop2Nelem = loop2Npack * FLOATS_PER_PACK;
int32_t const loop2Inc = BLOCKSIZE;
int32_t loop2Offset = threadIdx.x;
packedSrc = (PackedFloat_t const*)(src + loop1Nelem);
packedDst = (PackedFloat_t *)(dst + loop1Nelem);
while (loop2Offset < loop2Npack)
{
packedDst[loop2Offset] = packedSrc[loop2Offset];
PackedFloat_t val;
if (numSrcs == 0)
{
val = MemsetVal<float4>();
}
else
{
val = {};
for (int i = 0; i < numSrcs; ++i)
{
PackedFloat_t const* __restrict__ packedSrc = (PackedFloat_t const*)(p.src[i] + loop1Nelem) + loop2Offset;
val += *packedSrc;
}
}
for (int i = 0; i < numDsts; ++i)
{
PackedFloat_t* __restrict__ packedDst = (PackedFloat_t*)(p.dst[i] + loop1Nelem) + loop2Offset;
*packedDst = val;
}
loop2Offset += loop2Inc;
}
Nrem -= loop2Nelem;
......@@ -90,40 +169,221 @@ GpuCopyKernel(BlockParam* blockParams)
if (threadIdx.x < Nrem)
{
int offset = loop1Nelem + loop2Nelem + threadIdx.x;
dst[offset] = src[offset];
float val = 0;
if (numSrcs == 0)
{
val = MEMSET_VAL;
}
else
{
for (int i = 0; i < numSrcs; ++i)
val += ((float const* __restrict__)p.src[i])[offset];
}
for (int i = 0; i < numDsts; ++i)
((float* __restrict__)p.dst[i])[offset] = val;
}
}
__threadfence_system();
__syncthreads();
if (threadIdx.x == 0)
blockParams[blockIdx.x].stopCycle = __builtin_amdgcn_s_memrealtime();
{
p.startCycle = startCycle;
p.stopCycle = __builtin_amdgcn_s_memrealtime();
}
}
#define MEMSET_UNROLL 8
__global__ void __launch_bounds__(BLOCKSIZE)
GpuMemsetKernel(BlockParam* blockParams)
template <typename FLOAT_TYPE, int UNROLL_FACTOR>
__device__ size_t GpuReduceFuncImpl2(SubExecParam const &p, size_t const offset, size_t const N)
{
// Collect the arguments for this block
int N = blockParams[blockIdx.x].N;
float* __restrict__ dst = (float*)blockParams[blockIdx.x].dst;
int constexpr numFloatsPerPack = sizeof(FLOAT_TYPE) / sizeof(float); // Number of floats handled at a time per thread
int constexpr numWaves = BLOCKSIZE / WARP_SIZE; // Number of wavefronts per threadblock
size_t constexpr loopPackInc = BLOCKSIZE * UNROLL_FACTOR;
size_t constexpr numPacksPerWave = WARP_SIZE * UNROLL_FACTOR;
int const waveId = threadIdx.x / WARP_SIZE; // Wavefront number
int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront
int const numSrcs = p.numSrcs;
int const numDsts = p.numDsts;
size_t const numPacksDone = (numFloatsPerPack == 1 && UNROLL_FACTOR == 1) ? N : (N / (FLOATS_PER_PACK * numPacksPerWave)) * numPacksPerWave;
size_t const numFloatsLeft = N - numPacksDone * numFloatsPerPack;
size_t loopPackOffset = waveId * numPacksPerWave + threadId;
while (loopPackOffset < numPacksDone)
{
FLOAT_TYPE vals[UNROLL_FACTOR];
if (numSrcs == 0)
{
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u) vals[u] = MemsetVal<FLOAT_TYPE>();
}
else
{
FLOAT_TYPE const* __restrict__ src0Ptr = ((FLOAT_TYPE const*)(p.src[0] + offset)) + loopPackOffset;
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
vals[u] = *(src0Ptr + u * WARP_SIZE);
// Use non-zero value
#pragma unroll MEMSET_UNROLL
for (int tid = threadIdx.x; tid < N; tid += BLOCKSIZE)
for (int i = 1; i < numSrcs; ++i)
{
dst[tid] = 1234.0;
FLOAT_TYPE const* __restrict__ srcPtr = ((FLOAT_TYPE const*)(p.src[i] + offset)) + loopPackOffset;
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
vals[u] += *(srcPtr + u * WARP_SIZE);
}
}
for (int i = 0; i < numDsts; ++i)
{
FLOAT_TYPE* __restrict__ dstPtr = (FLOAT_TYPE*)(p.dst[i] + offset) + loopPackOffset;
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
*(dstPtr + u * WARP_SIZE) = vals[u];
}
loopPackOffset += loopPackInc;
}
return numFloatsLeft;
}
// CPU copy kernel
void CpuCopyKernel(BlockParam const& blockParams)
template <typename FLOAT_TYPE, int UNROLL_FACTOR>
__device__ size_t GpuReduceFuncImpl(SubExecParam const &p, size_t const offset, size_t const N)
{
memcpy(blockParams.dst, blockParams.src, blockParams.N * sizeof(float));
// Each thread in the block works on UNROLL_FACTOR FLOAT_TYPEs during each iteration of the loop
int constexpr numFloatsPerRead = sizeof(FLOAT_TYPE) / sizeof(float);
size_t constexpr numFloatsPerInnerLoop = BLOCKSIZE * numFloatsPerRead;
size_t constexpr numFloatsPerOuterLoop = numFloatsPerInnerLoop * UNROLL_FACTOR;
size_t const numFloatsLeft = (numFloatsPerRead == 1 && UNROLL_FACTOR == 1) ? 0 : N % numFloatsPerOuterLoop;
size_t const numFloatsDone = N - numFloatsLeft;
int const numSrcs = p.numSrcs;
int const numDsts = p.numDsts;
for (size_t idx = threadIdx.x * numFloatsPerRead; idx < numFloatsDone; idx += numFloatsPerOuterLoop)
{
FLOAT_TYPE tmp[UNROLL_FACTOR];
if (numSrcs == 0)
{
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
tmp[u] = MemsetVal<FLOAT_TYPE>();
}
else
{
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
tmp[u] = *((FLOAT_TYPE*)(&p.src[0][offset + idx + u * numFloatsPerInnerLoop]));
for (int i = 1; i < numSrcs; ++i)
{
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
tmp[u] += *((FLOAT_TYPE*)(&p.src[i][offset + idx + u * numFloatsPerInnerLoop]));
}
}
for (int i = 0; i < numDsts; ++i)
{
for (int u = 0; u < UNROLL_FACTOR; ++u)
{
*((FLOAT_TYPE*)(&p.dst[i][offset + idx + u * numFloatsPerInnerLoop])) = tmp[u];
}
}
}
return numFloatsLeft;
}
template <typename FLOAT_TYPE>
__device__ size_t GpuReduceFunc(SubExecParam const &p, size_t const offset, size_t const N, int const unroll)
{
switch (unroll)
{
case 1: return GpuReduceFuncImpl<FLOAT_TYPE, 1>(p, offset, N);
case 2: return GpuReduceFuncImpl<FLOAT_TYPE, 2>(p, offset, N);
case 3: return GpuReduceFuncImpl<FLOAT_TYPE, 3>(p, offset, N);
case 4: return GpuReduceFuncImpl<FLOAT_TYPE, 4>(p, offset, N);
case 5: return GpuReduceFuncImpl<FLOAT_TYPE, 5>(p, offset, N);
case 6: return GpuReduceFuncImpl<FLOAT_TYPE, 6>(p, offset, N);
case 7: return GpuReduceFuncImpl<FLOAT_TYPE, 7>(p, offset, N);
case 8: return GpuReduceFuncImpl<FLOAT_TYPE, 8>(p, offset, N);
case 9: return GpuReduceFuncImpl<FLOAT_TYPE, 9>(p, offset, N);
case 10: return GpuReduceFuncImpl<FLOAT_TYPE, 10>(p, offset, N);
case 11: return GpuReduceFuncImpl<FLOAT_TYPE, 11>(p, offset, N);
case 12: return GpuReduceFuncImpl<FLOAT_TYPE, 12>(p, offset, N);
case 13: return GpuReduceFuncImpl<FLOAT_TYPE, 13>(p, offset, N);
case 14: return GpuReduceFuncImpl<FLOAT_TYPE, 14>(p, offset, N);
case 15: return GpuReduceFuncImpl<FLOAT_TYPE, 15>(p, offset, N);
case 16: return GpuReduceFuncImpl<FLOAT_TYPE, 16>(p, offset, N);
default: return GpuReduceFuncImpl<FLOAT_TYPE, 1>(p, offset, N);
}
}
// CPU memset kernel
void CpuMemsetKernel(BlockParam const& blockParams)
// GPU copy kernel
__global__ void __launch_bounds__(BLOCKSIZE)
GpuReduceKernel2(SubExecParam* params)
{
for (int i = 0; i < blockParams.N; i++)
blockParams.dst[i] = 1234.0;
int64_t startCycle = __builtin_amdgcn_s_memrealtime();
SubExecParam& p = params[blockIdx.x];
size_t numFloatsLeft = GpuReduceFunc<float4>(p, 0, p.N, 8);
if (numFloatsLeft)
numFloatsLeft = GpuReduceFunc<float4>(p, p.N - numFloatsLeft, numFloatsLeft, 1);
if (numFloatsLeft)
GpuReduceFunc<float>(p, p.N - numFloatsLeft, numFloatsLeft, 1);
__threadfence_system();
if (threadIdx.x == 0)
{
p.startCycle = startCycle;
p.stopCycle = __builtin_amdgcn_s_memrealtime();
}
}
#define NUM_GPU_KERNELS 18
typedef void (*GpuKernelFuncPtr)(SubExecParam*);
GpuKernelFuncPtr GpuKernelTable[NUM_GPU_KERNELS] =
{
GpuReduceKernel<8>,
GpuReduceKernel<1>,
GpuReduceKernel<2>,
GpuReduceKernel<3>,
GpuReduceKernel<4>,
GpuReduceKernel<5>,
GpuReduceKernel<6>,
GpuReduceKernel<7>,
GpuReduceKernel<8>,
GpuReduceKernel<9>,
GpuReduceKernel<10>,
GpuReduceKernel<11>,
GpuReduceKernel<12>,
GpuReduceKernel<13>,
GpuReduceKernel<14>,
GpuReduceKernel<15>,
GpuReduceKernel<16>,
GpuReduceKernel2
};
std::string GpuKernelNames[NUM_GPU_KERNELS] =
{
"Default - 8xUnroll",
"Unroll x1",
"Unroll x2",
"Unroll x3",
"Unroll x4",
"Unroll x5",
"Unroll x6",
"Unroll x7",
"Unroll x8",
"Unroll x9",
"Unroll x10",
"Unroll x11",
"Unroll x12",
"Unroll x13",
"Unroll x14",
"Unroll x15",
"Unroll x16",
"8xUnrollB",
};
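// ---------------------------------------------------------------------------
// Illustrative sketch (not TransferBench's actual launch path): the GPU_KERNEL
// environment variable is expected to select an index into GpuKernelTable.
// One possible way to launch the selected GPU-GFX kernel, with one threadblock
// per SubExecutor, is shown below.  'gpuKernelIdx', 'numSubExecs', 'paramsGpu'
// and 'stream' are assumed to be supplied by the caller.
// ---------------------------------------------------------------------------
static inline void LaunchSelectedGpuKernelSketch(int gpuKernelIdx, int numSubExecs,
                                                 SubExecParam* paramsGpu,
                                                 hipStream_t stream)
{
  // Fall back to the default kernel (index 0) for out-of-range selections
  if (gpuKernelIdx < 0 || gpuKernelIdx >= NUM_GPU_KERNELS) gpuKernelIdx = 0;

  // One threadblock of BLOCKSIZE threads per SubExecutor (CU)
  hipLaunchKernelGGL(GpuKernelTable[gpuKernelIdx],
                     dim3(numSubExecs), dim3(BLOCKSIZE), 0, stream,
                     paramsGpu);
}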
Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......
# Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
ROCM_PATH ?= /opt/rocm
HIPCC=$(ROCM_PATH)/bin/hipcc
EXE=TransferBench
CXXFLAGS = -O3 -I. -lnuma -L$(ROCM_PATH)/hsa/lib -lhsa-runtime64
CXXFLAGS = -O3 -I. -lnuma -L$(ROCM_PATH)/hsa/lib -lhsa-runtime64 -ferror-limit=5
all: $(EXE)
......
/*
Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......@@ -35,20 +35,20 @@ THE SOFTWARE.
#include <hip/hip_ext.h>
#include <hsa/hsa_ext_amd.h>
#include "EnvVars.hpp"
// Helper macro for catching HIP errors
#define HIP_CALL(cmd) \
do { \
hipError_t error = (cmd); \
if (error != hipSuccess) \
{ \
std::cerr << "Encountered HIP error (" << hipGetErrorString(error) << ") at line " \
<< __LINE__ << " in file " << __FILE__ << "\n"; \
std::cerr << "Encountered HIP error (" << hipGetErrorString(error) \
<< ") at line " << __LINE__ << " in file " << __FILE__ << "\n"; \
exit(-1); \
} \
} while (0)
#include "EnvVars.hpp"
// Simple configuration parameters
size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<26); // Amount of data transferred per Transfer
......@@ -59,92 +59,92 @@ typedef enum
MEM_GPU = 1, // Coarse-grained global GPU memory
MEM_CPU_FINE = 2, // Fine-grained pinned CPU memory
MEM_GPU_FINE = 3, // Fine-grained global GPU memory
MEM_CPU_UNPINNED = 4 // Unpinned CPU memory
MEM_CPU_UNPINNED = 4, // Unpinned CPU memory
MEM_NULL = 5, // NULL memory - used for empty
} MemType;
bool IsGpuType(MemType m)
{
return (m == MEM_GPU || m == MEM_GPU_FINE);
}
bool IsCpuType(MemType m)
typedef enum
{
return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED);
}
EXE_CPU = 0, // CPU executor (subExecutor = CPU thread)
EXE_GPU_GFX = 1, // GPU kernel-based executor (subExecutor = threadblock/CU)
EXE_GPU_DMA = 2, // GPU SDMA-based executor (subExecutor = streams)
} ExeType;
bool IsGpuType(MemType m) { return (m == MEM_GPU || m == MEM_GPU_FINE); }
bool IsCpuType(MemType m) { return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED); };
bool IsGpuType(ExeType e) { return (e == EXE_GPU_GFX || e == EXE_GPU_DMA); };
bool IsCpuType(ExeType e) { return (e == EXE_CPU); };
char const MemTypeStr[6] = "CGBFU";
char const MemTypeStr[7] = "CGBFUN";
char const ExeTypeStr[4] = "CGD";
char const ExeTypeName[3][4] = {"CPU", "GPU", "DMA"};
MemType inline CharToMemType(char const c)
{
switch (c)
{
case 'C': return MEM_CPU;
case 'G': return MEM_GPU;
case 'B': return MEM_CPU_FINE;
case 'F': return MEM_GPU_FINE;
case 'U': return MEM_CPU_UNPINNED;
default:
printf("[ERROR] Unexpected mem type (%c)\n", c);
char const* val = strchr(MemTypeStr, toupper(c));
if (val) return (MemType)(val - MemTypeStr);
printf("[ERROR] Unexpected memory type (%c)\n", c);
exit(1);
}
}
typedef enum
{
MODE_FILL = 0, // Fill data with pattern
MODE_CHECK = 1 // Check data against pattern
} ModeType;
// Each threadblock copies N floats from src to dst
struct BlockParam
ExeType inline CharToExeType(char const c)
{
int N;
float* src;
float* dst;
long long startCycle;
long long stopCycle;
};
char const* val = strchr(ExeTypeStr, toupper(c));
if (val) return (ExeType)(val - ExeTypeStr);
printf("[ERROR] Unexpected executor type (%c)\n", c);
exit(1);
}
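// ---------------------------------------------------------------------------
// Illustrative sketch (not the actual ParseMemType implementation): a memory
// token such as "G0" or "C1G0" is a sequence of (type character, device index)
// pairs, so it can be decoded by pairing CharToMemType with the digits that
// follow each character.  Assumes <cctype>, <string> and <vector> are available.
// ---------------------------------------------------------------------------
inline void ParseMemTokenSketch(std::string const& token,
                                std::vector<MemType>& memTypes,
                                std::vector<int>& memIndices)
{
  size_t i = 0;
  while (i < token.size())
  {
    MemType const type = CharToMemType(token[i++]);   // e.g. 'G' -> MEM_GPU
    int index = 0;
    while (i < token.size() && isdigit(static_cast<unsigned char>(token[i])))
      index = index * 10 + (token[i++] - '0');        // e.g. "12" -> device 12
    memTypes.push_back(type);
    memIndices.push_back(index);
  }
}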
// Each Transfer is a uni-direction operation from a src memory to dst memory
// Each Transfer performs reads from source memory location(s), sums them (if multiple sources are specified)
// then writes the summation to each of the specified destination memory location(s)
struct Transfer
{
int transferIndex; // Transfer identifier
// Transfer config
MemType exeMemType; // Transfer executor type (CPU or GPU)
int transferIndex; // Transfer identifier (within a Test)
ExeType exeType; // Transfer executor type
int exeIndex; // Executor index (NUMA node for CPU / device ID for GPU)
MemType srcMemType; // Source memory type
int srcIndex; // Source device index
MemType dstMemType; // Destination memory type
int dstIndex; // Destination device index
int numBlocksToUse; // Number of threadblocks to use for this Transfer
size_t numBytes; // Number of bytes to Transfer
size_t numBytesToCopy; // Number of bytes to copy
// Memory
float* srcMem; // Source memory
float* dstMem; // Destination memory
// How memory is split across threadblocks / CPU cores
std::vector<BlockParam> blockParam;
BlockParam* blockParamGpuPtr;
int numSubExecs; // Number of subExecutors to use for this Transfer
size_t numBytes; // # of bytes requested to Transfer (may be 0 to fallback to default)
size_t numBytesActual; // Actual number of bytes to copy
double transferTime; // Time taken in milliseconds
// Results
double transferTime;
int numSrcs; // Number of sources
std::vector<MemType> srcType; // Source memory types
std::vector<int> srcIndex; // Source device indices
std::vector<float*> srcMem; // Source memory
// Prepares src memory and how to divide N elements across threadblocks/threads
void PrepareBlockParams(EnvVars const& ev, size_t const N);
};
int numDsts; // Number of destinations
std::vector<MemType> dstType; // Destination memory types
std::vector<int> dstIndex; // Destination device indices
std::vector<float*> dstMem; // Destination memory
std::vector<SubExecParam> subExecParam; // Defines subarrays assigned to each threadblock
SubExecParam* subExecParamGpuPtr; // Pointer to GPU copy of subExecParam
typedef std::pair<MemType, int> Executor;
// Prepares src/dst subarray pointers for each SubExecutor
void PrepareSubExecParams(EnvVars const& ev);
// Prepare source arrays with input data
void PrepareSrc(EnvVars const& ev);
// Validate that destination data contains expected results
void ValidateDst(EnvVars const& ev);
// Prepare reference buffers
void PrepareReference(EnvVars const& ev, std::vector<float>& buffer, int bufferIdx);
// String representation functions
std::string SrcToStr() const;
std::string DstToStr() const;
};
struct ExecutorInfo
{
std::vector<Transfer*> transfers; // Transfers to execute
size_t totalBytes; // Total bytes this executor transfers
int totalSubExecs; // Total number of subExecutors to use
// For GPU-Executors
int totalBlocks; // Total number of CUs/CPU threads to use
BlockParam* blockParamGpu; // Copy of block parameters in GPU device memory
SubExecParam* subExecParamGpu; // GPU copy of subExecutor parameters
std::vector<hipStream_t> streams;
std::vector<hipEvent_t> startEvents;
std::vector<hipEvent_t> stopEvents;
......@@ -153,6 +153,7 @@ struct ExecutorInfo
double totalTime;
};
typedef std::pair<ExeType, int> Executor;
typedef std::map<Executor, ExecutorInfo> TransferMap;
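// ---------------------------------------------------------------------------
// Illustrative sketch (not TransferBench's actual setup code): Transfers that
// share the same (executor type, executor index) pair are assumed to be
// grouped into a single ExecutorInfo so they can be launched together by that
// executor.  std::map::operator[] value-initializes a new ExecutorInfo, so the
// aggregate counters start at zero.
// ---------------------------------------------------------------------------
inline void GroupTransfersSketch(std::vector<Transfer>& transfers, TransferMap& transferMap)
{
  for (Transfer& t : transfers)
  {
    Executor const exe(t.exeType, t.exeIndex);   // e.g. (EXE_GPU_GFX, 0)
    ExecutorInfo& info = transferMap[exe];       // created on first use
    info.transfers.push_back(&t);
    info.totalBytes    += t.numBytesActual;      // aggregate bytes for this executor
    info.totalSubExecs += t.numSubExecs;         // aggregate SubExecutors for this executor
  }
}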
// Display usage instructions
......@@ -166,7 +167,9 @@ void PopulateTestSizes(size_t const numBytesPerTransfer, int const samplingFacto
std::vector<size_t>& valuesofN);
void ParseMemType(std::string const& token, int const numCpus, int const numGpus,
MemType* memType, int* memIndex);
std::vector<MemType>& memType, std::vector<int>& memIndex);
void ParseExeType(std::string const& token, int const numCpus, int const numGpus,
ExeType& exeType, int& exeIndex);
void ParseTransfers(char* line, int numCpus, int numGpus,
std::vector<Transfer>& transfers);
......@@ -178,26 +181,19 @@ void EnablePeerAccess(int const deviceId, int const peerDeviceId);
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr);
void DeallocateMemory(MemType memType, void* memPtr, size_t const size = 0);
void CheckPages(char* byteArray, size_t numBytes, int targetId);
void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr);
void RunTransfer(EnvVars const& ev, int const iteration, ExecutorInfo& exeInfo, int const transferIdx);
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu);
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numBlocksToUse, bool const isRandom);
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N);
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExec, int const numCpuSubExec, bool const isRandom);
// Return the maximum bandwidth measured for given (src/dst) pair
double GetPeakBandwidth(EnvVars const& ev,
size_t const N,
double GetPeakBandwidth(EnvVars const& ev, size_t const N,
int const isBidirectional,
int const readMode,
int const numBlocksToUse,
MemType const srcMemType,
int const srcIndex,
MemType const dstMemType,
int const dstIndex);
MemType const srcType, int const srcIndex,
MemType const dstType, int const dstIndex);
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
std::string GetDesc(MemType srcMemType, int srcIndex,
MemType dstMemType, int dstIndex);
std::string GetTransferDesc(Transfer const& transfer);
int RemappedIndex(int const origIdx, MemType const memType);
int RemappedIndex(int const origIdx, bool const isCpuType);
int GetWallClockRate(int deviceId);
void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers);
std::string PtrVectorToStr(std::vector<float*> const& strVector, int const initOffset);
# ConfigFile Format:
# ==================
# A Transfer is defined as a uni-directional copy from src memory location to dst memory location
# executed by either CPU or GPU
# A Transfer is defined as a single operation where an Executor reads and adds together
# values from Source (SRC) memory locations, then writes the sum to destination (DST) memory locations.
# This reduces to a simple copy operation when dealing with a single SRC/DST.
#
# SRC 0 DST 0
# SRC 1 -> Executor -> DST 1
# SRC X DST Y
# Three Executors are supported by TransferBench
# Executor: SubExecutor:
# 1) CPU CPU thread
# 2) GPU GPU threadblock/Compute Unit (CU)
# 3) DMA N/A (may only be used for copies with a single SRC/DST)
# Each line in the configuration file defines a set of Transfers (a Test) to run in parallel
# There are two ways to specify a Test:
# 1) Basic
# The basic specification assumes the same number of threadblocks/CUs used per GPU-executed Transfer
# The basic specification assumes the same number of SubExecutors (SE) used per Transfer
# A positive number of Transfers is specified followed by that number of triplets describing each Transfer
# #Transfers #CUs (srcMem1->Executor1->dstMem1) ... (srcMemL->ExecutorL->dstMemL)
# #Transfers #SEs (srcMem1->Executor1->dstMem1) ... (srcMemL->ExecutorL->dstMemL)
# 2) Advanced
# A negative number of Transfers is specified, followed by quintuplets describing each Transfer
# A non-zero byte count overrides the command-line specified size for that Transfer
# -#Transfers (srcMem1->Executor1->dstMem1 #CUs1 Bytes1) ... (srcMemL->ExecutorL->dstMemL #CUsL BytesL)
# -#Transfers (srcMem1->Executor1->dstMem1 #SEs1 Bytes1) ... (srcMemL->ExecutorL->dstMemL #SEsL BytesL)
# Argument Details:
# #Transfers: Number of Transfers to be run in parallel
# #CUs : Number of threadblocks/CUs to use for a GPU-executed Transfer
# srcMemL : Source memory location (Where the data is to be read from). Ignored in memset mode
# #SEs : Number of SubExecutors to use (CPU threads / GPU threadblocks)
# srcMemL : Source memory locations (Where the data is to be read from)
# Executor : Executor is specified by a character indicating type, followed by device index (0-indexed)
# - C: CPU-executed (Indexed from 0 to # NUMA nodes - 1)
# - G: GPU-executed (Indexed from 0 to # GPUs - 1)
# dstMemL : Destination memory location (Where the data is to be written to)
# - D: DMA-executor (Indexed from 0 to # GPUs - 1)
# dstMemL : Destination memory locations (Where the data is to be written to)
# bytesL : Number of bytes to copy (0 means use command-line specified size)
# Must be a multiple of 4 and may be suffixed with ('K','M', or 'G')
#
# Memory locations are specified by a character indicating memory type,
# followed by device index (0-indexed)
# Memory locations are specified by one or more (memory type character + device index) pairs
# Each pair is a character indicating the memory type followed by a device index (0-indexed)
# Supported memory locations are:
# - C: Pinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])
# - U: Unpinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])
# - B: Fine-grain host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])
# - G: Global device memory (on GPU device indexed from 0 to [# GPUs - 1])
# - F: Fine-grain device memory (on GPU device indexed from 0 to [# GPUs - 1])
# - N: Null memory (index ignored)
# Examples:
# 1 4 (G0->G0->G1) Uses 4 CUs on GPU0 to copy from GPU0 to GPU1
# 1 4 (C1->G2->G0) Uses 4 CUs on GPU2 to copy from CPU1 to GPU0
# 2 4 G0->G0->G1 G1->G1->G0 Copies from GPU0 to GPU1, and GPU1 to GPU0, each with 4 CUs
# -2 (G0 G0 G1 4 1M) (G1 G1 G0 2 2M) Copies 1MB from GPU0 to GPU1 with 4 CUs, and 2MB from GPU1 to GPU0 with 2 CUs
# 2 4 G0->G0->G1 G1->G1->G0 Copies from GPU0 to GPU1, and GPU1 to GPU0, each with 4 SEs
# -2 (G0 G0 G1 4 1M) (G1 G1 G0 2 2M) Copies 1MB from GPU0 to GPU1 with 4 SEs, and 2MB from GPU1 to GPU0 with 2 SEs
# Round brackets and arrows '->' may be included for clarity, but they are ignored and unnecessary
# Lines starting with # will be ignored. Lines starting with ## will be echoed to output
# Single GPU-executed Transfer between GPUs 0 and 1 using 4 CUs
## Single GPU-executed Transfer between GPUs 0 and 1 using 4 CUs
1 4 (G0->G0->G1)
# Copies 1MB from GPU0 to GPU1 with 4 CUs, and 2MB from GPU1 to GPU0 with 8 CUs
## Single DMA executed Transfer between GPUs 0 and 1
1 1 (G0->D0->G1)
## Copy 1MB from GPU0 to GPU1 with 4 CUs, and 2MB from GPU1 to GPU0 with 8 CUs
-2 (G0->G0->G1 4 1M) (G1->G1->G0 8 2M)
## "Memset" by GPU 0 to GPU 0 memory
1 32 (N0->G0->G0)
## "Read-only" by CPU 0
1 4 (C0->C0->N0)
## Broadcast from GPU 0 to GPU 0 and GPU 1
1 16 (G0->G0->G0G1)