Unverified Commit e6f64e97 authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

Source Prep Kernel / Better NVCC support (#18)

* Adding source prep kernel (USE_PREP_KERNEL)
* Adding nvcc-only compilation path
* Fix for NVIDIA - set shared mem usage to 0 by default
* Updating default fill pattern for source data
* Restoring missing example.cfg file
parent 02ca0266
# Changelog for TransferBench # Changelog for TransferBench
## v1.17
### Added
- Allow switch to GFX kernel for source array initialization (USE_PREP_KERNEL)
- USE_PREP_KERNEL cannot be used with FILL_PATTERN
- Adding ability to compile with nvcc only (TransferBenchCuda)
### Changed
- Default pattern set to [Element i = ((i * 517) modulo 383 + 31) * (srcBufferIdx + 1)]
### Fixed
- Re-adding example.cfg file
## v1.16 ## v1.16
### Added ### Added
- Additional src array validation during preparation - Additional src array validation during preparation
......
...@@ -37,13 +37,18 @@ $ make ...@@ -37,13 +37,18 @@ $ make
## NVIDIA platform support ## NVIDIA platform support
TransferBench may also be built to run on NVIDIA platforms via HIP, but requires a HIP-compatible CUDA version installed (e.g. CUDA 11.5) TransferBench may also be built to run on NVIDIA platforms either via HIP, or native nvcc
To build: To build with HIP for NVIDIA (requires HIP-compatible CUDA version installed e.g. CUDA 11.5):
``` ```
CUDA_PATH=<path_to_CUDA> HIP_PLATFORM=nvidia make` CUDA_PATH=<path_to_CUDA> HIP_PLATFORM=nvidia make`
``` ```
To build with native nvcc: (Builds TransferBenchCuda)
```
make
```
## Hints and suggestions ## Hints and suggestions
- Running TransferBench with no arguments will display usage instructions and detected topology information - Running TransferBench with no arguments will display usage instructions and detected topology information
- There are several preset configurations that can be used instead of a configuration file - There are several preset configurations that can be used instead of a configuration file
......
# ConfigFile Format:
# ==================
# A Transfer is defined as a single operation where an Executor reads and adds together
# values from Source (SRC) memory locations, then writes the sum to destination (DST) memory locations.
# This simplifies to a simple copy operation when dealing with single SRC/DST.
#
# SRC 0 DST 0
# SRC 1 -> Executor -> DST 1
# SRC X DST Y
# Three Executors are supported by TransferBench
# Executor: SubExecutor:
# 1) CPU CPU thread
# 2) GPU GPU threadblock/Compute Unit (CU)
# 3) DMA N/A. (May only be used for copies (single SRC/DST))
# Each single line in the configuration file defines a set of Transfers (a Test) to run in parallel
# There are two ways to specify a Test:
# 1) Basic
# The basic specification assumes the same number of SubExecutors (SE) used per Transfer
# A positive number of Transfers is specified followed by that number of triplets describing each Transfer
# #Transfers #SEs (srcMem1->Executor1->dstMem1) ... (srcMemL->ExecutorL->dstMemL)
# 2) Advanced
# A negative number of Transfers is specified, followed by quintuplets describing each Transfer
# A non-zero number of bytes specified will override any provided value
# -#Transfers (srcMem1->Executor1->dstMem1 #SEs1 Bytes1) ... (srcMemL->ExecutorL->dstMemL #SEsL BytesL)
# Argument Details:
# #Transfers: Number of Transfers to be run in parallel
# #SEs : Number of SubExecutors to use (CPU threads/ GPU threadblocks)
# srcMemL : Source memory locations (Where the data is to be read from)
# Executor : Executor is specified by a character indicating type, followed by device index (0-indexed)
# - C: CPU-executed (Indexed from 0 to # NUMA nodes - 1)
# - G: GPU-executed (Indexed from 0 to # GPUs - 1)
# - D: DMA-executor (Indexed from 0 to # GPUs - 1)
# dstMemL : Destination memory locations (Where the data is to be written to)
# bytesL : Number of bytes to copy (0 means use command-line specified size)
# Must be a multiple of 4 and may be suffixed with ('K','M', or 'G')
#
# Memory locations are specified by one or more (device character / device index) pairs
# Character indicating memory type followed by device index (0-indexed)
# Supported memory locations are:
# - C: Pinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])
# - U: Unpinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])
# - B: Fine-grain host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])
# - G: Global device memory (on GPU device indexed from 0 to [# GPUs - 1])
# - F: Fine-grain device memory (on GPU device indexed from 0 to [# GPUs - 1])
# - N: Null memory (index ignored)
# Examples:
# 1 4 (G0->G0->G1) Uses 4 CUs on GPU0 to copy from GPU0 to GPU1
# 1 4 (C1->G2->G0) Uses 4 CUs on GPU2 to copy from CPU1 to GPU0
# 2 4 G0->G0->G1 G1->G1->G0 Copies from GPU0 to GPU1, and GPU1 to GPU0, each with 4 SEs
# -2 (G0 G0 G1 4 1M) (G1 G1 G0 2 2M) Copies 1Mb from GPU0 to GPU1 with 4 SEs, and 2Mb from GPU1 to GPU0 with 2 SEs
# Round brackets and arrows '->' may be included for human clarity, but will be ignored and are unnecessary
# Lines starting with # will be ignored. Lines starting with ## will be echoed to output
## Single GPU-executed Transfer between GPUs 0 and 1 using 4 CUs
1 4 (G0->G0->G1)
## Single DMA executed Transfer between GPUs 0 and 1
1 1 (G0->D0->G1)
## Copy 1Mb from GPU0 to GPU1 with 4 CUs, and 2Mb from GPU1 to GPU0 with 8 CUs
-2 (G0->G0->G1 4 1M) (G1->G1->G0 8 2M)
## "Memset" by GPU 0 to GPU 0 memory
1 32 (N0->G0->G0)
## "Read-only" by CPU 0
1 4 (C0->C0->N0)
## Broadcast from GPU 0 to GPU 0 and GPU 1
1 16 (G0->G0->G0G1)
# Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. # Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
ROCM_PATH ?= /opt/rocm ROCM_PATH ?= /opt/rocm
CUDA_PATH ?= /usr/local/cuda
HIPCC=$(ROCM_PATH)/bin/hipcc HIPCC=$(ROCM_PATH)/bin/hipcc
NVCC=$(CUDA_PATH)/bin/nvcc
# Compile TransferBenchCuda if nvcc detected
ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
EXE=TransferBenchCuda
else
EXE=TransferBench
endif
EXE=TransferBench CXXFLAGS = -O3 -Iinclude -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
CXXFLAGS = -O3 -I. -Iinclude -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64 NVFLAGS = -O3 -g -Iinclude -x cu -lnuma -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_75,code=sm_75
LDFLAGS += -lpthread LDFLAGS += -lpthread
all: $(EXE) all: $(EXE)
$(EXE): $(EXE).cpp $(shell find -regex ".*\.\hpp") TransferBench: TransferBench.cpp $(shell find -regex ".*\.\hpp")
$(HIPCC) $(CXXFLAGS) $< -o ../$@ $(LDFLAGS) $(HIPCC) $(CXXFLAGS) $< -o ../$@ $(LDFLAGS)
TransferBenchCuda: TransferBench.cpp $(shell find -regex ".*\.\hpp")
$(NVCC) $(NVFLAGS) $< -o ../$@ $(LDFLAGS)
clean: clean:
rm -f *.o ../$(EXE) rm -f *.o ../TransferBench ../TransferBenchCuda
...@@ -240,6 +240,7 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -240,6 +240,7 @@ void ExecuteTransfers(EnvVars const& ev,
if (verbose && !ev.outputToCsv) printf("Test %d:\n", testNum); if (verbose && !ev.outputToCsv) printf("Test %d:\n", testNum);
// Prepare input memory and block parameters for current N // Prepare input memory and block parameters for current N
bool isSrcCorrect = true;
for (auto& exeInfoPair : transferMap) for (auto& exeInfoPair : transferMap)
{ {
ExecutorInfo& exeInfo = exeInfoPair.second; ExecutorInfo& exeInfo = exeInfoPair.second;
...@@ -251,7 +252,7 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -251,7 +252,7 @@ void ExecuteTransfers(EnvVars const& ev,
// Prepare subarrays each threadblock works on and fill src memory with patterned data // Prepare subarrays each threadblock works on and fill src memory with patterned data
Transfer* transfer = exeInfo.transfers[i]; Transfer* transfer = exeInfo.transfers[i];
transfer->PrepareSubExecParams(ev); transfer->PrepareSubExecParams(ev);
transfer->PrepareSrc(ev); isSrcCorrect &= transfer->PrepareSrc(ev);
exeInfo.totalBytes += transfer->numBytesActual; exeInfo.totalBytes += transfer->numBytesActual;
// Copy block parameters to GPU for GPU executors // Copy block parameters to GPU for GPU executors
...@@ -271,7 +272,7 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -271,7 +272,7 @@ void ExecuteTransfers(EnvVars const& ev,
double totalCpuTime = 0; double totalCpuTime = 0;
size_t numTimedIterations = 0; size_t numTimedIterations = 0;
std::stack<std::thread> threads; std::stack<std::thread> threads;
for (int iteration = -ev.numWarmups; ; iteration++) for (int iteration = -ev.numWarmups; isSrcCorrect; iteration++)
{ {
if (ev.numIterations > 0 && iteration >= ev.numIterations) break; if (ev.numIterations > 0 && iteration >= ev.numIterations) break;
if (ev.numIterations < 0 && totalCpuTime > -ev.numIterations) break; if (ev.numIterations < 0 && totalCpuTime > -ev.numIterations) break;
...@@ -332,7 +333,7 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -332,7 +333,7 @@ void ExecuteTransfers(EnvVars const& ev,
} }
// Pause for interactive mode // Pause for interactive mode
if (verbose && ev.useInteractive) if (verbose && isSrcCorrect && ev.useInteractive)
{ {
printf("Transfers complete. Hit <Enter> to continue: "); printf("Transfers complete. Hit <Enter> to continue: ");
if (scanf("%*c") != 0) if (scanf("%*c") != 0)
...@@ -358,6 +359,7 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -358,6 +359,7 @@ void ExecuteTransfers(EnvVars const& ev,
double totalBandwidthGbs = (totalBytesTransferred / 1.0E6) / totalCpuTime; double totalBandwidthGbs = (totalBytesTransferred / 1.0E6) / totalCpuTime;
double maxGpuTime = 0; double maxGpuTime = 0;
if (!isSrcCorrect) goto cleanup;
if (ev.useSingleStream) if (ev.useSingleStream)
{ {
for (auto& exeInfoPair : transferMap) for (auto& exeInfoPair : transferMap)
...@@ -478,6 +480,7 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -478,6 +480,7 @@ void ExecuteTransfers(EnvVars const& ev,
} }
// Release GPU memory // Release GPU memory
cleanup:
for (auto exeInfoPair : transferMap) for (auto exeInfoPair : transferMap)
{ {
ExecutorInfo& exeInfo = exeInfoPair.second; ExecutorInfo& exeInfo = exeInfoPair.second;
...@@ -1382,7 +1385,7 @@ void Transfer::PrepareReference(EnvVars const& ev, std::vector<float>& buffer, i ...@@ -1382,7 +1385,7 @@ void Transfer::PrepareReference(EnvVars const& ev, std::vector<float>& buffer, i
else else
{ {
for (size_t i = 0; i < N; ++i) for (size_t i = 0; i < N; ++i)
buffer[i] = (i % 383 + 31) * (bufferIdx + 1); buffer[i] = PrepSrcValue(bufferIdx, i);
} }
} }
else // Destination buffer else // Destination buffer
...@@ -1412,9 +1415,9 @@ void Transfer::PrepareReference(EnvVars const& ev, std::vector<float>& buffer, i ...@@ -1412,9 +1415,9 @@ void Transfer::PrepareReference(EnvVars const& ev, std::vector<float>& buffer, i
} }
} }
void Transfer::PrepareSrc(EnvVars const& ev) bool Transfer::PrepareSrc(EnvVars const& ev)
{ {
if (this->numSrcs == 0) return; if (this->numSrcs == 0) return true;
size_t const N = this->numBytesActual / sizeof(float); size_t const N = this->numBytesActual / sizeof(float);
int const initOffset = ev.byteOffset / sizeof(float); int const initOffset = ev.byteOffset / sizeof(float);
...@@ -1422,7 +1425,17 @@ void Transfer::PrepareSrc(EnvVars const& ev) ...@@ -1422,7 +1425,17 @@ void Transfer::PrepareSrc(EnvVars const& ev)
for (int srcIdx = 0; srcIdx < this->numSrcs; ++srcIdx) for (int srcIdx = 0; srcIdx < this->numSrcs; ++srcIdx)
{ {
PrepareReference(ev, reference, srcIdx); PrepareReference(ev, reference, srcIdx);
if (ev.usePrepSrcKernel && IsGpuType(this->srcType[srcIdx]))
{
int const srcIndex = RemappedIndex(this->srcIndex[srcIdx], false);
HIP_CALL(hipSetDevice(srcIndex));
PrepSrcDataKernel<<<32, BLOCKSIZE>>>(this->srcMem[srcIdx] + initOffset, N, srcIdx);
HIP_CALL(hipDeviceSynchronize());
}
else
{
HIP_CALL(hipMemcpy(this->srcMem[srcIdx] + initOffset, reference.data(), this->numBytesActual, hipMemcpyDefault)); HIP_CALL(hipMemcpy(this->srcMem[srcIdx] + initOffset, reference.data(), this->numBytesActual, hipMemcpyDefault));
}
// Perform check just to make sure that data has been copied properly // Perform check just to make sure that data has been copied properly
std::vector<float> srcCopy(N); std::vector<float> srcCopy(N);
...@@ -1433,7 +1446,13 @@ void Transfer::PrepareSrc(EnvVars const& ev) ...@@ -1433,7 +1446,13 @@ void Transfer::PrepareSrc(EnvVars const& ev)
if (reference[i] != srcCopy[i]) if (reference[i] != srcCopy[i])
{ {
printf("\n[ERROR] Unexpected mismatch at index %lu of source array %d:\n", i, srcIdx); printf("\n[ERROR] Unexpected mismatch at index %lu of source array %d:\n", i, srcIdx);
#if !defined(__NVCC__)
float const val = this->srcMem[srcIdx][initOffset + i];
printf("[ERROR] SRC %02d value: %10.5f [%08X] Direct: %10.5f [%08X]\n",
srcIdx, srcCopy[i], *(unsigned int*)&srcCopy[i], val, *(unsigned int*)&val);
#else
printf("[ERROR] SRC %02d value: %10.5f [%08X]\n", srcIdx, srcCopy[i], *(unsigned int*)&srcCopy[i]); printf("[ERROR] SRC %02d value: %10.5f [%08X]\n", srcIdx, srcCopy[i], *(unsigned int*)&srcCopy[i]);
#endif
printf("[ERROR] EXPECTED value: %10.5f [%08X]\n", reference[i], *(unsigned int*)&reference[i]); printf("[ERROR] EXPECTED value: %10.5f [%08X]\n", reference[i], *(unsigned int*)&reference[i]);
printf("[ERROR] Failed Transfer details: #%d: %s -> [%c%d:%d] -> %s\n", printf("[ERROR] Failed Transfer details: #%d: %s -> [%c%d:%d] -> %s\n",
this->transferIndex, this->transferIndex,
...@@ -1443,9 +1462,11 @@ void Transfer::PrepareSrc(EnvVars const& ev) ...@@ -1443,9 +1462,11 @@ void Transfer::PrepareSrc(EnvVars const& ev)
this->DstToStr().c_str()); this->DstToStr().c_str());
if (!ev.continueOnError) if (!ev.continueOnError)
exit(1); exit(1);
return false;
} }
} }
} }
return true;
} }
void Transfer::ValidateDst(EnvVars const& ev) void Transfer::ValidateDst(EnvVars const& ev)
...@@ -1480,10 +1501,22 @@ void Transfer::ValidateDst(EnvVars const& ev) ...@@ -1480,10 +1501,22 @@ void Transfer::ValidateDst(EnvVars const& ev)
{ {
float srcVal; float srcVal;
HIP_CALL(hipMemcpy(&srcVal, this->srcMem[srcIdx] + initOffset + i, sizeof(float), hipMemcpyDefault)); HIP_CALL(hipMemcpy(&srcVal, this->srcMem[srcIdx] + initOffset + i, sizeof(float), hipMemcpyDefault));
#if !defined(__NVCC__)
float val = this->srcMem[srcIdx][initOffset + i];
printf("[ERROR] SRC %02dD value: %10.5f [%08X] Direct: %10.5f [%08X]\n",
srcIdx, srcVal, *(unsigned int*)&srcVal, val, *(unsigned int*)&val);
#else
printf("[ERROR] SRC %02d value: %10.5f [%08X]\n", srcIdx, srcVal, *(unsigned int*)&srcVal); printf("[ERROR] SRC %02d value: %10.5f [%08X]\n", srcIdx, srcVal, *(unsigned int*)&srcVal);
#endif
} }
printf("[ERROR] EXPECTED value: %10.5f [%08X]\n", reference[i], *(unsigned int*)&reference[i]); printf("[ERROR] EXPECTED value: %10.5f [%08X]\n", reference[i], *(unsigned int*)&reference[i]);
#if !defined(__NVCC__)
float dstVal = this->dstMem[dstIdx][initOffset + i];
printf("[ERROR] DST %02d value: %10.5f [%08X] Direct: %10.5f [%08X]\n",
dstIdx, output[i], *(unsigned int*)&output[i], dstVal, *(unsigned int*)&dstVal);
#else
printf("[ERROR] DST %02d value: %10.5f [%08X]\n", dstIdx, output[i], *(unsigned int*)&output[i]); printf("[ERROR] DST %02d value: %10.5f [%08X]\n", dstIdx, output[i], *(unsigned int*)&output[i]);
#endif
printf("[ERROR] Failed Transfer details: #%d: %s -> [%c%d:%d] -> %s\n", printf("[ERROR] Failed Transfer details: #%d: %s -> [%c%d:%d] -> %s\n",
this->transferIndex, this->transferIndex,
this->SrcToStr().c_str(), this->SrcToStr().c_str(),
...@@ -1526,9 +1559,14 @@ int GetWallClockRate(int deviceId) ...@@ -1526,9 +1559,14 @@ int GetWallClockRate(int deviceId)
HIP_CALL(hipGetDeviceCount(&numGpuDevices)); HIP_CALL(hipGetDeviceCount(&numGpuDevices));
wallClockPerDeviceMhz.resize(numGpuDevices); wallClockPerDeviceMhz.resize(numGpuDevices);
hipDeviceProp_t prop;
for (int i = 0; i < numGpuDevices; i++) for (int i = 0; i < numGpuDevices; i++)
{ {
#if defined(__NVCC__)
int value = 1410000;
//HIP_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeClockRate, i));
//value *= 1000;
#else
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, i)); HIP_CALL(hipGetDeviceProperties(&prop, i));
int value = 25000; int value = 25000;
switch (prop.gcnArch) switch (prop.gcnArch)
...@@ -1537,6 +1575,7 @@ int GetWallClockRate(int deviceId) ...@@ -1537,6 +1575,7 @@ int GetWallClockRate(int deviceId)
default: default:
printf("Unrecognized GCN arch %d\n", prop.gcnArch); printf("Unrecognized GCN arch %d\n", prop.gcnArch);
} }
#endif
wallClockPerDeviceMhz[i] = value; wallClockPerDeviceMhz[i] = value;
} }
} }
......
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#if defined(__NVCC__)
#include <cuda_runtime.h>
// ROCm specific
#define __builtin_amdgcn_s_memrealtime clock64
#define gcnArchName name
// Datatypes
#define hipDeviceProp_t cudaDeviceProp
#define hipError_t cudaError_t
#define hipEvent_t cudaEvent_t
#define hipStream_t cudaStream_t
// Enumerations
#define hipDeviceAttributeClockRate cudaDevAttrClockRate
#define hipDeviceAttributeMaxSharedMemoryPerMultiprocessor cudaDevAttrMaxSharedMemoryPerMultiprocessor
#define hipDeviceAttributeMultiprocessorCount cudaDevAttrMultiProcessorCount
#define hipErrorPeerAccessAlreadyEnabled cudaErrorPeerAccessAlreadyEnabled
#define hipFuncCachePreferShared cudaFuncCachePreferShared
#define hipMemcpyDefault cudaMemcpyDefault
#define hipMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define hipMemcpyHostToDevice cudaMemcpyHostToDevice
#define hipSuccess cudaSuccess
// Functions
#define hipDeviceCanAccessPeer cudaDeviceCanAccessPeer
#define hipDeviceEnablePeerAccess cudaDeviceEnablePeerAccess
#define hipDeviceGetAttribute cudaDeviceGetAttribute
#define hipDeviceGetPCIBusId cudaDeviceGetPCIBusId
#define hipDeviceSetCacheConfig cudaDeviceSetCacheConfig
#define hipDeviceSynchronize cudaDeviceSynchronize
#define hipEventCreate cudaEventCreate
#define hipEventDestroy cudaEventDestroy
#define hipEventElapsedTime cudaEventElapsedTime
#define hipEventRecord cudaEventRecord
#define hipFree cudaFree
#define hipGetDeviceCount cudaGetDeviceCount
#define hipGetDeviceProperties cudaGetDeviceProperties
#define hipGetErrorString cudaGetErrorString
#define hipHostFree cudaFreeHost
#define hipHostMalloc cudaMallocHost
#define hipMalloc cudaMalloc
#define hipMemcpy cudaMemcpy
#define hipMemcpyAsync cudaMemcpyAsync
#define hipMemset cudaMemset
#define hipMemsetAsync cudaMemsetAsync
#define hipSetDevice cudaSetDevice
#define hipStreamCreate cudaStreamCreate
#define hipStreamDestroy cudaStreamDestroy
#define hipStreamSynchronize cudaStreamSynchronize
// Define float4 addition operator for NVIDIA platform
// Element-wise accumulation for float4; the CUDA toolkit headers do not
// provide arithmetic operators for vector types, so define += here.
__device__ inline float4& operator +=(float4& a, const float4& b)
{
  a.x = a.x + b.x;
  a.y = a.y + b.y;
  a.z = a.z + b.z;
  a.w = a.w + b.w;
  return a;
}
#else
#include <hip/hip_ext.h>
#include <hip/hip_runtime.h>
#include <hsa/hsa_ext_amd.h>
#endif
...@@ -26,9 +26,10 @@ THE SOFTWARE. ...@@ -26,9 +26,10 @@ THE SOFTWARE.
#include <algorithm> #include <algorithm>
#include <random> #include <random>
#include <time.h> #include <time.h>
#include "Compatibility.hpp"
#include "Kernels.hpp" #include "Kernels.hpp"
#define TB_VERSION "1.16" #define TB_VERSION "1.17"
extern char const MemTypeStr[]; extern char const MemTypeStr[];
extern char const ExeTypeStr[]; extern char const ExeTypeStr[];
...@@ -74,6 +75,7 @@ public: ...@@ -74,6 +75,7 @@ public:
int sharedMemBytes; // Amount of shared memory to use per threadblock int sharedMemBytes; // Amount of shared memory to use per threadblock
int useInteractive; // Pause for user-input before starting transfer loop int useInteractive; // Pause for user-input before starting transfer loop
int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device
int usePrepSrcKernel; // Use GPU kernel to prepare source data instead of copy (can't be used with fillPattern)
int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer
std::vector<float> fillPattern; // Pattern of floats used to fill source data std::vector<float> fillPattern; // Pattern of floats used to fill source data
...@@ -116,6 +118,12 @@ public: ...@@ -116,6 +118,12 @@ public:
int maxSharedMemBytes = 0; int maxSharedMemBytes = 0;
HIP_CALL(hipDeviceGetAttribute(&maxSharedMemBytes, HIP_CALL(hipDeviceGetAttribute(&maxSharedMemBytes,
hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, 0)); hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, 0));
#if !defined(__NVCC__)
int defaultSharedMemBytes = maxSharedMemBytes / 2 + 1;
#else
int defaultSharedMemBytes = 0;
#endif
int numDeviceCUs = 0; int numDeviceCUs = 0;
HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, 0)); HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, 0));
...@@ -143,9 +151,10 @@ public: ...@@ -143,9 +151,10 @@ public:
numWarmups = GetEnvVar("NUM_WARMUPS" , DEFAULT_NUM_WARMUPS); numWarmups = GetEnvVar("NUM_WARMUPS" , DEFAULT_NUM_WARMUPS);
outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0); outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0);
samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR); samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR);
sharedMemBytes = GetEnvVar("SHARED_MEM_BYTES" , maxSharedMemBytes / 2 + 1); sharedMemBytes = GetEnvVar("SHARED_MEM_BYTES" , defaultSharedMemBytes);
useInteractive = GetEnvVar("USE_INTERACTIVE" , 0); useInteractive = GetEnvVar("USE_INTERACTIVE" , 0);
usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0); usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0);
usePrepSrcKernel = GetEnvVar("USE_PREP_KERNEL" , 0);
useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 0); useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 0);
enableDebug = GetEnvVar("DEBUG" , 0); enableDebug = GetEnvVar("DEBUG" , 0);
gpuKernel = GetEnvVar("GPU_KERNEL" , defaultGpuKernel); gpuKernel = GetEnvVar("GPU_KERNEL" , defaultGpuKernel);
...@@ -177,6 +186,12 @@ public: ...@@ -177,6 +186,12 @@ public:
char* pattern = getenv("FILL_PATTERN"); char* pattern = getenv("FILL_PATTERN");
if (pattern != NULL) if (pattern != NULL)
{ {
if (usePrepSrcKernel)
{
printf("[ERROR] Unable to use FILL_PATTERN and USE_PREP_KERNEL together\n");
exit(1);
}
int patternLen = strlen(pattern); int patternLen = strlen(pattern);
if (patternLen % 2) if (patternLen % 2)
{ {
...@@ -365,6 +380,7 @@ public: ...@@ -365,6 +380,7 @@ public:
printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n"); printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n"); printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n"); printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
printf(" USE_PREP_KERNEL - Use GPU kernel to initialize source data array pattern\n");
printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer\n"); printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer\n");
} }
...@@ -382,7 +398,7 @@ public: ...@@ -382,7 +398,7 @@ public:
if (fillPattern.size()) if (fillPattern.size())
printf("Pattern: %s", getenv("FILL_PATTERN")); printf("Pattern: %s", getenv("FILL_PATTERN"));
else else
printf("Pseudo-random: (Element i = i modulo 383 + 31) * (InputIdx + 1)"); printf("Pseudo-random: %s", PrepSrcValueString().c_str());
printf("\n"); printf("\n");
printf("%-20s = %12d : Using GPU kernel %d [%s]\n" , "GPU_KERNEL", gpuKernel, gpuKernel, GpuKernelNames[gpuKernel].c_str()); printf("%-20s = %12d : Using GPU kernel %d [%s]\n" , "GPU_KERNEL", gpuKernel, gpuKernel, GpuKernelNames[gpuKernel].c_str());
printf("%-20s = %12d : Using %d CPU devices\n" , "NUM_CPU_DEVICES", numCpuDevices, numCpuDevices); printf("%-20s = %12d : Using %d CPU devices\n" , "NUM_CPU_DEVICES", numCpuDevices, numCpuDevices);
...@@ -399,6 +415,8 @@ public: ...@@ -399,6 +415,8 @@ public:
useInteractive ? "interactive" : "non-interactive"); useInteractive ? "interactive" : "non-interactive");
printf("%-20s = %12d : Using %s-based GPU indexing\n", "USE_PCIE_INDEX", printf("%-20s = %12d : Using %s-based GPU indexing\n", "USE_PCIE_INDEX",
usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP")); usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
printf("%-20s = %12d : Using %s to initialize source data\n", "USE_PREP_KERNEL",
usePrepSrcKernel, (usePrepSrcKernel ? "GPU kernels" : "hipMemcpy"));
printf("%-20s = %12d : Using single stream per %s\n", "USE_SINGLE_STREAM", printf("%-20s = %12d : Using single stream per %s\n", "USE_SINGLE_STREAM",
useSingleStream, (useSingleStream ? "device" : "Transfer")); useSingleStream, (useSingleStream ? "device" : "Transfer"));
printf("\n"); printf("\n");
...@@ -413,7 +431,7 @@ public: ...@@ -413,7 +431,7 @@ public:
if (fillPattern.size()) if (fillPattern.size())
printf("Pattern: %s", getenv("FILL_PATTERN")); printf("Pattern: %s", getenv("FILL_PATTERN"));
else else
printf("Pseudo-random: (Element i = i modulo 383 + 31) * (InputIdx + 1)"); printf("Pseudo-random: %s", PrepSrcValueString().c_str());
printf("\n"); printf("\n");
printf("NUM_CPU_DEVICES,%d,Using %d CPU devices\n" , numCpuDevices, numCpuDevices); printf("NUM_CPU_DEVICES,%d,Using %d CPU devices\n" , numCpuDevices, numCpuDevices);
printf("NUM_GPU_DEVICES,%d,Using %d GPU devices\n", numGpuDevices, numGpuDevices); printf("NUM_GPU_DEVICES,%d,Using %d GPU devices\n", numGpuDevices, numGpuDevices);
...@@ -423,6 +441,8 @@ public: ...@@ -423,6 +441,8 @@ public:
printf("NUM_WARMUPS,%d,Running %d warmup iteration(s) per Test\n", numWarmups, numWarmups); printf("NUM_WARMUPS,%d,Running %d warmup iteration(s) per Test\n", numWarmups, numWarmups);
printf("SHARED_MEM_BYTES,%d,Using %d shared mem per threadblock\n", sharedMemBytes, sharedMemBytes); printf("SHARED_MEM_BYTES,%d,Using %d shared mem per threadblock\n", sharedMemBytes, sharedMemBytes);
printf("USE_PCIE_INDEX,%d,Using %s-based GPU indexing\n", usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP")); printf("USE_PCIE_INDEX,%d,Using %s-based GPU indexing\n", usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
printf("USE_PREP_KERNEL,%d,Using %s to initialize source data\n",
usePrepSrcKernel, (usePrepSrcKernel ? "GPU kernels" : "hipMemcpy"));
printf("USE_SINGLE_STREAM,%d,Using single stream per %s\n", useSingleStream, (useSingleStream ? "device" : "Transfer")); printf("USE_SINGLE_STREAM,%d,Using single stream per %s\n", useSingleStream, (useSingleStream ? "device" : "Transfer"));
} }
}; };
...@@ -441,11 +461,12 @@ public: ...@@ -441,11 +461,12 @@ public:
printf("%-20s = %12d : Each CU gets a multiple of %d bytes to copy\n", "BLOCK_BYTES", blockBytes, blockBytes); printf("%-20s = %12d : Each CU gets a multiple of %d bytes to copy\n", "BLOCK_BYTES", blockBytes, blockBytes);
printf("%-20s = %12d : Using byte offset of %d\n", "BYTE_OFFSET", byteOffset, byteOffset); printf("%-20s = %12d : Using byte offset of %d\n", "BYTE_OFFSET", byteOffset, byteOffset);
printf("%-20s = %12d : Continue on error\n", "CONTINUE_ON_ERROR", continueOnError);
printf("%-20s = %12s : ", "FILL_PATTERN", getenv("FILL_PATTERN") ? "(specified)" : "(unset)"); printf("%-20s = %12s : ", "FILL_PATTERN", getenv("FILL_PATTERN") ? "(specified)" : "(unset)");
if (fillPattern.size()) if (fillPattern.size())
printf("Pattern: %s", getenv("FILL_PATTERN")); printf("Pattern: %s", getenv("FILL_PATTERN"));
else else
printf("Pseudo-random: (Element i = i modulo 383 + 31) * (InputIdx + 1)"); printf("Pseudo-random: %s", PrepSrcValueString().c_str());
printf("\n"); printf("\n");
printf("%-20s = %12d : Using %d CPU devices\n" , "NUM_CPU_DEVICES", numCpuDevices, numCpuDevices); printf("%-20s = %12d : Using %d CPU devices\n" , "NUM_CPU_DEVICES", numCpuDevices, numCpuDevices);
printf("%-20s = %12d : Using %d GPU devices\n", "NUM_GPU_DEVICES", numGpuDevices, numGpuDevices); printf("%-20s = %12d : Using %d GPU devices\n", "NUM_GPU_DEVICES", numGpuDevices, numGpuDevices);
...@@ -459,6 +480,8 @@ public: ...@@ -459,6 +480,8 @@ public:
useInteractive ? "interactive" : "non-interactive"); useInteractive ? "interactive" : "non-interactive");
printf("%-20s = %12d : Using %s-based GPU indexing\n", "USE_PCIE_INDEX", printf("%-20s = %12d : Using %s-based GPU indexing\n", "USE_PCIE_INDEX",
usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP")); usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
printf("%-20s = %12d : Using %s to initialize source data\n", "USE_PREP_KERNEL",
usePrepSrcKernel, (usePrepSrcKernel ? "GPU kernels" : "hipMemcpy"));
printf("\n"); printf("\n");
} }
else else
...@@ -474,7 +497,7 @@ public: ...@@ -474,7 +497,7 @@ public:
if (fillPattern.size()) if (fillPattern.size())
printf("Pattern: %s", getenv("FILL_PATTERN")); printf("Pattern: %s", getenv("FILL_PATTERN"));
else else
printf("Pseudo-random: (Element i = i modulo 383 + 31) * (InputIdx + 1)"); printf("Pseudo-random: %s", PrepSrcValueString().c_str());
printf("\n"); printf("\n");
printf("NUM_CPU_DEVICES,%d,Using %d CPU devices\n" , numCpuDevices, numCpuDevices); printf("NUM_CPU_DEVICES,%d,Using %d CPU devices\n" , numCpuDevices, numCpuDevices);
printf("NUM_GPU_DEVICES,%d,Using %d GPU devices\n", numGpuDevices, numGpuDevices); printf("NUM_GPU_DEVICES,%d,Using %d GPU devices\n", numGpuDevices, numGpuDevices);
...@@ -485,6 +508,8 @@ public: ...@@ -485,6 +508,8 @@ public:
printf("SHARED_MEM_BYTES,%d,Using %d shared mem per threadblock\n", sharedMemBytes, sharedMemBytes); printf("SHARED_MEM_BYTES,%d,Using %d shared mem per threadblock\n", sharedMemBytes, sharedMemBytes);
printf("USE_PCIE_INDEX,%d,Using %s-based GPU indexing\n", usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP")); printf("USE_PCIE_INDEX,%d,Using %s-based GPU indexing\n", usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
printf("USE_SINGLE_STREAM,%d,Using single stream per %s\n", useSingleStream, (useSingleStream ? "device" : "Transfer")); printf("USE_SINGLE_STREAM,%d,Using single stream per %s\n", useSingleStream, (useSingleStream ? "device" : "Transfer"));
printf("USE_PREP_KERNEL,%d,Using %s to initialize source data\n",
usePrepSrcKernel, (usePrepSrcKernel ? "GPU kernels" : "hipMemcpy"));
printf("\n"); printf("\n");
} }
} }
...@@ -515,7 +540,7 @@ public: ...@@ -515,7 +540,7 @@ public:
if (fillPattern.size()) if (fillPattern.size())
printf("Pattern: %s", getenv("FILL_PATTERN")); printf("Pattern: %s", getenv("FILL_PATTERN"));
else else
printf("Pseudo-random: (Element i = i modulo 383 + 31)"); printf("Pseudo-random: %s", PrepSrcValueString().c_str());
printf("\n"); printf("\n");
printf("%-20s = %12d : Running %d %s per Test\n", "NUM_ITERATIONS", numIterations, printf("%-20s = %12d : Running %d %s per Test\n", "NUM_ITERATIONS", numIterations,
numIterations > 0 ? numIterations : -numIterations, numIterations > 0 ? numIterations : -numIterations,
...@@ -527,8 +552,11 @@ public: ...@@ -527,8 +552,11 @@ public:
getenv("SHARED_MEM_BYTES") ? "(specified)" : "(unset)", sharedMemBytes); getenv("SHARED_MEM_BYTES") ? "(specified)" : "(unset)", sharedMemBytes);
printf("%-20s = %12d : Using %s-based GPU indexing\n", "USE_PCIE_INDEX", printf("%-20s = %12d : Using %s-based GPU indexing\n", "USE_PCIE_INDEX",
usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP")); usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
printf("USE_PREP_KERNEL,%d,Using %s to initialize source data\n",
usePrepSrcKernel, (usePrepSrcKernel ? "GPU kernels" : "hipMemcpy"));
printf("%-20s = %12d : Using single stream per %s\n", "USE_SINGLE_STREAM", printf("%-20s = %12d : Using single stream per %s\n", "USE_SINGLE_STREAM",
useSingleStream, (useSingleStream ? "device" : "Transfer")); useSingleStream, (useSingleStream ? "device" : "Transfer"));
printf("%-20s = %12d : Continue on error\n", "CONTINUE_ON_ERROR", continueOnError);
printf("\n"); printf("\n");
} }
else else
...@@ -553,7 +581,7 @@ public: ...@@ -553,7 +581,7 @@ public:
if (fillPattern.size()) if (fillPattern.size())
printf("Pattern: %s", getenv("FILL_PATTERN")); printf("Pattern: %s", getenv("FILL_PATTERN"));
else else
printf("Pseudo-random: (Element i = i modulo 383 + 31)"); printf("Pseudo-random: %s", PrepSrcValueString().c_str());
printf("\n"); printf("\n");
printf("NUM_ITERATIONS,%d,Running %d %s per Test\n", numIterations, printf("NUM_ITERATIONS,%d,Running %d %s per Test\n", numIterations,
numIterations > 0 ? numIterations : -numIterations, numIterations > 0 ? numIterations : -numIterations,
...@@ -561,6 +589,8 @@ public: ...@@ -561,6 +589,8 @@ public:
printf("NUM_WARMUPS,%d,Running %d warmup iteration(s) per Test\n", numWarmups, numWarmups); printf("NUM_WARMUPS,%d,Running %d warmup iteration(s) per Test\n", numWarmups, numWarmups);
printf("SHARED_MEM_BYTES,%d,Using %d shared mem per threadblock\n", sharedMemBytes, sharedMemBytes); printf("SHARED_MEM_BYTES,%d,Using %d shared mem per threadblock\n", sharedMemBytes, sharedMemBytes);
printf("USE_PCIE_INDEX,%d,Using %s-based GPU indexing\n", usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP")); printf("USE_PCIE_INDEX,%d,Using %s-based GPU indexing\n", usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
printf("USE_PREP_KERNEL,%d,Using %s to initialize source data\n",
usePrepSrcKernel, (usePrepSrcKernel ? "GPU kernels" : "hipMemcpy"));
printf("USE_SINGLE_STREAM,%d,Using single stream per %s\n", useSingleStream, (useSingleStream ? "device" : "Transfer")); printf("USE_SINGLE_STREAM,%d,Using single stream per %s\n", useSingleStream, (useSingleStream ? "device" : "Transfer"));
} }
}; };
......
...@@ -34,6 +34,7 @@ THE SOFTWARE. ...@@ -34,6 +34,7 @@ THE SOFTWARE.
} while (0) } while (0)
// Structure to hold HSA agent information // Structure to hold HSA agent information
#if !defined(__NVCC__)
struct AgentData struct AgentData
{ {
bool isInitialized; bool isInitialized;
...@@ -128,11 +129,15 @@ AgentData& GetAgentData() ...@@ -128,11 +129,15 @@ AgentData& GetAgentData()
} }
return agentData; return agentData;
} }
#endif
// Returns closest CPU NUMA node to provided GPU // Returns closest CPU NUMA node to provided GPU
// NOTE: This assumes HSA GPU indexing is similar to HIP GPU indexing // NOTE: This assumes HSA GPU indexing is similar to HIP GPU indexing
int GetClosestNumaNode(int gpuIdx) int GetClosestNumaNode(int gpuIdx)
{ {
#if defined(__NVCC__)
return -1;
#else
AgentData& agentData = GetAgentData(); AgentData& agentData = GetAgentData();
if (gpuIdx < 0 || gpuIdx >= agentData.closestNumaNode.size()) if (gpuIdx < 0 || gpuIdx >= agentData.closestNumaNode.size())
{ {
...@@ -140,4 +145,5 @@ int GetClosestNumaNode(int gpuIdx) ...@@ -140,4 +145,5 @@ int GetClosestNumaNode(int gpuIdx)
exit(1); exit(1);
} }
return agentData.closestNumaNode[gpuIdx]; return agentData.closestNumaNode[gpuIdx];
#endif
} }
...@@ -29,19 +29,6 @@ THE SOFTWARE. ...@@ -29,19 +29,6 @@ THE SOFTWARE.
#define MEMSET_CHAR 75 #define MEMSET_CHAR 75
#define MEMSET_VAL 13323083.0f #define MEMSET_VAL 13323083.0f
#if defined(__NVCC__)
// The NVIDIA toolchain does not appear to ship operator+= for float4 (HIP does),
// so provide a component-wise implementation for kernels that accumulate float4
__device__ inline float4& operator +=(float4& lhs, const float4& rhs)
{
  lhs.x += rhs.x;
  lhs.y += rhs.y;
  lhs.z += rhs.z;
  lhs.w += rhs.w;
  return lhs;
}
#endif
// Each subExecutor is provided with subarrays to work on // Each subExecutor is provided with subarrays to work on
#define MAX_SRCS 16 #define MAX_SRCS 16
#define MAX_DSTS 16 #define MAX_DSTS 16
...@@ -85,6 +72,28 @@ void CpuReduceKernel(SubExecParam const& p) ...@@ -85,6 +72,28 @@ void CpuReduceKernel(SubExecParam const& p)
} }
} }
// Returns a human-readable description of the source-buffer fill pattern
// (keep in sync with the formula computed by PrepSrcValue)
std::string PrepSrcValueString()
{
  char const* const desc = "Element i = ((i * 517) modulo 383 + 31) * (srcBufferIdx + 1)";
  return desc;
}
// Computes the expected value of element idx in source buffer srcBufferIdx:
//   ((idx * 517) mod 383 + 31) * (srcBufferIdx + 1)
// idx is reduced mod 383 before the multiply so the intermediate stays small;
// the result is identical by modular arithmetic
__host__ __device__ float PrepSrcValue(int srcBufferIdx, size_t idx)
{
  size_t const base   = ((idx % 383) * 517) % 383 + 31;  // in [31, 413]
  size_t const scaled = base * (srcBufferIdx + 1);
  return static_cast<float>(scaled);
}
// GPU kernel to prepare src buffer data
// Grid-stride loop: each thread fills every (gridDim.x * blockDim.x)-th element,
// so any launch configuration covers all N elements
__global__ void
PrepSrcDataKernel(float* ptr, size_t N, int srcBufferIdx)
{
  // Promote to size_t before multiplying: blockIdx.x * blockDim.x (and the
  // stride product) are otherwise evaluated in 32-bit unsigned arithmetic and
  // can wrap for very large grids / buffers
  size_t const stride = (size_t)gridDim.x * blockDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
       idx < N;
       idx += stride)
  {
    ptr[idx] = PrepSrcValue(srcBufferIdx, idx);
  }
}
// Helper function for memset // Helper function for memset
template <typename T> __device__ __forceinline__ T MemsetVal(); template <typename T> __device__ __forceinline__ T MemsetVal();
template <> __device__ __forceinline__ float MemsetVal(){ return MEMSET_VAL; }; template <> __device__ __forceinline__ float MemsetVal(){ return MEMSET_VAL; };
......
...@@ -32,15 +32,7 @@ THE SOFTWARE. ...@@ -32,15 +32,7 @@ THE SOFTWARE.
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#if defined(__NVCC__) #include "Compatibility.hpp"
#include <cuda_runtime.h>
#define __builtin_amdgcn_s_memrealtime clock64
#else
#include <hip/hip_ext.h>
#endif
#include <hip/hip_runtime.h>
#include <hsa/hsa_ext_amd.h>
// Helper macro for catching HIP errors // Helper macro for catching HIP errors
#define HIP_CALL(cmd) \ #define HIP_CALL(cmd) \
...@@ -131,7 +123,7 @@ struct Transfer ...@@ -131,7 +123,7 @@ struct Transfer
void PrepareSubExecParams(EnvVars const& ev); void PrepareSubExecParams(EnvVars const& ev);
// Prepare source arrays with input data // Prepare source arrays with input data
void PrepareSrc(EnvVars const& ev); bool PrepareSrc(EnvVars const& ev);
// Validate that destination data contains expected results // Validate that destination data contains expected results
void ValidateDst(EnvVars const& ev); void ValidateDst(EnvVars const& ev);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment