Unverified Commit be0db711, authored by gilbertlee-amd, committed by GitHub

V1.06 (#4)

* Update version to v1.06
* Fix CPU NUMA allocation
* Fix random sweep repeatability
* Add unpinned CPU memory as a possible memory type
* Add ability to customize per-Transfer byte sizes
* Update advanced configuration file mode to take numBytes per Transfer
* Add logging of sweep test configuration to lastSweep.cfg
* Add ability to specify the number of CUs for the sweep benchmark
parent 5331f980
 # Changelog for TransferBench
+## v1.06
+### Added
+- Added unpinned CPU memory type ('U'). May require HSA_XNACK=1 in order to access via GPU executors
+- Added logging of sweep configuration to lastSweep.cfg
+- Added ability to specify the number of CUs to use for sweep-based presets
+### Changed
+- Fixed random sweep repeatability
+- Fixed bug with CPU NUMA node memory allocation
+- Modified advanced configuration file format to accept bytes per Transfer
 ## v1.05
 ### Added
 - Topology output now includes NUMA node information
@@ -26,7 +26,7 @@ THE SOFTWARE.
 #include <algorithm>
 #include <random>
 #include <time.h>
-#define TB_VERSION "1.05"
+#define TB_VERSION "1.06"
 extern char const MemTypeStr[];
@@ -55,18 +55,23 @@ size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<26); // Amount of data transferre
 // Different src/dst memory types supported
 typedef enum
 {
   MEM_CPU          = 0, // Coarse-grained pinned CPU memory
   MEM_GPU          = 1, // Coarse-grained global GPU memory
   MEM_CPU_FINE     = 2, // Fine-grained pinned CPU memory
-  MEM_GPU_FINE     = 3  // Fine-grained global GPU memory
+  MEM_GPU_FINE     = 3, // Fine-grained global GPU memory
+  MEM_CPU_UNPINNED = 4  // Unpinned CPU memory
 } MemType;

 bool IsGpuType(MemType m)
 {
   return (m == MEM_GPU || m == MEM_GPU_FINE);
 }

+bool IsCpuType(MemType m)
+{
+  return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED);
+}
+
-char const MemTypeStr[5] = "CGBF";
+char const MemTypeStr[6] = "CGBFU";

 MemType inline CharToMemType(char const c)
 {
@@ -76,6 +81,7 @@ MemType inline CharToMemType(char const c)
   case 'G': return MEM_GPU;
   case 'B': return MEM_CPU_FINE;
   case 'F': return MEM_GPU_FINE;
+  case 'U': return MEM_CPU_UNPINNED;
   default:
     printf("[ERROR] Unexpected mem type (%c)\n", c);
     exit(1);
@@ -112,6 +118,7 @@ struct Transfer
   int    dstIndex;       // Destination device index
   int    numBlocksToUse; // Number of threadblocks to use for this Transfer
   size_t numBytes;       // Number of bytes to Transfer
+  size_t numBytesToCopy; // Number of bytes to copy

   // Memory
   float* srcMem;         // Source memory
@@ -132,7 +139,7 @@ typedef std::pair<MemType, int> Executor;
 struct ExecutorInfo
 {
-  std::vector<Transfer>  transfers; // Transfers to execute
+  std::vector<Transfer*> transfers; // Transfers to execute
   size_t totalBytes;                // Total bytes this executor transfers

   // For GPU-Executors
@@ -164,17 +171,17 @@ void ParseMemType(std::string const& token, int const numCpus, int const numGpus
 void ParseTransfers(char* line, int numCpus, int numGpus,
                     std::vector<Transfer>& transfers);

-void ExecuteTransfers(EnvVars const& ev, int testNum, std::vector<size_t> const& valuesOfN,
-                      std::vector<Transfer>& transfers);
+void ExecuteTransfers(EnvVars const& ev, int const testNum, size_t const N,
+                      std::vector<Transfer>& transfers, bool verbose = true);

 void EnablePeerAccess(int const deviceId, int const peerDeviceId);
 void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr);
-void DeallocateMemory(MemType memType, void* memPtr);
+void DeallocateMemory(MemType memType, void* memPtr, size_t const size = 0);
 void CheckPages(char* byteArray, size_t numBytes, int targetId);
 void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr);
 void RunTransfer(EnvVars const& ev, int const iteration, ExecutorInfo& exeInfo, int const transferIdx);
 void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu);
-void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool const isRandom);
+void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numBlocksToUse, bool const isRandom);

 // Return the maximum bandwidth measured for given (src/dst) pair
 double GetPeakBandwidth(EnvVars const& ev,
@@ -193,3 +200,4 @@ std::string GetDesc(MemType srcMemType, int srcIndex,
 std::string GetTransferDesc(Transfer const& transfer);
 int RemappedIndex(int const origIdx, MemType const memType);
 int GetWallClockRate(int deviceId);
+void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers);
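The new MEM_CPU_UNPINNED type and the added size parameter on DeallocateMemory fit together naturally: unpinned host memory allocated through libnuma must be released with numa_free(), which, unlike hipHostFree() or hipFree(), needs the allocation size. The sketch below illustrates that pairing. It is only an illustrative guess at the plumbing, not the repository's implementation; the *Sketch function names are hypothetical, and the real code may bind NUMA nodes or choose hipHostMalloc flags differently.

// Minimal sketch (not TransferBench's actual code) of how the new unpinned CPU
// memory type could be allocated and freed. Assumes the MemType/IsCpuType/IsGpuType
// definitions from the header above, the HIP runtime, and libnuma (-lnuma).
#include <hip/hip_runtime.h>
#include <numa.h>
#include <cstdio>
#include <cstdlib>

void AllocateMemorySketch(MemType memType, int devIndex, size_t numBytes, void** memPtr)
{
  if (IsCpuType(memType))
  {
    // For CPU memory types, devIndex is the NUMA node index
    if (memType == MEM_CPU_UNPINNED)
    {
      // Unpinned host memory: plain pages bound to the requested NUMA node.
      // Per the changelog, GPU executors may need HSA_XNACK=1 to access these pages.
      *memPtr = numa_alloc_onnode(numBytes, devIndex);
    }
    else
    {
      // Pinned host memory (coarse- or fine-grained) via the HIP runtime
      numa_set_preferred(devIndex);   // prefer allocations on the requested NUMA node
      hipHostMalloc(memPtr, numBytes, hipHostMallocDefault);
      numa_set_localalloc();          // restore default allocation policy
    }
  }
  else if (IsGpuType(memType))
  {
    hipSetDevice(devIndex);
    hipMalloc(memPtr, numBytes);      // global device memory (fine-grained variant omitted here)
  }
  if (*memPtr == nullptr) { printf("[ERROR] Unable to allocate %zu bytes\n", numBytes); exit(1); }
}

void DeallocateMemorySketch(MemType memType, void* memPtr, size_t const size)
{
  if      (memType == MEM_CPU_UNPINNED) numa_free(memPtr, size); // size is required here
  else if (IsCpuType(memType))          hipHostFree(memPtr);
  else if (IsGpuType(memType))          hipFree(memPtr);
}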
 # ConfigFile Format:
 # ==================
-# A Transfer is defined as a uni-directional transfer from src memory location to dst memory location
+# A Transfer is defined as a uni-directional copy from src memory location to dst memory location
 # executed by either CPU or GPU
 # Each single line in the configuration file defines a set of Transfers (a Test) to run in parallel
-# There are two ways to specify the configuration file:
+# There are two ways to specify a Test:
 # 1) Basic
 #    The basic specification assumes the same number of threadblocks/CUs used per GPU-executed Transfer
@@ -13,9 +13,9 @@
 #    #Transfers #CUs (srcMem1->Executor1->dstMem1) ... (srcMemL->ExecutorL->dstMemL)
 # 2) Advanced
-#    The advanced specification allows different number of threadblocks/CUs used per GPU-executed Transfer
-#    A negative number of Transfers is specified, followed by quadruples describing each Transfer
-#    -#Transfers (srcMem1->Executor1->dstMem1 #CUs1) ... (srcMemL->ExecutorL->dstMemL #CUsL)
+#    A negative number of Transfers is specified, followed by quintuplets describing each Transfer
+#    A non-zero number of bytes overrides the command-line specified size for that Transfer
+#    -#Transfers (srcMem1->Executor1->dstMem1 #CUs1 Bytes1) ... (srcMemL->ExecutorL->dstMemL #CUsL BytesL)
 # Argument Details:
 #   #Transfers: Number of Transfers to be run in parallel
@@ -25,23 +25,29 @@
 #               - C: CPU-executed (Indexed from 0 to # NUMA nodes - 1)
 #               - G: GPU-executed (Indexed from 0 to # GPUs - 1)
 #   dstMemL   : Destination memory location (Where the data is to be written to)
+#   bytesL    : Number of bytes to copy (0 means use command-line specified size)
+#               Must be a multiple of 4 and may be suffixed with 'K', 'M', or 'G'
+#
 # Memory locations are specified by a character indicating memory type,
 # followed by device index (0-indexed)
 # Supported memory locations are:
 # - C: Pinned host memory       (on NUMA node, indexed from 0 to [# NUMA nodes-1])
+# - U: Unpinned host memory     (on NUMA node, indexed from 0 to [# NUMA nodes-1])
 # - B: Fine-grain host memory   (on NUMA node, indexed from 0 to [# NUMA nodes-1])
 # - G: Global device memory     (on GPU device indexed from 0 to [# GPUs - 1])
 # - F: Fine-grain device memory (on GPU device indexed from 0 to [# GPUs - 1])

 # Examples:
-# 1 4 (G0->G0->G1)              Single Transfer using 4 CUs on GPU0 to copy from GPU0 to GPU1
-# 1 4 (C1->G2->G0)              Single Transfer using 4 CUs on GPU2 to copy from CPU1 to GPU0
-# 2 4 G0->G0->G1 G1->G1->G0     Runs 2 Transfers in parallel. GPU0 to GPU1, and GPU1 to GPU0, each with 4 CUs
-# -2 (G0 G0 G1 4) (G1 G1 G0 2)  Runs 2 Transfers in parallel. GPU0 to GPU1 with 4 CUs, and GPU1 to GPU0 with 2 CUs
+# 1 4 (G0->G0->G1)                    Uses 4 CUs on GPU0 to copy from GPU0 to GPU1
+# 1 4 (C1->G2->G0)                    Uses 4 CUs on GPU2 to copy from CPU1 to GPU0
+# 2 4 G0->G0->G1 G1->G1->G0           Copies from GPU0 to GPU1, and GPU1 to GPU0, each with 4 CUs
+# -2 (G0 G0 G1 4 1M) (G1 G1 G0 2 2M)  Copies 1 MB from GPU0 to GPU1 with 4 CUs, and 2 MB from GPU1 to GPU0 with 2 CUs

 # Round brackets and arrows '->' may be included for human clarity, but will be ignored and are unnecessary
 # Lines starting with # will be ignored. Lines starting with ## will be echoed to output

 # Single GPU-executed Transfer between GPUs 0 and 1 using 4 CUs
 1 4 (G0->G0->G1)

+# Copies 1 MB from GPU0 to GPU1 with 4 CUs, and 2 MB from GPU1 to GPU0 with 8 CUs
+-2 (G0->G0->G1 4 1M) (G1->G1->G0 8 2M)
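+# Additional illustrative example using the new unpinned host memory type 'U':
+# a single Transfer using 4 CUs on GPU0 to copy from unpinned memory on NUMA node 0 to GPU1
+# (per the v1.06 changelog, GPU executors may require HSA_XNACK=1 to access unpinned CPU memory)
+1 4 (U0->G0->G1)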