Unverified Commit 0b7b979e authored by gilbertlee-amd, committed by GitHub

Minor changes to p2p, a2a (#55)

parent a9cb3a25
 # Changelog for TransferBench
+## v1.28
+### Added
+- Added A2A_DIRECT, which runs the all-to-all benchmark only between directly connected GPUs (now on by default)
+- Added average statistics for the p2p and a2a benchmarks
+- Added USE_FINE_GRAIN for the p2p benchmark
+  - On older devices, p2p timing with the default coarse-grained device memory stops as soon as the request is sent to the data fabric,
+    not when the data actually arrives remotely, which can artificially inflate bandwidth numbers, especially for small transfers
+### Modified
+- Modified P2P output to help distinguish between CPU / GPU devices
+### Fixed
+- Fixed the Makefile target to prevent unnecessary re-compilation
 ## v1.27
 ### Added
 - Adding cmdline preset to allow specifying simple tests on the command line
@@ -7,9 +7,9 @@ NVCC=$(CUDA_PATH)/bin/nvcc
 # Compile TransferBenchCuda if nvcc detected
 ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
-EXE=TransferBenchCuda
+EXE=../TransferBenchCuda
 else
-EXE=TransferBench
+EXE=../TransferBench
 endif
 CXXFLAGS = -O3 -Iinclude -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
@@ -17,11 +17,11 @@ NVFLAGS = -O3 -g -Iinclude -x cu -lnuma -gencode=arch=compute_80,code=sm_80 -gen
 LDFLAGS += -lpthread

 all: $(EXE)

-TransferBench: TransferBench.cpp $(shell find -regex ".*\.\hpp")
-	$(HIPCC) $(CXXFLAGS) $< -o ../$@ $(LDFLAGS)
+../TransferBench: TransferBench.cpp $(shell find -regex ".*\.\hpp")
+	$(HIPCC) $(CXXFLAGS) $< -o $@ $(LDFLAGS)

-TransferBenchCuda: TransferBench.cpp $(shell find -regex ".*\.\hpp")
-	$(NVCC) $(NVFLAGS) $< -o ../$@ $(LDFLAGS)
+../TransferBenchCuda: TransferBench.cpp $(shell find -regex ".*\.\hpp")
+	$(NVCC) $(NVFLAGS) $< -o $@ $(LDFLAGS)

 clean:
 	rm -f *.o ../TransferBench ../TransferBenchCuda

(The binaries are written one directory up, so naming the targets ../TransferBench and ../TransferBenchCuda lets make compare timestamps against the files actually produced; previously the named target never existed, so every invocation re-compiled.)
@@ -1457,6 +1457,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
       printf("%7s %02d", "CPU", i);
       if (ev.outputToCsv) printf(",");
     }
+    if (numCpus > 0) printf(" ");
     for (int i = 0; i < numGpus; i++)
     {
       printf("%7s %02d", "GPU", i);
@@ -1464,30 +1465,38 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
     }
     printf("\n");

+    double avgBwSum[2][2] = {};
+    int avgCount[2][2] = {};
+
     ExeType const gpuExeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;

     // Loop over all possible src/dst pairs
     for (int src = 0; src < numDevices; src++)
     {
       MemType const srcType = (src < numCpus ? MEM_CPU : MEM_GPU);
       int const srcIndex = (srcType == MEM_CPU ? src : src - numCpus);
+      MemType const srcTypeActual = ((ev.useFineGrain && srcType == MEM_CPU) ? MEM_CPU_FINE :
+                                     (ev.useFineGrain && srcType == MEM_GPU) ? MEM_GPU_FINE :
+                                     srcType);

       std::vector<std::vector<double>> avgBandwidth(isBidirectional + 1);
       std::vector<std::vector<double>> minBandwidth(isBidirectional + 1);
       std::vector<std::vector<double>> maxBandwidth(isBidirectional + 1);
       std::vector<std::vector<double>> stdDev(isBidirectional + 1);
+      if (src == numCpus && src != 0) printf("\n");
       for (int dst = 0; dst < numDevices; dst++)
       {
         MemType const dstType = (dst < numCpus ? MEM_CPU : MEM_GPU);
         int const dstIndex = (dstType == MEM_CPU ? dst : dst - numCpus);
+        MemType const dstTypeActual = ((ev.useFineGrain && dstType == MEM_CPU) ? MEM_CPU_FINE :
+                                       (ev.useFineGrain && dstType == MEM_GPU) ? MEM_GPU_FINE :
+                                       dstType);

         // Prepare Transfers
         std::vector<Transfer> transfers(isBidirectional + 1);

         // SRC -> DST
         transfers[0].numBytes = N * sizeof(float);
-        transfers[0].srcType.push_back(srcType);
-        transfers[0].dstType.push_back(dstType);
+        transfers[0].srcType.push_back(srcTypeActual);
+        transfers[0].dstType.push_back(dstTypeActual);
         transfers[0].srcIndex.push_back(srcIndex);
         transfers[0].dstIndex.push_back(dstIndex);
         transfers[0].numSrcs = transfers[0].numDsts = 1;
@@ -1500,8 +1509,8 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
         {
           transfers[1].numBytes = N * sizeof(float);
           transfers[1].numSrcs = transfers[1].numDsts = 1;
-          transfers[1].srcType.push_back(dstType);
-          transfers[1].dstType.push_back(srcType);
+          transfers[1].srcType.push_back(dstTypeActual);
+          transfers[1].dstType.push_back(srcTypeActual);
           transfers[1].srcIndex.push_back(dstIndex);
           transfers[1].dstIndex.push_back(srcIndex);
           transfers[1].exeType = IsGpuType(ev.useRemoteRead ? srcType : dstType) ? gpuExeType : EXE_CPU;
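For readers unfamiliar with the coarse/fine distinction that the new MEM_CPU_FINE / MEM_GPU_FINE types encode: fine-grained device memory keeps writes visible at system scope rather than letting the fabric buffer them coarsely, which is why timing with it reflects actual remote arrival. A minimal standalone sketch of requesting such memory under ROCm (this is illustrative, not TransferBench's own allocator; it assumes hipExtMallocWithFlags with the hipDeviceMallocFinegrained flag, the usual HIP route for this):

```cpp
// Minimal sketch: allocating fine-grained device memory with HIP.
// Illustrative only; TransferBench's allocation path is not shown here.
#include <hip/hip_runtime.h>
#include <cstdio>

int main()
{
  float* fineBuf = nullptr;
  // hipDeviceMallocFinegrained requests memory whose writes become visible
  // system-wide, so a timed copy cannot appear "done" before data arrives.
  hipError_t err = hipExtMallocWithFlags((void**)&fineBuf,
                                         1024 * sizeof(float),
                                         hipDeviceMallocFinegrained);
  if (err != hipSuccess)
  {
    printf("fine-grained allocation failed: %s\n", hipGetErrorString(err));
    return 1;
  }
  hipFree(fineBuf);
  return 0;
}
```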
@@ -1542,6 +1551,12 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
           double const avgBw = (transfers[dir].numBytesActual / 1.0E9) / avgTime * 1000.0f;
           avgBandwidth[dir].push_back(avgBw);

+          if (!(srcType == dstType && srcIndex == dstIndex))
+          {
+            avgBwSum[srcType][dstType] += avgBw;
+            avgCount[srcType][dstType]++;
+          }
+
           if (ev.showIterations)
           {
             double minTime = transfers[dir].perIterationTime[0];
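For reference, the avgBw expression in the hunk above converts a per-iteration time measured in milliseconds into GB/s:

$$\mathrm{avgBw} \;=\; \frac{\mathrm{numBytesActual}/10^{9}}{t_{\mathrm{avg}}\ [\mathrm{ms}]} \times 1000 \quad [\mathrm{GB/s}]$$

The new avgBwSum / avgCount arrays then accumulate these per-pair figures into one running sum per (srcType, dstType) class, skipping self-copies, so a single CPU->CPU / CPU->GPU / GPU->CPU / GPU->GPU average can be printed after the table.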
@@ -1583,6 +1598,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
         for (int dst = 0; dst < numDevices; dst++)
         {
+          if (dst == numCpus && dst != 0) printf(" ");
           double const avgBw = avgBandwidth[dir][dst];
           if (avgBw == 0.0)
@@ -1601,6 +1617,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
         for (int i = 0; i < numDevices; i++)
         {
           double const minBw = minBandwidth[dir][i];
+          if (i == numCpus && i != 0) printf(" ");
           if (minBw == 0.0)
             printf("%10s", "N/A");
           else
@@ -1615,6 +1632,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
         for (int i = 0; i < numDevices; i++)
         {
           double const maxBw = maxBandwidth[dir][i];
+          if (i == numCpus && i != 0) printf(" ");
           if (maxBw == 0.0)
             printf("%10s", "N/A");
           else
@@ -1629,6 +1647,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
         for (int i = 0; i < numDevices; i++)
         {
           double const sd = stdDev[dir][i];
+          if (i == numCpus && i != 0) printf(" ");
           if (sd == -1.0)
             printf("%10s", "N/A");
           else
@@ -1647,16 +1666,37 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
       for (int dst = 0; dst < numDevices; dst++)
       {
         double const sumBw = avgBandwidth[0][dst] + avgBandwidth[1][dst];
+        if (dst == numCpus && dst != 0) printf(" ");
         if (sumBw == 0.0)
           printf("%10s", "N/A");
         else
           printf("%10.2f", sumBw);
         if (ev.outputToCsv) printf(",");
       }
-      if (src < numDevices - 1) printf("\n\n");
+      printf("\n");
+      if (src < numDevices - 1) printf("\n");
     }
   }
+  printf("\n");
+  if (!ev.outputToCsv)
+  {
+    printf(" ");
+    for (int srcType : {MEM_CPU, MEM_GPU})
+      for (int dstType : {MEM_CPU, MEM_GPU})
+        printf(" %cPU->%cPU", srcType == MEM_CPU ? 'C' : 'G', dstType == MEM_CPU ? 'C' : 'G');
+    printf("\n");
+    printf("Averages (During %s):", isBidirectional ? " BiDir" : "UniDir");
+    for (int srcType : {MEM_CPU, MEM_GPU})
+      for (int dstType : {MEM_CPU, MEM_GPU})
+      {
+        if (avgCount[srcType][dstType])
+          printf("%10.2f", avgBwSum[srcType][dstType] / avgCount[srcType][dstType]);
+        else
+          printf("%10s", "N/A");
+      }
+    printf("\n\n");
+  }
 }
}
@@ -1732,7 +1772,7 @@ void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int co
 void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const numSubExecs)
 {
-  ev.DisplayEnvVars();
+  ev.DisplayA2AEnvVars();

   // Collect the number of GPU devices to use
   int const numGpus = ev.numGpuDevices;
@@ -1763,14 +1803,27 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
     for (int j = 0; j < numGpus; j++)
     {
       transfer.dstIndex[0] = j;
+      if (ev.a2aDirect)
+      {
+#if !defined(__NVCC__)
+        if (i == j) continue;
+        uint32_t linkType, hopCount;
+        HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(i, false),
+                                              RemappedIndex(j, false),
+                                              &linkType, &hopCount));
+        if (hopCount != 1) continue;
+#endif
+      }
       transfers.push_back(transfer);
     }
   }

   printf("GPU-GFX All-To-All benchmark:\n");
   printf("==========================\n");
-  printf("- Copying %lu bytes between every pair of GPUs using %d CUs\n", numBytesPerTransfer, numSubExecs);
-  printf("- All numbers reported as GB/sec\n\n");
+  printf("- Copying %lu bytes between %s pairs of GPUs using %d CUs (%lu Transfers)\n",
+         numBytesPerTransfer, ev.a2aDirect ? "directly connected" : "all", numSubExecs, transfers.size());
+  if (transfers.size() == 0) return;

   double totalBandwidthCpu = 0;
   ExecuteTransfers(ev, 0, numBytesPerTransfer / sizeof(float), transfers, true, &totalBandwidthCpu);
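The a2aDirect filter above relies on hipExtGetLinkTypeAndHopCount, which reports the link type and hop count between two devices; a hop count of 1 indicates a direct connection (on NVCC builds the filter compiles out). A standalone sketch of the same idea, illustrative only, without TransferBench's RemappedIndex device remapping or its HIP_CALL error-check macro:

```cpp
// Minimal sketch: list directly connected GPU pairs via ROCm's
// hipExtGetLinkTypeAndHopCount. Error handling is simplified on purpose.
#include <hip/hip_runtime.h>
#include <cstdio>

int main()
{
  int numGpus = 0;
  hipGetDeviceCount(&numGpus);
  for (int i = 0; i < numGpus; i++)
    for (int j = 0; j < numGpus; j++)
    {
      if (i == j) continue;
      uint32_t linkType = 0, hopCount = 0;
      // One hop between devices means a direct link (e.g. XGMI).
      if (hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount) == hipSuccess &&
          hopCount == 1)
        printf("GPU %02d -> GPU %02d: direct link (type %u)\n", i, j, linkType);
    }
  return 0;
}
```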
@@ -1780,21 +1833,52 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
   printf("SRC\\DST");
   for (int dst = 0; dst < numGpus; dst++)
     printf("%cGPU %02d ", separator, dst);
-  printf("\n");
+  printf(" %cSTotal\n", separator);

+  std::map<std::pair<int, int>, int> reIndex;
+  for (int i = 0; i < transfers.size(); i++)
+  {
+    Transfer const& t = transfers[i];
+    reIndex[std::make_pair(t.srcIndex[0], t.dstIndex[0])] = i;
+  }
+
+  double totalBandwidthGpu = 0.0;
+  std::vector<double> colTotalBandwidth(numGpus+1, 0.0);
   for (int src = 0; src < numGpus; src++)
   {
+    double rowTotalBandwidth = 0;
     printf("GPU %02d", src);
     for (int dst = 0; dst < numGpus; dst++)
     {
-      Transfer const& transfer = transfers[src * numGpus + dst];
-      double transferDurationMsec = transfer.transferTime / (1.0 * ev.numIterations);
-      double transferBandwidthGbs = (transfer.numBytesActual / 1.0E9) / transferDurationMsec * 1000.0f;
-      printf("%c%7.2f ", separator, transferBandwidthGbs);
+      if (reIndex.count(std::make_pair(src, dst)))
+      {
+        Transfer const& transfer = transfers[reIndex[std::make_pair(src,dst)]];
+        double transferDurationMsec = transfer.transferTime / (1.0 * ev.numIterations);
+        double transferBandwidthGbs = (transfer.numBytesActual / 1.0E9) / transferDurationMsec * 1000.0f;
+        colTotalBandwidth[dst] += transferBandwidthGbs;
+        rowTotalBandwidth += transferBandwidthGbs;
+        totalBandwidthGpu += transferBandwidthGbs;
+        printf("%c%7.2f ", separator, transferBandwidthGbs);
+      }
+      else
+      {
+        printf("%c%7s ", separator, "N/A");
+      }
     }
-    printf("\n");
+    printf(" %c%7.2f\n", separator, rowTotalBandwidth);
+    colTotalBandwidth[numGpus] += rowTotalBandwidth;
   }
-  printf("Aggregate bandwidth (CPU Timed): %7.2f\n", totalBandwidthCpu);
+  printf("\nRTotal");
+  for (int dst = 0; dst < numGpus; dst++)
+  {
+    printf("%c%7.2f ", separator, colTotalBandwidth[dst]);
+  }
+  printf(" %c%7.2f\n", separator, colTotalBandwidth[numGpus]);
+  printf("\n");
+  printf("Average bandwidth (GPU Timed):   %7.2f GB/s\n", totalBandwidthGpu / transfers.size());
+  printf("Aggregate bandwidth (GPU Timed): %7.2f GB/s\n", totalBandwidthGpu);
+  printf("Aggregate bandwidth (CPU Timed): %7.2f GB/s\n", totalBandwidthCpu);
 }

 void Transfer::PrepareSubExecParams(EnvVars const& ev)
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #include "Compatibility.hpp"
 #include "Kernels.hpp"

-#define TB_VERSION "1.27"
+#define TB_VERSION "1.28"

 extern char const MemTypeStr[];
 extern char const ExeTypeStr[];
@@ -92,6 +92,7 @@ public:
   int p2pMode;        // Both = 0, Unidirectional = 1, Bidirectional = 2
   int useDmaCopy;     // Use DMA copy instead of GPU copy
   int useRemoteRead;  // Use destination memory type as executor instead of source memory type
+  int useFineGrain;   // Use fine-grained memory

   // Environment variables only for Sweep-preset
   int sweepMin;       // Min number of simultaneous Transfers to be executed per test
@@ -106,6 +107,9 @@ public:
   std::string sweepExe;  // Set of executors to be swept
   std::string sweepDst;  // Set of dst memory types to be swept

+  // Environment variables only for A2A preset
+  int a2aDirect;         // Only execute on links that are directly connected
+
   // Developer features
   int enableDebug;       // Enable debug output
   int gpuKernel;         // Which GPU kernel to use
@@ -170,11 +174,13 @@ public:
     gpuKernel      = GetEnvVar("GPU_KERNEL"      , defaultGpuKernel);

     // P2P Benchmark related
-    useRemoteRead  = GetEnvVar("USE_REMOTE_READ" , 0);
-    useDmaCopy     = GetEnvVar("USE_GPU_DMA"     , 0);
-    numGpuSubExecs = GetEnvVar("NUM_GPU_SE"      , useDmaCopy ? 1 : numDeviceCUs);
+    useDmaCopy     = GetEnvVar("USE_GPU_DMA"     , 0); // Needed for numGpuSubExec
     numCpuSubExecs = GetEnvVar("NUM_CPU_SE"      , DEFAULT_P2P_NUM_CPU_SE);
+    numGpuSubExecs = GetEnvVar("NUM_GPU_SE"      , useDmaCopy ? 1 : numDeviceCUs);
     p2pMode        = GetEnvVar("P2P_MODE"        , 0);
+    useRemoteRead  = GetEnvVar("USE_REMOTE_READ" , 0);
+    useFineGrain   = GetEnvVar("USE_FINE_GRAIN"  , 0);

     // Sweep related
     sweepMin       = GetEnvVar("SWEEP_MIN"       , DEFAULT_SWEEP_MIN);
@@ -188,6 +194,9 @@ public:
     sweepXgmiMax   = GetEnvVar("SWEEP_XGMI_MAX"  , -1);
     sweepRandBytes = GetEnvVar("SWEEP_RAND_BYTES", 0);

+    // A2A Benchmark related
+    a2aDirect      = GetEnvVar("A2A_DIRECT"      , 1);
+
     // Determine random seed
     char *sweepSeedStr = getenv("SWEEP_SEED");
     sweepSeed = (sweepSeedStr != NULL ? atoi(sweepSeedStr) : time(NULL));
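All of these knobs follow the same getenv-with-a-default pattern, which is why A2A_DIRECT silently defaults to 1 when unset. A hedged sketch of the idiom (an illustrative reimplementation, not TransferBench's actual GetEnvVar helper, which is declared further below):

```cpp
// Sketch of the env-var-with-default idiom used throughout EnvVars.
// GetEnvVarSketch is a hypothetical name for illustration.
#include <cstdlib>

static int GetEnvVarSketch(char const* varname, int defaultValue)
{
  char const* value = std::getenv(varname);
  return value ? std::atoi(value) : defaultValue;  // unset -> default
}

// e.g. int a2aDirect = GetEnvVarSketch("A2A_DIRECT", 1);
```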
@@ -517,6 +526,9 @@ public:
              std::string("Running ") + (p2pMode == 1 ? "Unidirectional" :
                                         p2pMode == 2 ? "Bidirectional" :
                                                        "Unidirectional + Bidirectional"));
+    PRINT_EV("USE_FINE_GRAIN", useFineGrain,
+             std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
     PRINT_EV("USE_GPU_DMA", useDmaCopy,
              std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor");
     PRINT_EV("USE_REMOTE_READ", useRemoteRead,
@@ -557,6 +569,17 @@ public:
     printf("\n");
   }

+  void DisplayA2AEnvVars() const
+  {
+    DisplayEnvVars();
+    if (hideEnv) return;
+    if (!outputToCsv)
+      printf("[AllToAll Related]\n");
+    PRINT_EV("A2A_DIRECT", a2aDirect,
+             std::string(a2aDirect ? "Only using direct links" : "Full all-to-all"));
+    printf("\n");
+  }
+
   // Helper function that parses an environment variable or falls back to a default value
   static int GetEnvVar(std::string const& varname, int defaultValue)
   {
@@ -219,8 +219,8 @@ GpuReduceKernel(SubExecParam* params)
   __syncthreads();

   if (threadIdx.x == 0)
   {
-    p.startCycle = startCycle;
     p.stopCycle  = wall_clock64();
+    p.startCycle = startCycle;
     __trace_hwreg();
   }
 }
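The reorder above makes thread 0 read wall_clock64() before storing startCycle, so the stop timestamp is captured as early as possible once the work is done, instead of after an extra global store. A minimal device-side timing sketch using the same clock (illustrative; the kernel name and buffers are hypothetical, not TransferBench code):

```cpp
// Minimal sketch: timing device-side work with wall_clock64(), the
// constant-frequency device wall clock used by TransferBench's kernels.
#include <hip/hip_runtime.h>

__global__ void timedCopy(float* dst, float const* src, size_t n, long long* cycles)
{
  long long const start = wall_clock64();          // stamp before the work
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += (size_t)blockDim.x * gridDim.x)
    dst[i] = src[i];
  __syncthreads();
  if (threadIdx.x == 0)
    cycles[blockIdx.x] = wall_clock64() - start;   // read the stop stamp first
}
```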