Unverified Commit 93e9d262 authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Benchmarks: Micro benchmark - Add simultaneous all-to-host / host-to-all...

Benchmarks: Micro benchmark - Add simultaneous all-to-host / host-to-all bandwidth testcases to nvbandwidth (#736)

**Description**
Add simultaneous all-to-host / host-to-all bandwidth testcases to
nvbandwidth.

**Major Revision**
- nvbandwidth.patch: Add simultaneous all-to-host / host-to-all
bandwidth testcases to nvbandwidth
- Upgrade the nvbandwidth submodule to v0.8
- Apply the patch as part of the Makefile build
parent 76066b6d
......@@ -299,5 +299,5 @@ endif
# Build nvbandwidth.
nvbandwidth: sb_micro_path
cd ./nvbandwidth && cmake . && make && cd ..
cd ./nvbandwidth && git apply ../nvbandwidth.patch && cp ../nvbandwidth_testcases_patched.h ./testcases_patched.h && cmake . && make && cd ..
cp -v ./nvbandwidth/nvbandwidth $(SB_MICRO_PATH)/bin
Subproject commit 445d8aef742e8a48a69779a939996f9e8863df9d
Subproject commit fb851de841a0b1fb261cbc3a6fe626f17a19ba0f
diff --git a/nvbandwidth.cpp b/nvbandwidth.cpp
index 61a228f..488372a 100644
--- a/nvbandwidth.cpp
+++ b/nvbandwidth.cpp
@@ -29,6 +29,7 @@
#include "kernels.cuh"
#include "output.h"
#include "testcase.h"
+#include "testcases_patched.h"
#include "version.h"
#include "inline_common.h"
@@ -73,8 +74,10 @@ std::vector<Testcase*> createTestcases() {
new DeviceToDeviceBidirWriteCE(),
new AllToHostCE(),
new AllToHostBidirCE(),
+ new AllToHostBatchCE(),
new HostToAllCE(),
new HostToAllBidirCE(),
+ new HostToAllBatchCE(),
new AllToOneWriteCE(),
new AllToOneReadCE(),
new OneToAllWriteCE(),
@@ -89,8 +92,10 @@ std::vector<Testcase*> createTestcases() {
new DeviceToDeviceBidirWriteSM(),
new AllToHostSM(),
new AllToHostBidirSM(),
+ new AllToHostBatchSM(),
new HostToAllSM(),
new HostToAllBidirSM(),
+ new HostToAllBatchSM(),
new AllToOneWriteSM(),
new AllToOneReadSM(),
new OneToAllWriteSM(),
diff --git a/testcase.h b/testcase.h
index c276850..f26e7d8 100644
--- a/testcase.h
+++ b/testcase.h
@@ -39,6 +39,7 @@ class Testcase {
void oneToAllHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool isRead);
void allHostHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost);
void allHostBidirHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost);
+ void allHostHelperBatch(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost);
void latencyHelper(const MemcpyBuffer &dataBuffer, bool measureDeviceToDeviceLatency);
public:
// Copyright(c) Microsoft Corporation.
// Licensed under the MIT License.
#include <memory>
#include <vector>

#include "common.h"
#include "inline_common.h"
#include "output.h"
#include "testcase.h"
/**
 * Batch "all to host" testcase for the CE (cuMemcpyAsync) copy path.
 *
 * The constructor hands the identifier string "all_to_host_batch_memcpy_ce"
 * and the description text below to the Testcase base class. The measurement
 * itself lives in run(), which is defined out of line.
 */
class AllToHostBatchCE : public Testcase {
 public:
    AllToHostBatchCE()
        : Testcase(
              "all_to_host_batch_memcpy_ce",
              "\tMeasures bandwidth of cuMemcpyAsync from all devices to host simultaneously.\n"
              "\tAll devices perform memcpy operations concurrently with the same buffer size.\n"
              "\tIndividual device bandwidths are measured and reported separately.") {}

    virtual ~AllToHostBatchCE() = default;

    /// Executes the testcase; `size` and `loopCount` are forwarded to the copy machinery.
    void run(unsigned long long size, unsigned long long loopCount);
};
/**
 * Batch "host to all" testcase for the CE (cuMemcpyAsync) copy path.
 *
 * The constructor hands the identifier string "host_to_all_batch_memcpy_ce"
 * and the description text below to the Testcase base class. The measurement
 * itself lives in run(), which is defined out of line.
 */
class HostToAllBatchCE : public Testcase {
 public:
    HostToAllBatchCE()
        : Testcase(
              "host_to_all_batch_memcpy_ce",
              "\tMeasures bandwidth of cuMemcpyAsync from host to all devices simultaneously.\n"
              "\tAll devices perform memcpy operations concurrently with the same buffer size.\n"
              "\tIndividual device bandwidths are measured and reported separately.") {}

    virtual ~HostToAllBatchCE() = default;

    /// Executes the testcase; `size` and `loopCount` are forwarded to the copy machinery.
    void run(unsigned long long size, unsigned long long loopCount);
};
/**
 * Batch "all to host" testcase for the SM (copy kernel) path.
 *
 * The constructor hands the identifier string "all_to_host_batch_memcpy_sm"
 * and the description text below to the Testcase base class. The measurement
 * itself lives in run(), which is defined out of line.
 */
class AllToHostBatchSM : public Testcase {
 public:
    AllToHostBatchSM()
        : Testcase(
              "all_to_host_batch_memcpy_sm",
              "\tMeasures bandwidth of copy kernels from all devices to host simultaneously.\n"
              "\tAll devices perform memcpy operations concurrently with the same buffer size.\n"
              "\tIndividual device bandwidths are measured and reported separately.") {}

    virtual ~AllToHostBatchSM() = default;

    /// Executes the testcase; `size` and `loopCount` are forwarded to the copy machinery.
    void run(unsigned long long size, unsigned long long loopCount);
};
/**
 * Batch "host to all" testcase for the SM (copy kernel) path.
 *
 * The constructor hands the identifier string "host_to_all_batch_memcpy_sm"
 * and the description text below to the Testcase base class. The measurement
 * itself lives in run(), which is defined out of line.
 */
class HostToAllBatchSM : public Testcase {
 public:
    HostToAllBatchSM()
        : Testcase(
              "host_to_all_batch_memcpy_sm",
              "\tMeasures bandwidth of copy kernels from host to all devices simultaneously.\n"
              "\tAll devices perform memcpy operations concurrently with the same buffer size.\n"
              "\tIndividual device bandwidths are measured and reported separately.") {}

    virtual ~HostToAllBatchSM() = default;

    /// Executes the testcase; `size` and `loopCount` are forwarded to the copy machinery.
    void run(unsigned long long size, unsigned long long loopCount);
};
void Testcase::allHostHelperBatch(unsigned long long size, MemcpyOperation &memcpyInstance,
PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost) {
std::vector<const MemcpyBuffer *> allSrcBuffers;
std::vector<const MemcpyBuffer *> allDstBuffers;
// Create buffers for all devices with the same size
for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
if (sourceIsHost) {
allSrcBuffers.push_back(new HostBuffer(size, deviceId));
allDstBuffers.push_back(new DeviceBuffer(size, deviceId));
} else {
allSrcBuffers.push_back(new DeviceBuffer(size, deviceId));
allDstBuffers.push_back(new HostBuffer(size, deviceId));
}
}
// Perform memcpy for all devices in a single run and get individual bandwidths
std::vector<double> deviceBandwidths = memcpyInstance.doMemcpyVector(allSrcBuffers, allDstBuffers);
// Store individual bandwidth for each device
for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
bandwidthValues.value(0, deviceId) = deviceBandwidths[deviceId];
}
// Clean up all buffers
for (auto node : allSrcBuffers) {
delete node;
}
for (auto node : allDstBuffers) {
delete node;
}
}
void AllToHostBatchCE::run(unsigned long long size, unsigned long long loopCount) {
PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);
allHostHelperBatch(size, memcpyInstance, bandwidthValues, false);
output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) <- GPU(column) batch bandwidth (GB/s)");
}
void HostToAllBatchCE::run(unsigned long long size, unsigned long long loopCount) {
PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);
allHostHelperBatch(size, memcpyInstance, bandwidthValues, true);
output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) -> GPU(column) batch bandwidth (GB/s)");
}
void AllToHostBatchSM::run(unsigned long long size, unsigned long long loopCount) {
PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);
allHostHelperBatch(size, memcpyInstance, bandwidthValues, false);
output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) <- GPU(column) batch bandwidth (GB/s)");
}
void HostToAllBatchSM::run(unsigned long long size, unsigned long long loopCount) {
PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);
allHostHelperBatch(size, memcpyInstance, bandwidthValues, true);
output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) -> GPU(column) batch bandwidth (GB/s)");
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment