Commit b389fac3 authored by Ramesh Errabolu's avatar Ramesh Errabolu
Browse files

Changes to support Concurrent copies

parent f5ee8791
...@@ -159,6 +159,88 @@ bool RocmBandwidthTest::ValidateDstBuffer(size_t max_size, size_t curr_size, voi ...@@ -159,6 +159,88 @@ bool RocmBandwidthTest::ValidateDstBuffer(size_t max_size, size_t curr_size, voi
return (err_ == HSA_STATUS_SUCCESS); return (err_ == HSA_STATUS_SUCCESS);
} }
void RocmBandwidthTest::AllocateConcurrentCopyResources(bool bidir,
vector<async_trans_t>& trans_list,
vector<void*>& buf_list,
vector<hsa_agent_t>& dev_list,
vector<uint32_t>& dev_idx_list,
vector<hsa_signal_t>& sig_list,
vector<hsa_amd_memory_pool_t>& pool_list) {
// Number of Unidirectional or Bidirectional
// Concurrent Copy transactions in user request
uint32_t trans_cnt = trans_list.size();
size_t max_size = size_list_.back();
// Common variables used in different loops
void* buf_src;
void* buf_dst;
uint32_t src_idx;
uint32_t dst_idx;
hsa_signal_t signal;
hsa_agent_t src_dev;
hsa_agent_t dst_dev;
uint32_t src_dev_idx;
uint32_t dst_dev_idx;
hsa_amd_memory_pool_t src_pool;
hsa_amd_memory_pool_t dst_pool;
// Allocate buffers for the various transactions
for (uint32_t idx = 0; idx < trans_cnt; idx++) {
async_trans_t& trans = trans_list[idx];
src_idx = trans.copy.src_idx_;
dst_idx = trans.copy.dst_idx_;
src_pool = trans.copy.src_pool_;
dst_pool = trans.copy.dst_pool_;
src_dev = pool_list_[src_idx].owner_agent_;
dst_dev = pool_list_[dst_idx].owner_agent_;
src_dev_idx = pool_list_[src_idx].agent_index_;
dst_dev_idx = pool_list_[dst_idx].agent_index_;
// Allocate buffers and signal for forward copy operation
AllocateCopyBuffers(max_size,
buf_src, src_pool,
buf_dst, dst_pool);
err_ = hsa_signal_create(1, 0, NULL, &signal);
ErrorCheck(err_);
// Acquire access to destination buffers
AcquirePoolAcceses(src_dev_idx, src_dev, buf_src,
dst_dev_idx, dst_dev, buf_dst);
sig_list.push_back(signal);
buf_list.push_back(buf_src);
buf_list.push_back(buf_dst);
dev_list.push_back(src_dev);
dev_list.push_back(dst_dev);
dev_idx_list.push_back(src_dev_idx);
dev_idx_list.push_back(dst_dev_idx);
// For bidirectional copies allocate buffers
// and signal for reverse direction as well
if (bidir) {
AllocateCopyBuffers(max_size,
buf_src, dst_pool,
buf_dst, src_pool);
err_ = hsa_signal_create(1, 0, NULL, &signal);
ErrorCheck(err_);
// Acquire access to destination buffers
AcquirePoolAcceses(dst_dev_idx, dst_dev, buf_src,
src_dev_idx, src_dev, buf_dst);
sig_list.push_back(signal);
buf_list.push_back(buf_src);
buf_list.push_back(buf_dst);
dev_list.push_back(dst_dev);
dev_list.push_back(src_dev);
dev_idx_list.push_back(dst_dev_idx);
dev_idx_list.push_back(src_dev_idx);
}
}
}
void RocmBandwidthTest::AllocateCopyBuffers(size_t size, void RocmBandwidthTest::AllocateCopyBuffers(size_t size,
void*& src, hsa_amd_memory_pool_t src_pool, void*& src, hsa_amd_memory_pool_t src_pool,
void*& dst, hsa_amd_memory_pool_t dst_pool) { void*& dst, hsa_amd_memory_pool_t dst_pool) {
...@@ -231,7 +313,6 @@ void RocmBandwidthTest::WaitForCopyCompletion(vector<hsa_signal_t>& signal_list) ...@@ -231,7 +313,6 @@ void RocmBandwidthTest::WaitForCopyCompletion(vector<hsa_signal_t>& signal_list)
uint32_t size = signal_list.size(); uint32_t size = signal_list.size();
for (uint32_t idx = 0; idx < size; idx++) { for (uint32_t idx = 0; idx < size; idx++) {
hsa_signal_t signal = signal_list[idx]; hsa_signal_t signal = signal_list[idx];
// Wait for copy operation to complete
while (hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, while (hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT,
1, uint64_t(-1), policy)); 1, uint64_t(-1), policy));
} }
...@@ -252,6 +333,128 @@ void RocmBandwidthTest::copy_buffer(void* dst, hsa_agent_t dst_agent, ...@@ -252,6 +333,128 @@ void RocmBandwidthTest::copy_buffer(void* dst, hsa_agent_t dst_agent,
uint64_t(-1), HSA_WAIT_STATE_ACTIVE)); uint64_t(-1), HSA_WAIT_STATE_ACTIVE));
} }
void RocmBandwidthTest::RunConcurrentCopyBenchmark(bool bidir,
vector<async_trans_t>& trans_list) {
// Number of Unidirectional or Bidirectional
// Concurrent Copy transactions in user request
uint32_t trans_cnt = trans_list.size();
size_t max_size = size_list_.back();
uint32_t size_len = size_list_.size();
// Lists of buffers, pools, agents and signals
// used to run copy requests
vector<void*> buf_list;
vector<hsa_agent_t> dev_list;
vector<uint32_t> dev_idx_list;
vector<hsa_signal_t> sig_list;
vector<hsa_amd_memory_pool_t> pool_list;
// Allocate resources for the various transactions
AllocateConcurrentCopyResources(bidir, trans_list,
buf_list, dev_list,
dev_idx_list, sig_list, pool_list);
// Common variables used in different loops
void* buf_src;
void* buf_dst;
hsa_agent_t src_dev;
hsa_agent_t dst_dev;
hsa_signal_t signal;
// Signa to trigger all copy requests to wait
// until allowed to begin
hsa_signal_t sig_grp_start;
err_ = hsa_signal_create(1, 0, NULL, &sig_grp_start);
ErrorCheck(err_);
// Bind the number of iterations
uint32_t iterations = GetIterationNum();
// Iterate through the differnt buffer sizes to
// compute the bandwidth as determined by copy
for (uint32_t idx = 0; idx < size_len; idx++) {
// This should not be happening
size_t curr_size = size_list_[idx];
if (curr_size > max_size) {
break;
}
std::vector< std::vector<double> > gpu_time_list(trans_cnt, std::vector<double>());
for (uint32_t it = 0; it < iterations; it++) {
if (it % 2) {
printf(".");
fflush(stdout);
}
// Set group trigger signal
hsa_signal_store_relaxed(sig_grp_start, 1);
// Update signal value to one before submitting copy requests
uint32_t sig_idx = 0;
uint32_t sig_cnt = sig_list.size();
for (sig_idx = 0; sig_idx < sig_cnt; sig_idx++) {
signal = sig_list[sig_idx];
hsa_signal_store_relaxed(signal, 1);
}
// Submit copy operations in batch mode
uint32_t rsrc_idx = 0;
uint32_t cpy_cnt = (bidir) ? (trans_cnt * 2) : trans_cnt;
for (uint32_t cpy_idx = 0; cpy_idx < cpy_cnt; cpy_idx++) {
sig_idx = cpy_idx;
rsrc_idx = cpy_idx * 2;
signal = sig_list[sig_idx + 0];
buf_src = buf_list[rsrc_idx + 0];
buf_dst = buf_list[rsrc_idx + 1];
src_dev = dev_list[rsrc_idx + 0];
dst_dev = dev_list[rsrc_idx + 1];
err_ = hsa_amd_memory_async_copy(buf_dst, dst_dev,
buf_src, src_dev, curr_size,
1, &sig_grp_start, signal);
ErrorCheck(err_);
}
// Set group trigger signal
hsa_signal_store_relaxed(sig_grp_start, 0);
// Wait for the copy operations to complete
WaitForCopyCompletion(sig_list);
// Retrieve times for each copy operation
hsa_signal_t signal_rev;
for (uint32_t tidx = 0; tidx < trans_cnt; tidx++) {
sig_idx = (bidir) ? (tidx * 2) : (tidx);
signal = sig_list[sig_idx + 0];
signal_rev = (bidir) ? (sig_list[sig_idx + 1]) : signal;
double temp = GetGpuCopyTime(bidir, signal, signal_rev);
std::vector<double>& gpu_time = gpu_time_list[tidx];
gpu_time.push_back(temp);
}
}
// Update time taken to copy a particular size
// Get Gpu min and mean copy times
for (uint32_t tidx = 0; tidx < trans_cnt; tidx++) {
async_trans_t& trans = trans_list[tidx];
std::vector<double>& gpu_time = gpu_time_list[tidx];
double min_time = GetMinTime(gpu_time);
double mean_time = GetMeanTime(gpu_time);
trans.gpu_min_time_.push_back(min_time);
trans.gpu_avg_time_.push_back(mean_time);
gpu_time.clear();
}
}
// Free up buffers and signal objects used in copy operation
sig_list.push_back(sig_grp_start);
ReleaseSignals(sig_list);
ReleaseBuffers(buf_list);
}
void RocmBandwidthTest::RunCopyBenchmark(async_trans_t& trans) { void RocmBandwidthTest::RunCopyBenchmark(async_trans_t& trans) {
// Bind if this transaction is bidirectional // Bind if this transaction is bidirectional
...@@ -458,6 +661,16 @@ void RocmBandwidthTest::Run() { ...@@ -458,6 +661,16 @@ void RocmBandwidthTest::Run() {
ErrorCheck(err_); ErrorCheck(err_);
} }
if ((req_concurrent_copy_bidir_ == REQ_CONCURRENT_COPY_BIDIR) ||
(req_concurrent_copy_unidir_ == REQ_CONCURRENT_COPY_UNIDIR)) {
bool bidir = (req_concurrent_copy_bidir_ == REQ_CONCURRENT_COPY_BIDIR);
RunConcurrentCopyBenchmark(bidir, trans_list_);
ComputeCopyTime(trans_list_);
err_ = hsa_amd_profiling_async_copy_enable(false);
ErrorCheck(err_);
return;
}
// Iterate through the list of transactions and execute them // Iterate through the list of transactions and execute them
uint32_t trans_size = trans_list_.size(); uint32_t trans_size = trans_list_.size();
for (uint32_t idx = 0; idx < trans_size; idx++) { for (uint32_t idx = 0; idx < trans_size; idx++) {
...@@ -474,7 +687,6 @@ void RocmBandwidthTest::Run() { ...@@ -474,7 +687,6 @@ void RocmBandwidthTest::Run() {
RunIOBenchmark(trans); RunIOBenchmark(trans);
} }
} }
std::cout << std::endl;
// Disable profiling of Async Copy Activity // Disable profiling of Async Copy Activity
if (print_cpu_time_ == false) { if (print_cpu_time_ == false) {
...@@ -535,6 +747,8 @@ RocmBandwidthTest::RocmBandwidthTest(int argc, char** argv) : BaseTest() { ...@@ -535,6 +747,8 @@ RocmBandwidthTest::RocmBandwidthTest(int argc, char** argv) : BaseTest() {
req_copy_unidir_ = REQ_INVALID; req_copy_unidir_ = REQ_INVALID;
req_copy_all_bidir_ = REQ_INVALID; req_copy_all_bidir_ = REQ_INVALID;
req_copy_all_unidir_ = REQ_INVALID; req_copy_all_unidir_ = REQ_INVALID;
req_concurrent_copy_bidir_ = REQ_INVALID;
req_concurrent_copy_unidir_ = REQ_INVALID;
access_matrix_ = NULL; access_matrix_ = NULL;
link_type_matrix_ = NULL; link_type_matrix_ = NULL;
......
...@@ -172,7 +172,9 @@ typedef enum Request_Type { ...@@ -172,7 +172,9 @@ typedef enum Request_Type {
REQ_COPY_UNIDIR = 6, REQ_COPY_UNIDIR = 6,
REQ_COPY_ALL_BIDIR = 7, REQ_COPY_ALL_BIDIR = 7,
REQ_COPY_ALL_UNIDIR = 8, REQ_COPY_ALL_UNIDIR = 8,
REQ_INVALID = 9, REQ_CONCURRENT_COPY_BIDIR = 9,
REQ_CONCURRENT_COPY_UNIDIR = 10,
REQ_INVALID = 11,
} Request_Type; } Request_Type;
...@@ -247,14 +249,18 @@ class RocmBandwidthTest : public BaseTest { ...@@ -247,14 +249,18 @@ class RocmBandwidthTest : public BaseTest {
// @brief: Run copy requests of users // @brief: Run copy requests of users
void RunCopyBenchmark(async_trans_t& trans); void RunCopyBenchmark(async_trans_t& trans);
// @brief: Run concurrent copy requests of users
void RunConcurrentCopyBenchmark(bool bidir,
vector<async_trans_t>& trans_list);
// @brief: Get iteration number // @brief: Get iteration number
uint32_t GetIterationNum(); uint32_t GetIterationNum();
// @brief: Get the mean copy time // @brief: Get the mean copy time
double GetMeanTime(std::vector<double>& vec); double GetMeanTime(vector<double>& vec);
// @brief: Get the min copy time // @brief: Get the min copy time
double GetMinTime(std::vector<double>& vec); double GetMinTime(vector<double>& vec);
// @brief: Display Benchmark result // @brief: Display Benchmark result
void PopulatePerfMatrix(bool peak, double* perf_matrix) const; void PopulatePerfMatrix(bool peak, double* perf_matrix) const;
...@@ -280,6 +286,7 @@ class RocmBandwidthTest : public BaseTest { ...@@ -280,6 +286,7 @@ class RocmBandwidthTest : public BaseTest {
bool ValidateBidirCopyReq(); bool ValidateBidirCopyReq();
bool ValidateUnidirCopyReq(); bool ValidateUnidirCopyReq();
bool ValidateConcurrentCopyReq();
bool ValidateCopyReq(vector<size_t>& in_list); bool ValidateCopyReq(vector<size_t>& in_list);
void PrintIOAccessError(uint32_t agent_idx, uint32_t pool_idx); void PrintIOAccessError(uint32_t agent_idx, uint32_t pool_idx);
void PrintCopyAccessError(uint32_t src_pool_idx, uint32_t dst_pool_idx); void PrintCopyAccessError(uint32_t src_pool_idx, uint32_t dst_pool_idx);
...@@ -289,6 +296,7 @@ class RocmBandwidthTest : public BaseTest { ...@@ -289,6 +296,7 @@ class RocmBandwidthTest : public BaseTest {
// @brief: Builds a list of transaction per user request // @brief: Builds a list of transaction per user request
void ComputeCopyTime(async_trans_t& trans); void ComputeCopyTime(async_trans_t& trans);
void ComputeCopyTime(vector<async_trans_t>& trans_list);
void BuildDeviceList(); void BuildDeviceList();
void BuildBufferList(); void BuildBufferList();
bool BuildTransList(); bool BuildTransList();
...@@ -303,6 +311,8 @@ class RocmBandwidthTest : public BaseTest { ...@@ -303,6 +311,8 @@ class RocmBandwidthTest : public BaseTest {
bool BuildCopyTrans(uint32_t req_type, bool BuildCopyTrans(uint32_t req_type,
vector<size_t>& src_list, vector<size_t>& src_list,
vector<size_t>& dst_list); vector<size_t>& dst_list);
bool BuildConcurrentCopyTrans(uint32_t req_type,
vector<size_t>& dev_list);
void WaitForCopyCompletion(vector<hsa_signal_t>& signal_list); void WaitForCopyCompletion(vector<hsa_signal_t>& signal_list);
...@@ -310,8 +320,16 @@ class RocmBandwidthTest : public BaseTest { ...@@ -310,8 +320,16 @@ class RocmBandwidthTest : public BaseTest {
void*& src, hsa_amd_memory_pool_t src_pool, void*& src, hsa_amd_memory_pool_t src_pool,
void*& dst, hsa_amd_memory_pool_t dst_pool); void*& dst, hsa_amd_memory_pool_t dst_pool);
void ReleaseBuffers(std::vector<void*>& buffer_list); void AllocateConcurrentCopyResources(bool bidir,
void ReleaseSignals(std::vector<hsa_signal_t>& signal_list); vector<async_trans_t>& trans_list,
vector<void*>& buffer_list,
vector<hsa_agent_t>& dev_list,
vector<uint32_t>& dev_idx_list,
vector<hsa_signal_t>& sig_list,
vector<hsa_amd_memory_pool_t>& pool_list);
void ReleaseBuffers(vector<void*>& buffer_list);
void ReleaseSignals(vector<hsa_signal_t>& signal_list);
double GetGpuCopyTime(bool bidir, hsa_signal_t signal_fwd, hsa_signal_t signal_rev); double GetGpuCopyTime(bool bidir, hsa_signal_t signal_fwd, hsa_signal_t signal_rev);
...@@ -329,7 +347,7 @@ class RocmBandwidthTest : public BaseTest { ...@@ -329,7 +347,7 @@ class RocmBandwidthTest : public BaseTest {
bool fine_grained); bool fine_grained);
// Find the mirror transaction if present // Find the mirror transaction if present
bool FindMirrorRequest(uint32_t src_idx, uint32_t dst_idx); bool FindMirrorRequest(bool reverse, uint32_t src_idx, uint32_t dst_idx);
// @brief: Check if agent and access memory pool, if so, set // @brief: Check if agent and access memory pool, if so, set
// access to the agent, if not, exit // access to the agent, if not, exit
...@@ -378,9 +396,6 @@ class RocmBandwidthTest : public BaseTest { ...@@ -378,9 +396,6 @@ class RocmBandwidthTest : public BaseTest {
void PrintVersion() const; void PrintVersion() const;
std::string GetVersion() const; std::string GetVersion() const;
// More variables declared for testing
// vector<transaction> tran_;
// Used to help count agent_info // Used to help count agent_info
uint32_t agent_index_; uint32_t agent_index_;
...@@ -434,6 +449,8 @@ class RocmBandwidthTest : public BaseTest { ...@@ -434,6 +449,8 @@ class RocmBandwidthTest : public BaseTest {
uint32_t req_copy_unidir_; uint32_t req_copy_unidir_;
uint32_t req_copy_all_bidir_; uint32_t req_copy_all_bidir_;
uint32_t req_copy_all_unidir_; uint32_t req_copy_all_unidir_;
uint32_t req_concurrent_copy_bidir_;
uint32_t req_concurrent_copy_unidir_;
static const uint32_t USR_SRC_FLAG = 0x01; static const uint32_t USR_SRC_FLAG = 0x01;
static const uint32_t USR_DST_FLAG = 0x02; static const uint32_t USR_DST_FLAG = 0x02;
......
...@@ -216,6 +216,22 @@ void RocmBandwidthTest::ValidateInputFlags(uint32_t pf_cnt, ...@@ -216,6 +216,22 @@ void RocmBandwidthTest::ValidateInputFlags(uint32_t pf_cnt,
return ValidateCopyAllUnidirFlags(copy_ctrl_mask); return ValidateCopyAllUnidirFlags(copy_ctrl_mask);
} }
// Input is requesting to run concurrent copies
// rocm_bandwidth_test -k or -K
// It is illegal to specify secondary flags
if ((req_concurrent_copy_bidir_ == REQ_CONCURRENT_COPY_BIDIR) ||
(req_concurrent_copy_unidir_ == REQ_CONCURRENT_COPY_UNIDIR)) {
if ((copy_ctrl_mask & DEV_COPY_LATENCY) ||
(copy_ctrl_mask & USR_BUFFER_INIT) ||
(copy_ctrl_mask & USR_BUFFER_SIZE) ||
(copy_ctrl_mask & CPU_VISIBLE_TIME) ||
(copy_ctrl_mask & VALIDATE_COPY_OP)) {
PrintHelpScreen();
exit(0);
}
return;
}
std::cout << "ValidateInputFlags: This should not be happening" << std::endl; std::cout << "ValidateInputFlags: This should not be happening" << std::endl;
assert(false); assert(false);
return; return;
...@@ -274,6 +290,11 @@ void RocmBandwidthTest::BuildBufferList() { ...@@ -274,6 +290,11 @@ void RocmBandwidthTest::BuildBufferList() {
if (req_copy_bidir_ == REQ_COPY_BIDIR) { if (req_copy_bidir_ == REQ_COPY_BIDIR) {
size_list_.push_back(SIZE_LIST[idx]); size_list_.push_back(SIZE_LIST[idx]);
} }
if ((req_concurrent_copy_bidir_ == REQ_CONCURRENT_COPY_BIDIR) ||
(req_concurrent_copy_unidir_ == REQ_CONCURRENT_COPY_UNIDIR)) {
size_list_.push_back(SIZE_LIST[idx]);
}
} }
} }
...@@ -291,7 +312,7 @@ void RocmBandwidthTest::ParseArguments() { ...@@ -291,7 +312,7 @@ void RocmBandwidthTest::ParseArguments() {
int opt; int opt;
bool status; bool status;
while ((opt = getopt(usr_argc_, usr_argv_, "hqtclvaAb:i:s:d:r:w:m:")) != -1) { while ((opt = getopt(usr_argc_, usr_argv_, "hqtclvaAb:i:s:d:r:w:m:k:K:")) != -1) {
switch (opt) { switch (opt) {
// Print help screen // Print help screen
...@@ -358,6 +379,22 @@ void RocmBandwidthTest::ParseArguments() { ...@@ -358,6 +379,22 @@ void RocmBandwidthTest::ParseArguments() {
print_help = true; print_help = true;
break; break;
// Collect list of agents involved in concurrent copy operation
case 'k':
case 'K':
status = ParseOptionValue(optarg, bidir_list_);
if ((status) && ((bidir_list_.size() % 2) == 0)) {
num_primary_flags++;
if (opt == 'K') {
req_concurrent_copy_bidir_ = REQ_CONCURRENT_COPY_BIDIR;
} else {
req_concurrent_copy_unidir_ = REQ_CONCURRENT_COPY_UNIDIR;
}
break;
}
print_help = true;
break;
// Size of buffers to use in copy and read/write operations // Size of buffers to use in copy and read/write operations
case 'm': case 'm':
status = ParseOptionValue(optarg, size_list_); status = ParseOptionValue(optarg, size_list_);
...@@ -420,7 +457,7 @@ void RocmBandwidthTest::ParseArguments() { ...@@ -420,7 +457,7 @@ void RocmBandwidthTest::ParseArguments() {
std::cout << "Argument is illegal or needs value: " << '?' << std::endl; std::cout << "Argument is illegal or needs value: " << '?' << std::endl;
if ((optopt == 'b') || (optopt == 's') || if ((optopt == 'b') || (optopt == 's') ||
(optopt == 'd') || (optopt == 'm') || (optopt == 'i')) { (optopt == 'd') || (optopt == 'm') || (optopt == 'i')) {
std::cout << "Error: Options -b -s -d and -m -i require argument" << std::endl; std::cout << "Error: Options -b -s -d -m -i -k and -K require argument" << std::endl;
} }
print_help = true; print_help = true;
break; break;
......
...@@ -309,6 +309,11 @@ void RocmBandwidthTest::PrintTransList() { ...@@ -309,6 +309,11 @@ void RocmBandwidthTest::PrintTransList() {
std::cout << " Src Memory Pool used in Copy: " << trans.copy.src_idx_ << std::endl; std::cout << " Src Memory Pool used in Copy: " << trans.copy.src_idx_ << std::endl;
std::cout << " Dst Memory Pool used in Copy: " << trans.copy.dst_idx_ << std::endl; std::cout << " Dst Memory Pool used in Copy: " << trans.copy.dst_idx_ << std::endl;
} }
if ((trans.req_type_ == REQ_CONCURRENT_COPY_BIDIR) ||
(trans.req_type_ == REQ_CONCURRENT_COPY_UNIDIR)) {
std::cout << " Src Memory Pool used in Copy: " << trans.copy.src_idx_ << std::endl;
std::cout << " Dst Memory Pool used in Copy: " << trans.copy.dst_idx_ << std::endl;
}
} }
std::cout << std::endl; std::cout << std::endl;
......
...@@ -178,14 +178,18 @@ void RocmBandwidthTest::Display() const { ...@@ -178,14 +178,18 @@ void RocmBandwidthTest::Display() const {
} }
if ((req_copy_bidir_ == REQ_COPY_BIDIR) || if ((req_copy_bidir_ == REQ_COPY_BIDIR) ||
(req_copy_unidir_ == REQ_COPY_UNIDIR)) { (req_copy_unidir_ == REQ_COPY_UNIDIR) ||
(req_concurrent_copy_bidir_ == REQ_CONCURRENT_COPY_BIDIR) ||
(req_concurrent_copy_unidir_ == REQ_CONCURRENT_COPY_UNIDIR)) {
PrintVersion(); PrintVersion();
} }
for (uint32_t idx = 0; idx < trans_size; idx++) { for (uint32_t idx = 0; idx < trans_size; idx++) {
async_trans_t trans = trans_list_[idx]; async_trans_t trans = trans_list_[idx];
if ((trans.req_type_ == REQ_COPY_BIDIR) || if ((trans.req_type_ == REQ_COPY_BIDIR) ||
(trans.req_type_ == REQ_COPY_UNIDIR)) { (trans.req_type_ == REQ_COPY_UNIDIR) ||
(trans.req_type_ == REQ_CONCURRENT_COPY_BIDIR) ||
(trans.req_type_ == REQ_CONCURRENT_COPY_UNIDIR)) {
DisplayCopyTime(trans); DisplayCopyTime(trans);
} }
if ((trans.req_type_ == REQ_READ) || if ((trans.req_type_ == REQ_READ) ||
......
...@@ -43,14 +43,22 @@ ...@@ -43,14 +43,22 @@
#include "common.hpp" #include "common.hpp"
#include "rocm_bandwidth_test.hpp" #include "rocm_bandwidth_test.hpp"
bool RocmBandwidthTest::FindMirrorRequest(uint32_t src_idx, uint32_t dst_idx) { bool RocmBandwidthTest::FindMirrorRequest(bool reverse,
uint32_t src_idx, uint32_t dst_idx) {
uint32_t size = trans_list_.size(); uint32_t size = trans_list_.size();
for (uint32_t idx = 0; idx < size; idx++) { for (uint32_t idx = 0; idx < size; idx++) {
async_trans_t& mirror = trans_list_[idx]; async_trans_t& mirror = trans_list_[idx];
if ((src_idx == mirror.copy.dst_idx_) && if(reverse) {
(dst_idx == mirror.copy.src_idx_)) { if ((src_idx == mirror.copy.dst_idx_) &&
return true; (dst_idx == mirror.copy.src_idx_)) {
return true;
}
} else {
if ((src_idx == mirror.copy.src_idx_) &&
(dst_idx == mirror.copy.dst_idx_)) {
return true;
}
} }
} }
...@@ -135,7 +143,6 @@ bool RocmBandwidthTest::BuildCopyTrans(uint32_t req_type, ...@@ -135,7 +143,6 @@ bool RocmBandwidthTest::BuildCopyTrans(uint32_t req_type,
vector<size_t>& src_list, vector<size_t>& src_list,
vector<size_t>& dst_list) { vector<size_t>& dst_list) {
// bool filter_out;
uint32_t src_size = src_list.size(); uint32_t src_size = src_list.size();
uint32_t dst_size = dst_list.size(); uint32_t dst_size = dst_list.size();
...@@ -172,7 +179,7 @@ bool RocmBandwidthTest::BuildCopyTrans(uint32_t req_type, ...@@ -172,7 +179,7 @@ bool RocmBandwidthTest::BuildCopyTrans(uint32_t req_type,
continue; continue;
} }
bool mirror = FindMirrorRequest(src_idx, dst_idx); bool mirror = FindMirrorRequest(true, src_idx, dst_idx);
if (mirror) { if (mirror) {
continue; continue;
} }
...@@ -226,6 +233,94 @@ bool RocmBandwidthTest::BuildCopyTrans(uint32_t req_type, ...@@ -226,6 +233,94 @@ bool RocmBandwidthTest::BuildCopyTrans(uint32_t req_type,
return true; return true;
} }
// @brief: Build the list of transactions for a concurrent copy request.
//
// The input list is read as consecutive (source pool, destination pool)
// index pairs. A pair is skipped, without error, when:
//   - both pools belong to Cpu agents
//   - a transaction with the same source and destination already exists
//   - the request is bidirectional and both pools belong to the same
//     agent, or a transaction for the opposite direction already exists
// A trailing pool index without a partner is ignored.
//
// @param req_type REQ_CONCURRENT_COPY_BIDIR or REQ_CONCURRENT_COPY_UNIDIR
// @param dev_list pool indices, two per requested copy
//
// @return false if the access matrix reports no path for a requested
// copy (an access error is printed), true otherwise
bool RocmBandwidthTest::BuildConcurrentCopyTrans(uint32_t req_type,
                                         vector<size_t>& dev_list) {

  uint32_t size = dev_list.size();

  // Walk complete pairs only; bounding on (idx + 1) keeps the read of
  // the destination index inside the list when its length is odd
  for (uint32_t idx = 0; (idx + 1) < size; idx += 2) {

    // Retrieve Roc runtime handles for Src memory pool and agents
    uint32_t src_idx = dev_list[idx];
    uint32_t src_dev_idx = pool_list_[src_idx].agent_index_;
    hsa_amd_memory_pool_t src_pool = pool_list_[src_idx].pool_;
    hsa_device_type_t src_dev_type = agent_list_[src_dev_idx].device_type_;

    // Retrieve Roc runtime handles for Dst memory pool and agents
    uint32_t dst_idx = dev_list[idx + 1];
    uint32_t dst_dev_idx = pool_list_[dst_idx].agent_index_;
    hsa_amd_memory_pool_t dst_pool = pool_list_[dst_idx].pool_;
    hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_;

    // Filter out transactions that involve only Cpu agents/devices
    if ((src_dev_type == HSA_DEVICE_TYPE_CPU) &&
        (dst_dev_type == HSA_DEVICE_TYPE_CPU)) {
      continue;
    }

    // Filter out a pair that duplicates an existing transaction
    bool mirror = FindMirrorRequest(false, src_idx, dst_idx);
    if (mirror) {
      continue;
    }

    // For bidirectional requests filter out pairs whose pools belong
    // to the same agent, and pairs already covered by a transaction
    // in the opposite direction
    if (req_type == REQ_CONCURRENT_COPY_BIDIR) {
      if (src_dev_idx == dst_dev_idx) {
        continue;
      }
      mirror = FindMirrorRequest(true, src_idx, dst_idx);
      if (mirror) {
        continue;
      }
    }

    // Determine if accessibility to dst pool for src agent is not denied
    uint32_t path_exists = access_matrix_[(src_dev_idx * agent_index_) + dst_dev_idx];
    if (path_exists == 0) {
      PrintCopyAccessError(src_idx, dst_idx);
      return false;
    }

    // For bidirectional copies the reverse path must be valid as well
    if (req_type == REQ_CONCURRENT_COPY_BIDIR) {
      path_exists = access_matrix_[(dst_dev_idx * agent_index_) + src_dev_idx];
      if (path_exists == 0) {
        PrintCopyAccessError(dst_idx, src_idx);
        return false;
      }
    }

    // Update the list of agents active in any copy operation,
    // creating the zero initialized list on first use
    if (active_agents_list_ == NULL) {
      active_agents_list_ = new uint32_t[agent_index_]();
    }
    active_agents_list_[src_dev_idx] = 1;
    active_agents_list_[dst_dev_idx] = 1;

    // Agents have access, build an instance of transaction
    // and add it to the list of transactions
    async_trans_t trans(req_type);
    trans.copy.src_idx_ = src_idx;
    trans.copy.dst_idx_ = dst_idx;
    trans.copy.src_pool_ = src_pool;
    trans.copy.dst_pool_ = dst_pool;
    trans.copy.bidir_ = (req_type == REQ_CONCURRENT_COPY_BIDIR);
    trans.copy.uses_gpu_ = ((src_dev_type == HSA_DEVICE_TYPE_GPU) ||
                            (dst_dev_type == HSA_DEVICE_TYPE_GPU));
    trans_list_.push_back(trans);
  }

  return true;
}
bool RocmBandwidthTest::BuildBidirCopyTrans() { bool RocmBandwidthTest::BuildBidirCopyTrans() {
return BuildCopyTrans(REQ_COPY_BIDIR, bidir_list_, bidir_list_); return BuildCopyTrans(REQ_COPY_BIDIR, bidir_list_, bidir_list_);
} }
...@@ -246,63 +341,58 @@ bool RocmBandwidthTest::BuildAllPoolsUnidirCopyTrans() { ...@@ -246,63 +341,58 @@ bool RocmBandwidthTest::BuildAllPoolsUnidirCopyTrans() {
bool RocmBandwidthTest::BuildTransList() { bool RocmBandwidthTest::BuildTransList() {
// Build list of Read transactions per user request // Build list of Read transactions per user request
bool status = false;
if (req_read_ == REQ_READ) { if (req_read_ == REQ_READ) {
status = BuildReadTrans(); return BuildReadTrans();
if (status == false) {
return status;
}
} }
// Build list of Write transactions per user request // Build list of Write transactions per user request
status = false;
if (req_write_ == REQ_WRITE) { if (req_write_ == REQ_WRITE) {
status = BuildWriteTrans(); return BuildWriteTrans();
if (status == false) {
return status;
}
} }
// Build list of Bidirectional Copy transactions per user request // Build list of Bidirectional Copy transactions per user request
status = false;
if (req_copy_bidir_ == REQ_COPY_BIDIR) { if (req_copy_bidir_ == REQ_COPY_BIDIR) {
status = BuildBidirCopyTrans(); return BuildBidirCopyTrans();
if (status == false) {
return status;
}
} }
// Build list of Unidirectional Copy transactions per user request // Build list of Unidirectional Copy transactions per user request
status = false;
if (req_copy_unidir_ == REQ_COPY_UNIDIR) { if (req_copy_unidir_ == REQ_COPY_UNIDIR) {
status = BuildUnidirCopyTrans(); return BuildUnidirCopyTrans();
if (status == false) {
return status;
}
} }
// Build list of All Bidir Copy transactions per user request // Build list of All Bidir Copy transactions per user request
status = false;
if (req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR) { if (req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR) {
status = BuildAllPoolsBidirCopyTrans(); return BuildAllPoolsBidirCopyTrans();
if (status == false) {
return status;
}
} }
// Build list of All Unidir Copy transactions per user request // Build list of All Unidir Copy transactions per user request
status = false;
if (req_copy_all_unidir_ == REQ_COPY_ALL_UNIDIR) { if (req_copy_all_unidir_ == REQ_COPY_ALL_UNIDIR) {
status = BuildAllPoolsUnidirCopyTrans(); return BuildAllPoolsUnidirCopyTrans();
if (status == false) { }
return status;
} // Build list of Bidir Concurrent Copy transactions per user request
if (req_concurrent_copy_bidir_ == REQ_CONCURRENT_COPY_BIDIR) {
return BuildConcurrentCopyTrans(req_concurrent_copy_bidir_, bidir_list_);
}
// Build list of Unidir Concurrent Copy transactions per user request
if (req_concurrent_copy_unidir_ == REQ_CONCURRENT_COPY_UNIDIR) {
return BuildConcurrentCopyTrans(req_concurrent_copy_unidir_, bidir_list_);
} }
// All of the transaction are built up // All of the transaction are built up
return true; return true;
} }
void RocmBandwidthTest::ComputeCopyTime(std::vector<async_trans_t>& trans_list) {
uint32_t trans_cnt = trans_list.size();
for (uint32_t idx = 0; idx < trans_cnt; idx++) {
async_trans_t& trans = trans_list[idx];
ComputeCopyTime(trans);
}
}
void RocmBandwidthTest::ComputeCopyTime(async_trans_t& trans) { void RocmBandwidthTest::ComputeCopyTime(async_trans_t& trans) {
// Get the frequency of Gpu Timestamping // Get the frequency of Gpu Timestamping
......
...@@ -147,52 +147,50 @@ bool RocmBandwidthTest::ValidateUnidirCopyReq() { ...@@ -147,52 +147,50 @@ bool RocmBandwidthTest::ValidateUnidirCopyReq() {
return ((ValidateCopyReq(src_list_)) && (ValidateCopyReq(dst_list_))); return ((ValidateCopyReq(src_list_)) && (ValidateCopyReq(dst_list_)));
} }
bool RocmBandwidthTest::ValidateConcurrentCopyReq() {
// Determine every pool is present in system
return PoolIsPresent(bidir_list_);
}
bool RocmBandwidthTest::ValidateArguments() { bool RocmBandwidthTest::ValidateArguments() {
// Determine if user has requested a READ // Determine if user has requested a READ
// operation and gave valid inputs // operation and gave valid inputs
bool status = false;
if (req_read_ == REQ_READ) { if (req_read_ == REQ_READ) {
status = ValidateReadReq(); return ValidateReadReq();
if (status == false) {
return status;
}
} }
// Determine if user has requested a WRITE // Determine if user has requested a WRITE
// operation and gave valid inputs // operation and gave valid inputs
status = false;
if (req_write_ == REQ_WRITE) { if (req_write_ == REQ_WRITE) {
status = ValidateWriteReq(); return ValidateWriteReq();
if (status == false) {
return status;
}
} }
// Determine if user has requested a Copy // Determine if user has requested a Copy
// operation that is bidirectional and gave // operation that is bidirectional and gave
// valid inputs. Same validation is applied // valid inputs. Same validation is applied
// for all-to-all unidirectional copy operation // for all-to-all unidirectional copy operation
status = false;
if ((req_copy_bidir_ == REQ_COPY_BIDIR) || if ((req_copy_bidir_ == REQ_COPY_BIDIR) ||
(req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR)) { (req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR)) {
status = ValidateBidirCopyReq(); return ValidateBidirCopyReq();
if (status == false) {
return status;
}
} }
// Determine if user has requested a Copy // Determine if user has requested a Copy
// operation that is unidirectional and gave // operation that is unidirectional and gave
// valid inputs. Same validation is applied // valid inputs. Same validation is applied
// for all-to-all bidirectional copy operation // for all-to-all bidirectional copy operation
status = false;
if ((req_copy_unidir_ == REQ_COPY_UNIDIR) || if ((req_copy_unidir_ == REQ_COPY_UNIDIR) ||
(req_copy_all_unidir_ == REQ_COPY_ALL_UNIDIR)) { (req_copy_all_unidir_ == REQ_COPY_ALL_UNIDIR)) {
status = ValidateUnidirCopyReq(); return ValidateUnidirCopyReq();
if (status == false) { }
return status;
} // Determine if user has requested a Concurrent
// Copy operation that is unidirectional or bidirectional
// and gave valid inputs.
if ((req_concurrent_copy_bidir_ == REQ_CONCURRENT_COPY_BIDIR) ||
(req_concurrent_copy_unidir_ == REQ_CONCURRENT_COPY_UNIDIR)) {
return ValidateConcurrentCopyReq();
} }
// All of the request are well formed // All of the request are well formed
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment