rocm_bandwidth_test_topology.cpp 15.1 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
// 
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
// 
// Developed by:
// 
//                 AMD Research and AMD HSA Software Development
// 
//                 Advanced Micro Devices, Inc.
// 
//                 www.amd.com
// 
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
// 
//  - Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimers.
//  - Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimers in
//    the documentation and/or other materials provided with the distribution.
//  - Neither the names of Advanced Micro Devices, Inc,
//    nor the names of its contributors may be used to endorse or promote
//    products derived from this Software without specific prior written
//    permission.
// 
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////

#include "common.hpp"
#include "rocm_bandwidth_test.hpp"

#include <cstring>
#include <iomanip>
#include <sstream>
#include <string>
#include <vector>
50

51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
// @brief: Callback invoked once per memory pool of the agent that was
// most recently appended to agent_list_. Queries the pool's properties
// and, if the pool passes the filters below, records it in pool_list_
// and in the owning agent's entry of agent_pool_list_
//
// @param pool Handle of the memory pool being visited
// @param data Opaque pointer to the RocmBandwidthTest instance
//
// @return HSA_STATUS_SUCCESS in all cases, both when the pool is
// recorded and when it is filtered out; query failures are handed
// to ErrorCheck
hsa_status_t MemPoolInfo(hsa_amd_memory_pool_t pool, void* data) {

  RocmBandwidthTest* driver = reinterpret_cast<RocmBandwidthTest*>(data);
  hsa_status_t status;

  // Only pools that belong to the global segment are of interest
  hsa_amd_segment_t segment;
  status = hsa_amd_memory_pool_get_info(pool,
                   HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
  ErrorCheck(status);
  if (segment != HSA_AMD_SEGMENT_GLOBAL) {
    return HSA_STATUS_SUCCESS;
  }

  // Ignore pools from which the runtime does not allow allocation
  bool alloc_allowed = false;
  status = hsa_amd_memory_pool_get_info(pool,
                   HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
                   &alloc_allowed);
  ErrorCheck(status);
  if (!alloc_allowed) {
    return HSA_STATUS_SUCCESS;
  }

  // Size of the pool
  size_t pool_size = 0;
  status = hsa_amd_memory_pool_get_info(pool,
                   HSA_AMD_MEMORY_POOL_INFO_SIZE, &pool_size);
  ErrorCheck(status);

  // Whether every agent can access the pool
  bool all_agents_access = false;
  status = hsa_amd_memory_pool_get_info(pool,
                   HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL,
                   &all_agents_access);
  ErrorCheck(status);

  // Kind of access the owning agent has to the pool. The owner is
  // the last agent added to the list of agents
  agent_info_t& owner = driver->agent_list_.back();
  hsa_agent_t owner_agent = owner.agent_;
  hsa_amd_memory_pool_access_t owner_access;
  status = hsa_amd_agent_memory_pool_get_info(owner_agent, pool,
                   HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &owner_access);
  ErrorCheck(status);

  // Decode the global flags of the pool
  uint32_t global_flags = 0;
  status = hsa_amd_memory_pool_get_info(pool,
                   HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flags);
  ErrorCheck(status);
  const bool kernarg =
      (global_flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) != 0;
  const bool fine_grained =
      (global_flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) != 0;
  const bool coarse_grained =
      (global_flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) != 0;
  const bool ext_fine_grained =
      (global_flags &
       HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED) != 0;

  // A pool carrying the kernarg flag is remembered as the system
  // memory pool. This happens before any of the filters below
  if (kernarg) {
    driver->sys_pool_ = pool;
  }

  // Filter for pools owned by a CPU agent:
  //  - skip_cpu_fine_grain_ set: drop fine-grained pools
  //  - otherwise: keep only pools that are both fine-grained
  //    and kernarg
  if (owner.device_type_ == HSA_DEVICE_TYPE_CPU) {
    const bool skip_fine = (driver->skip_cpu_fine_grain_ != NULL);
    const bool reject = skip_fine ? fine_grained
                                  : !(fine_grained && kernarg);
    if (reject) {
      return HSA_STATUS_SUCCESS;
    }
  }

  // Filter for pools owned by a GPU agent:
  //  - extended scope fine-grained pools are always dropped
  //  - skip_gpu_coarse_grain_ set: keep only fine-grained pools
  //  - otherwise: keep only coarse-grained pools
  if (owner.device_type_ == HSA_DEVICE_TYPE_GPU) {
    if (ext_fine_grained) {
      return HSA_STATUS_SUCCESS;
    }
    const bool skip_coarse = (driver->skip_gpu_coarse_grain_ != NULL);
    const bool reject = skip_coarse ? !fine_grained : !coarse_grained;
    if (reject) {
      return HSA_STATUS_SUCCESS;
    }
  }

  // Record the pool in the global list of pools and in the list of
  // pools of its owning agent, then advance the running pool index
  pool_info_t entry(owner_agent, driver->agent_index_, pool,
                    segment, pool_size, driver->pool_index_,
                    fine_grained, kernarg,
                    all_agents_access, owner_access);
  driver->pool_list_.push_back(entry);
  driver->agent_pool_list_[driver->agent_index_].pool_list.push_back(entry);
  driver->pool_index_++;

  return HSA_STATUS_SUCCESS;
}

161
162
163
164
165
166
167
168
void PopulateBDF(uint32_t bdf_id, agent_info_t *agent_info) {

  uint8_t func_id = (bdf_id & 0x00000003);
  uint8_t dev_id = ((bdf_id & 0x000000F8) >> 3);
  uint8_t bus_id = ((bdf_id & 0x0000FF00) >> 8);
  std::stringstream stream;
  stream << std::setfill('0') << std::setw(sizeof(uint8_t) * 2);
  stream << std::hex << +bus_id << ":" << +dev_id << "." << +func_id;
169
  std::strcpy(agent_info->bdf_id_, (stream.str()).c_str());
170
171
}

172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
// @brief: Callback invoked once per agent of the system. Records the
// agent in agent_list_, creates its entry in agent_pool_list_ and then
// iterates the agent's memory pools via MemPoolInfo
//
// @param agent Handle of the agent being visited
// @param data Opaque pointer to the RocmBandwidthTest instance
//
// @return HSA_STATUS_SUCCESS always
//
// @note(review): Only the name and device type queries are passed to
// ErrorCheck. The status of the product name, UUID, BDF and pool
// iteration calls is stored but never inspected
hsa_status_t AgentInfo(hsa_agent_t agent, void* data) {

  RocmBandwidthTest* driver = reinterpret_cast<RocmBandwidthTest*>(data);
  hsa_status_t status;

  // Query the agent's name. The value itself is not used further
  char name_buf[64];
  status = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name_buf);
  ErrorCheck(status);

  // Query the type of device the agent represents
  hsa_device_type_t dev_type;
  status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &dev_type);
  ErrorCheck(status);

  const bool is_cpu = (dev_type == HSA_DEVICE_TYPE_CPU);
  const bool is_gpu = (dev_type == HSA_DEVICE_TYPE_GPU);

  // Remember handle and index of the Cpu agent
  if (is_cpu) {
    driver->cpu_agent_ = agent;
    driver->cpu_index_ = driver->agent_index_;
  }

  // Build the descriptor of this agent and fill in its product name
  agent_info_t info(agent, driver->agent_index_, dev_type);
  status = hsa_agent_get_info(agent,
               static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_PRODUCT_NAME),
               static_cast<void*>(&info.name_[0]));

  // Properties collected for Gpu agents only:
  //    - UUID string
  //    - BDF, read as a 32-bit integer and stored in textual form
  if (is_gpu) {
    status = hsa_agent_get_info(agent,
                 static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_UUID),
                 info.uuid_);

    uint32_t packed_bdf = 0;
    status = hsa_agent_get_info(agent,
                 static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_BDFID),
                 static_cast<void*>(&packed_bdf));
    PopulateBDF(packed_bdf, &info);
  }

  // The descriptor must be in agent_list_ before pools are iterated,
  // as MemPoolInfo reads the owning agent from the back of that list
  driver->agent_list_.push_back(info);

  // Create the per-agent container of pools and add it to its list
  agent_pool_info pools_of_agent;
  pools_of_agent.agent = driver->agent_list_.back();
  driver->agent_pool_list_.push_back(pools_of_agent);

  // Visit the pools of this agent, then advance the running agent index
  status = hsa_amd_agent_iterate_memory_pools(agent, MemPoolInfo, driver);
  driver->agent_index_++;

  return HSA_STATUS_SUCCESS;
}

void RocmBandwidthTest::PopulateAccessMatrix() {

  // Allocate memory to hold access lists
  access_matrix_ = new uint32_t[agent_index_ * agent_index_]();
232
  direct_access_matrix_ = new uint32_t[agent_index_ * agent_index_]();
233
234
235
236
237

  hsa_status_t status;
  uint32_t size = pool_list_.size();
  for (uint32_t src_idx = 0; src_idx < size; src_idx++) {

238
    // Get handle of Src agent of the pool
239
240
241
    uint32_t src_dev_idx = pool_list_[src_idx].agent_index_;
    hsa_agent_t src_agent = pool_list_[src_idx].owner_agent_;
    hsa_amd_memory_pool_t src_pool = pool_list_[src_idx].pool_;
242
    hsa_device_type_t src_dev_type = agent_list_[src_dev_idx].device_type_;
243
244
245

    for (uint32_t dst_idx = 0; dst_idx < size; dst_idx++) {

246
      // Get handle of Dst pool
247
248
249
      uint32_t dst_dev_idx = pool_list_[dst_idx].agent_index_;
      hsa_agent_t dst_agent = pool_list_[dst_idx].owner_agent_;
      hsa_amd_memory_pool_t dst_pool = pool_list_[dst_idx].pool_;
250
      hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_;
251

252
253
      // Determine if src agent has access to dst pool
      hsa_amd_memory_pool_access_t access;
254
      status = hsa_amd_agent_memory_pool_get_info(src_agent, dst_pool,
255
                             HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
256
      ErrorCheck(status);
257
258
259
260
261
      
      // Record if Src device can access or not
      uint32_t path;
      path = (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) ? 0 : 1;
      direct_access_matrix_[(src_dev_idx * agent_index_) + dst_dev_idx] = path;
262
      
263
264
265
266
267
268
      if ((src_dev_type == HSA_DEVICE_TYPE_CPU) &&
          (dst_dev_type == HSA_DEVICE_TYPE_GPU) &&
          (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED)) {
        status = hsa_amd_agent_memory_pool_get_info(dst_agent, src_pool,
                             HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
        ErrorCheck(status);
269
270
      }

271
272
273
      // Access between the two agents is Non-Existent
      path = (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) ? 0 : 1;
      access_matrix_[(src_dev_idx * agent_index_) + dst_dev_idx] = path;
274
275
276
277
278
279
280
281
282
    }
  }
}

// @brief: Discover the agents and memory pools of the system, then
// derive the access matrices and the link properties from them
void RocmBandwidthTest::DiscoverTopology() {

  // AgentInfo is called for every agent and fills in the lists of
  // agents and pools. The status of the iteration is kept in err_
  err_ = hsa_iterate_agents(AgentInfo, this);

  // Build the access matrices first, followed by the link type,
  // hops and weight matrices
  this->PopulateAccessMatrix();
  this->DiscoverLinkProps();
}

289
290
291
// @brief: Classify the link between a Src and a Dst device
//
// @param src_dev_type Device type of Src agent
// @param dst_dev_type Device type of Dst agent
// @param link_info Link descriptors of the path; only the first
// entry is read, and only when hops is one
// @param hops Number of hops reported for the path
//
// @return
//    - LINK_TYPE_XGMI if the path has one hop reported as XGMI
//    - LINK_TYPE_PCIE if the path has one hop that is not XGMI and
//      at least one of the two devices is a Gpu
//    - LINK_TYPE_IGNORED otherwise, i.e. for any hop count other
//      than one and for a single hop between two non-Gpu devices
uint32_t GetLinkType(hsa_device_type_t src_dev_type,
                     hsa_device_type_t dst_dev_type,
                     hsa_amd_memory_pool_link_info_t* link_info, uint32_t hops) {

  // A link type is derived for single hop paths only
  if (hops == 1) {

    // XGMI is the only type taken as reported
    if (link_info->link_type == HSA_AMD_LINK_INFO_TYPE_XGMI) {
      return RocmBandwidthTest::LINK_TYPE_XGMI;
    }

    // Any other single hop link that has a Gpu on either end
    // is inferred to be PCIe
    const bool gpu_on_either_end = (src_dev_type == HSA_DEVICE_TYPE_GPU) ||
                                   (dst_dev_type == HSA_DEVICE_TYPE_GPU);
    if (gpu_on_either_end) {
      return RocmBandwidthTest::LINK_TYPE_PCIE;
    }
  }

  // Hop count differs from one, or neither device is a Gpu
  return RocmBandwidthTest::LINK_TYPE_IGNORED;
}

318
// @brief: Sum the numa distance of every hop of a path
//
// @param link_info Array of at least hops link descriptors
// @param hops Number of entries of link_info to accumulate
//
// @return Sum of numa_distance over the first hops entries,
// zero when hops is zero
uint32_t GetLinkWeight(hsa_amd_memory_pool_link_info_t* link_info, uint32_t hops) {

  uint32_t total = 0;
  const hsa_amd_memory_pool_link_info_t* const last = link_info + hops;
  for (const hsa_amd_memory_pool_link_info_t* hop = link_info;
       hop != last; ++hop) {
    total += hop->numa_distance;
  }
  return total;
}

327
void RocmBandwidthTest::BindLinkProps(uint32_t idx1, uint32_t idx2) {
328
329
330
  
  // Agent has no pools so no need to look for numa distance
  if (agent_pool_list_[idx2].pool_list.size() == 0) {
331
    link_hops_matrix_[(idx1 * agent_index_) + idx2] = 0xFFFFFFFF;
332
    link_weight_matrix_[(idx1 * agent_index_) + idx2] = 0xFFFFFFFF;
333
    link_type_matrix_[(idx1 * agent_index_) + idx2] = LINK_TYPE_NO_PATH;
334
335
336
337
338
339
340
341
342
    return;
  }
  
  uint32_t hops = 0;
  hsa_agent_t agent1 = agent_list_[idx1].agent_;
  hsa_amd_memory_pool_t& pool = agent_pool_list_[idx2].pool_list[0].pool_;
  err_ = hsa_amd_agent_memory_pool_get_info(agent1, pool,
                   HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS, &hops);
  if (hops < 1) {
343
    link_hops_matrix_[(idx1 * agent_index_) + idx2] = 0xFFFFFFFF;
344
    link_weight_matrix_[(idx1 * agent_index_) + idx2] = 0xFFFFFFFF;
345
    link_type_matrix_[(idx1 * agent_index_) + idx2] = LINK_TYPE_NO_PATH;
346
347
348
349
350
351
    return;
  }

  hsa_amd_memory_pool_link_info_t *link_info;
  uint32_t link_info_sz = hops * sizeof(hsa_amd_memory_pool_link_info_t);
  link_info = (hsa_amd_memory_pool_link_info_t *)malloc(link_info_sz);
352
  std::memset(link_info, 0, (hops * sizeof(hsa_amd_memory_pool_link_info_t)));
353
354
  err_ = hsa_amd_agent_memory_pool_get_info(agent1, pool,
                 HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO, link_info);
355
356
357
358
359
360
361
362
363
364
365
366


  link_hops_matrix_[(idx1 * agent_index_) + idx2] = hops;
  link_weight_matrix_[(idx1 * agent_index_) + idx2] = GetLinkWeight(link_info, hops);
  
  // Initialize link type based on Src and Dst devices plus link
  // type reported by ROCr library
  hsa_device_type_t src_dev_type = agent_list_[idx1].device_type_;
  hsa_device_type_t dst_dev_type = agent_list_[idx2].device_type_;
  link_type_matrix_[(idx1 * agent_index_) + idx2] = GetLinkType(src_dev_type,
                                                                dst_dev_type, link_info, hops);
  // Free the allocated link block
367
368
369
  free(link_info); 
}

370
void RocmBandwidthTest::DiscoverLinkProps() {
371
372

  // Allocate space if it is first time
373
  if (link_weight_matrix_ == NULL) {
374
375
    link_type_matrix_ = new uint32_t[agent_index_ * agent_index_]();
    link_hops_matrix_ = new uint32_t[agent_index_ * agent_index_]();
376
    link_weight_matrix_ = new uint32_t[agent_index_ * agent_index_]();
377
378
379
380
381
382
  }

  agent_info_t agent_info;
  for (uint32_t idx1 = 0; idx1 < agent_index_; idx1++) {
    for (uint32_t idx2 = 0; idx2 < agent_index_; idx2++) {
      if (idx1 == idx2) {
383
        link_hops_matrix_[(idx1 * agent_index_) + idx2] = 0;
384
        link_weight_matrix_[(idx1 *agent_index_) + idx2] = 0;
385
        link_type_matrix_[(idx1 * agent_index_) + idx2] = LINK_TYPE_SELF;
386
387
        continue;
      }
388
      BindLinkProps(idx1, idx2);
389
390
391
392
    }
  }
}