TransferBench.cpp 90.5 KB
Newer Older
Gilbert Lee's avatar
Gilbert Lee committed
1
/*
gilbertlee-amd's avatar
gilbertlee-amd committed
2
Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
Gilbert Lee's avatar
Gilbert Lee committed
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

// This program measures simultaneous copy performance across multiple GPUs
// on the same node
25
26
#include <numa.h>     // If not found, try installing libnuma-dev (e.g apt-get install libnuma-dev)
#include <cmath>      // If not found, try installing g++-12      (e.g apt-get install g++-12)
Gilbert Lee's avatar
Gilbert Lee committed
27
#include <numaif.h>
Gilbert Lee's avatar
Gilbert Lee committed
28
#include <random>
Gilbert Lee's avatar
Gilbert Lee committed
29
30
31
32
33
34
35
36
#include <stack>
#include <thread>

#include "TransferBench.hpp"
#include "GetClosestNumaNode.hpp"

int main(int argc, char **argv)
{
Gilbert Lee's avatar
Gilbert Lee committed
37
38
39
40
41
42
43
  // Check for NUMA library support
  if (numa_available() == -1)
  {
    printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
    exit(1);
  }

Gilbert Lee's avatar
Gilbert Lee committed
44
45
46
47
48
49
50
51
52
53
54
55
  // Display usage instructions and detected topology
  if (argc <= 1)
  {
    int const outputToCsv = EnvVars::GetEnvVar("OUTPUT_TO_CSV", 0);
    if (!outputToCsv) DisplayUsage(argv[0]);
    DisplayTopology(outputToCsv);
    exit(0);
  }

  // Collect environment variables / display current run configuration
  EnvVars ev;

Gilbert Lee's avatar
Gilbert Lee committed
56
57
  // Determine number of bytes to run per Transfer
  size_t numBytesPerTransfer = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_TRANSFER;
Gilbert Lee's avatar
Gilbert Lee committed
58
59
60
61
62
63
  if (argc > 2)
  {
    // Adjust bytes if unit specified
    char units = argv[2][strlen(argv[2])-1];
    switch (units)
    {
Gilbert Lee's avatar
Gilbert Lee committed
64
65
66
    case 'K': case 'k': numBytesPerTransfer *= 1024; break;
    case 'M': case 'm': numBytesPerTransfer *= 1024*1024; break;
    case 'G': case 'g': numBytesPerTransfer *= 1024*1024*1024; break;
Gilbert Lee's avatar
Gilbert Lee committed
67
68
    }
  }
gilbertlee-amd's avatar
gilbertlee-amd committed
69
70
71
72
73
  if (numBytesPerTransfer % 4)
  {
    printf("[ERROR] numBytesPerTransfer (%lu) must be a multiple of 4\n", numBytesPerTransfer);
    exit(1);
  }
Gilbert Lee's avatar
Gilbert Lee committed
74

Gilbert Lee's avatar
Gilbert Lee committed
75
76
77
78
  // Check for preset tests
  // - Tests that sweep across possible sets of Transfers
  if (!strcmp(argv[1], "sweep") || !strcmp(argv[1], "rsweep"))
  {
gilbertlee-amd's avatar
gilbertlee-amd committed
79
80
    int numGpuSubExecs = (argc > 3 ? atoi(argv[3]) : 4);
    int numCpuSubExecs = (argc > 4 ? atoi(argv[4]) : 4);
gilbertlee-amd's avatar
gilbertlee-amd committed
81

82
    ev.configMode = CFG_SWEEP;
gilbertlee-amd's avatar
gilbertlee-amd committed
83
    RunSweepPreset(ev, numBytesPerTransfer, numGpuSubExecs, numCpuSubExecs, !strcmp(argv[1], "rsweep"));
Gilbert Lee's avatar
Gilbert Lee committed
84
85
86
    exit(0);
  }
  // - Tests that benchmark peer-to-peer performance
gilbertlee-amd's avatar
gilbertlee-amd committed
87
  else if (!strcmp(argv[1], "p2p"))
Gilbert Lee's avatar
Gilbert Lee committed
88
  {
89
    ev.configMode = CFG_P2P;
gilbertlee-amd's avatar
gilbertlee-amd committed
90
    RunPeerToPeerBenchmarks(ev, numBytesPerTransfer / sizeof(float));
Gilbert Lee's avatar
Gilbert Lee committed
91
92
    exit(0);
  }
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
  // - Test SubExecutor scaling
  else if (!strcmp(argv[1], "scaling"))
  {
    int maxSubExecs = (argc > 3 ? atoi(argv[3]) : 32);
    int exeIndex    = (argc > 4 ? atoi(argv[4]) : 0);

    if (exeIndex >= ev.numGpuDevices)
    {
      printf("[ERROR] Cannot execute scaling test with GPU device %d\n", exeIndex);
      exit(1);
    }
    ev.configMode = CFG_SCALE;
    RunScalingBenchmark(ev, numBytesPerTransfer / sizeof(float), exeIndex, maxSubExecs);
    exit(0);
  }
gilbertlee-amd's avatar
gilbertlee-amd committed
108
109
110
111
112
113
114
115
116
117
118
  // - Test all2all benchmark
  else if (!strcmp(argv[1], "a2a"))
  {
    int numSubExecs = (argc > 3 ? atoi(argv[3]) : 4);

    // Force single-stream mode for all-to-all benchmark
    ev.useSingleStream = 1;
    ev.configMode = CFG_A2A;
    RunAllToAllBenchmark(ev, numBytesPerTransfer, numSubExecs);
    exit(0);
  }
119
120
121
122
123
124
125
126
127
128
129
130
  // - Test schmoo benchmark
  else if (!strcmp(argv[1], "schmoo"))
  {
    if (ev.numGpuDevices < 2)
    {
      printf("[ERROR] Schmoo benchmark requires at least 2 GPUs\n");
      exit(1);
    }
    ev.configMode = CFG_SCHMOO;

    int localIdx    = (argc > 3 ? atoi(argv[3]) : 0);
    int remoteIdx   = (argc > 4 ? atoi(argv[4]) : 1);
131
    int maxSubExecs = (argc > 5 ? atoi(argv[5]) : 32);
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151

    if (localIdx >= ev.numGpuDevices || remoteIdx >= ev.numGpuDevices)
    {
      printf("[ERROR] Cannot execute schmoo test with local GPU device %d, remote GPU device %d\n", localIdx, remoteIdx);
      exit(1);
    }
    ev.DisplaySchmooEnvVars();

    for (int N = 256; N <= (1<<27); N *= 2)
    {
      int delta = std::max(1, N / ev.samplingFactor);
      int curr = (numBytesPerTransfer == 0) ? N : numBytesPerTransfer / sizeof(float);
      do
      {
        RunSchmooBenchmark(ev, curr * sizeof(float), localIdx, remoteIdx, maxSubExecs);
        if (numBytesPerTransfer != 0) exit(0);
        curr += delta;
      } while (curr < N * 2);
    }
  }
152
153
154
155
156
157
158
159
160
161
162
163
164
165
  else if (!strcmp(argv[1], "cmdline"))
  {
    // Print environment variables and CSV header
    ev.DisplayEnvVars();
    if (ev.outputToCsv)
    {
      printf("Test#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),SrcAddr,DstAddr\n");
    }

    // Read Transfer from command line
    std::string cmdlineTransfer;
    for (int i = 3; i < argc; i++)
      cmdlineTransfer += std::string(argv[i]) + " ";

166
    char line[MAX_LINE_LEN];
167
168
    sprintf(line, "%s", cmdlineTransfer.c_str());
    std::vector<Transfer> transfers;
169
    ParseTransfers(ev, line, transfers);
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
    if (transfers.empty()) exit(0);

    // If the number of bytes is specified, use it
    if (numBytesPerTransfer != 0)
    {
      size_t N = numBytesPerTransfer / sizeof(float);
      ExecuteTransfers(ev, 1, N, transfers);
    }
    else
    {
      // Otherwise generate a range of values
      for (int N = 256; N <= (1<<27); N *= 2)
      {
        int delta = std::max(1, N / ev.samplingFactor);
        int curr = N;
        while (curr < N * 2)
        {
          ExecuteTransfers(ev, 1, curr, transfers);
          curr += delta;
        }
      }
    }
    exit(0);
  }
Gilbert Lee's avatar
Gilbert Lee committed
194

Gilbert Lee's avatar
Gilbert Lee committed
195
  // Check that Transfer configuration file can be opened
196
  ev.configMode = CFG_FILE;
Gilbert Lee's avatar
Gilbert Lee committed
197
198
199
  FILE* fp = fopen(argv[1], "r");
  if (!fp)
  {
Gilbert Lee's avatar
Gilbert Lee committed
200
    printf("[ERROR] Unable to open transfer configuration file: [%s]\n", argv[1]);
Gilbert Lee's avatar
Gilbert Lee committed
201
202
203
    exit(1);
  }

Gilbert Lee's avatar
Gilbert Lee committed
204
  // Print environment variables and CSV header
Gilbert Lee's avatar
Gilbert Lee committed
205
206
207
  ev.DisplayEnvVars();
  if (ev.outputToCsv)
  {
gilbertlee-amd's avatar
gilbertlee-amd committed
208
    printf("Test#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),SrcAddr,DstAddr\n");
Gilbert Lee's avatar
Gilbert Lee committed
209
210
211
  }

  int testNum = 0;
212
213
  char line[MAX_LINE_LEN];
  while(fgets(line, MAX_LINE_LEN, fp))
Gilbert Lee's avatar
Gilbert Lee committed
214
215
216
217
  {
    // Check if line is a comment to be echoed to output (starts with ##)
    if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') printf("%s", line);

Gilbert Lee's avatar
Gilbert Lee committed
218
219
    // Parse set of parallel Transfers to execute
    std::vector<Transfer> transfers;
220
    ParseTransfers(ev, line, transfers);
Gilbert Lee's avatar
Gilbert Lee committed
221
    if (transfers.empty()) continue;
Gilbert Lee's avatar
Gilbert Lee committed
222

gilbertlee-amd's avatar
gilbertlee-amd committed
223
224
225
226
227
228
229
230
231
232
233
    // If the number of bytes is specified, use it
    if (numBytesPerTransfer != 0)
    {
      size_t N = numBytesPerTransfer / sizeof(float);
      ExecuteTransfers(ev, ++testNum, N, transfers);
    }
    else
    {
      // Otherwise generate a range of values
      for (int N = 256; N <= (1<<27); N *= 2)
      {
gilbertlee-amd's avatar
gilbertlee-amd committed
234
        int delta = std::max(1, N / ev.samplingFactor);
gilbertlee-amd's avatar
gilbertlee-amd committed
235
236
237
        int curr = N;
        while (curr < N * 2)
        {
gilbertlee-amd's avatar
gilbertlee-amd committed
238
          ExecuteTransfers(ev, ++testNum, curr, transfers);
gilbertlee-amd's avatar
gilbertlee-amd committed
239
240
241
242
          curr += delta;
        }
      }
    }
Gilbert Lee's avatar
Gilbert Lee committed
243
244
  }
  fclose(fp);
Gilbert Lee's avatar
Gilbert Lee committed
245

Gilbert Lee's avatar
Gilbert Lee committed
246
247
  return 0;
}
Gilbert Lee's avatar
Gilbert Lee committed
248

Gilbert Lee's avatar
Gilbert Lee committed
249
void ExecuteTransfers(EnvVars const& ev,
gilbertlee-amd's avatar
gilbertlee-amd committed
250
251
252
                      int const testNum,
                      size_t const N,
                      std::vector<Transfer>& transfers,
gilbertlee-amd's avatar
gilbertlee-amd committed
253
254
                      bool verbose,
                      double* totalBandwidthCpu)
Gilbert Lee's avatar
Gilbert Lee committed
255
256
{
  int const initOffset = ev.byteOffset / sizeof(float);
Gilbert Lee's avatar
Gilbert Lee committed
257

Gilbert Lee's avatar
Gilbert Lee committed
258
259
  // Map transfers by executor
  TransferMap transferMap;
gilbertlee-amd's avatar
gilbertlee-amd committed
260
  for (int i = 0; i < transfers.size(); i++)
Gilbert Lee's avatar
Gilbert Lee committed
261
  {
gilbertlee-amd's avatar
gilbertlee-amd committed
262
263
    Transfer& transfer = transfers[i];
    transfer.transferIndex = i;
gilbertlee-amd's avatar
gilbertlee-amd committed
264
    Executor executor(transfer.exeType, transfer.exeIndex);
Gilbert Lee's avatar
Gilbert Lee committed
265
    ExecutorInfo& executorInfo = transferMap[executor];
gilbertlee-amd's avatar
gilbertlee-amd committed
266
    executorInfo.transfers.push_back(&transfer);
Gilbert Lee's avatar
Gilbert Lee committed
267
  }
Gilbert Lee's avatar
Gilbert Lee committed
268

gilbertlee-amd's avatar
gilbertlee-amd committed
269
  // Loop over each executor and prepare sub-executors
gilbertlee-amd's avatar
gilbertlee-amd committed
270
  std::map<int, Transfer*> transferList;
Gilbert Lee's avatar
Gilbert Lee committed
271
272
273
  for (auto& exeInfoPair : transferMap)
  {
    Executor const& executor = exeInfoPair.first;
gilbertlee-amd's avatar
gilbertlee-amd committed
274
275
276
277
    ExecutorInfo& exeInfo    = exeInfoPair.second;
    ExeType const exeType    = executor.first;
    int     const exeIndex   = RemappedIndex(executor.second, IsCpuType(exeType));

Gilbert Lee's avatar
Gilbert Lee committed
278
    exeInfo.totalTime = 0.0;
gilbertlee-amd's avatar
gilbertlee-amd committed
279
    exeInfo.totalSubExecs = 0;
Gilbert Lee's avatar
Gilbert Lee committed
280
281

    // Loop over each transfer this executor is involved in
gilbertlee-amd's avatar
gilbertlee-amd committed
282
    for (Transfer* transfer : exeInfo.transfers)
Gilbert Lee's avatar
Gilbert Lee committed
283
    {
gilbertlee-amd's avatar
gilbertlee-amd committed
284
285
286
287
288
289
      // Determine how many bytes to copy for this Transfer (use custom if pre-specified)
      transfer->numBytesActual = (transfer->numBytes ? transfer->numBytes : N * sizeof(float));

      // Allocate source memory
      transfer->srcMem.resize(transfer->numSrcs);
      for (int iSrc = 0; iSrc < transfer->numSrcs; ++iSrc)
Gilbert Lee's avatar
Gilbert Lee committed
290
      {
gilbertlee-amd's avatar
gilbertlee-amd committed
291
292
293
        MemType const& srcType  = transfer->srcType[iSrc];
        int     const  srcIndex    = RemappedIndex(transfer->srcIndex[iSrc], IsCpuType(srcType));

Gilbert Lee's avatar
Gilbert Lee committed
294
        // Ensure executing GPU can access source memory
295
        if (IsGpuType(exeType) && IsGpuType(srcType) && srcIndex != exeIndex)
Gilbert Lee's avatar
Gilbert Lee committed
296
          EnablePeerAccess(exeIndex, srcIndex);
Gilbert Lee's avatar
Gilbert Lee committed
297

gilbertlee-amd's avatar
gilbertlee-amd committed
298
299
300
301
302
303
304
305
306
307
        AllocateMemory(srcType, srcIndex, transfer->numBytesActual + ev.byteOffset, (void**)&transfer->srcMem[iSrc]);
      }

      // Allocate destination memory
      transfer->dstMem.resize(transfer->numDsts);
      for (int iDst = 0; iDst < transfer->numDsts; ++iDst)
      {
        MemType const& dstType  = transfer->dstType[iDst];
        int     const  dstIndex    = RemappedIndex(transfer->dstIndex[iDst], IsCpuType(dstType));

Gilbert Lee's avatar
Gilbert Lee committed
308
        // Ensure executing GPU can access destination memory
309
        if (IsGpuType(exeType) && IsGpuType(dstType) && dstIndex != exeIndex)
Gilbert Lee's avatar
Gilbert Lee committed
310
311
          EnablePeerAccess(exeIndex, dstIndex);

gilbertlee-amd's avatar
gilbertlee-amd committed
312
313
        AllocateMemory(dstType, dstIndex, transfer->numBytesActual + ev.byteOffset, (void**)&transfer->dstMem[iDst]);
      }
Gilbert Lee's avatar
Gilbert Lee committed
314

gilbertlee-amd's avatar
gilbertlee-amd committed
315
      exeInfo.totalSubExecs += transfer->numSubExecs;
gilbertlee-amd's avatar
gilbertlee-amd committed
316
      transferList[transfer->transferIndex] = transfer;
Gilbert Lee's avatar
Gilbert Lee committed
317
318
    }

gilbertlee-amd's avatar
gilbertlee-amd committed
319
320
    // Prepare additional requirement for GPU-based executors
    if (IsGpuType(exeType))
Gilbert Lee's avatar
Gilbert Lee committed
321
    {
322
323
      HIP_CALL(hipSetDevice(exeIndex));

gilbertlee-amd's avatar
gilbertlee-amd committed
324
325
326
327
328
329
      // Single-stream is only supported for GFX-based executors
      int const numStreamsToUse = (exeType == EXE_GPU_DMA || !ev.useSingleStream) ? exeInfo.transfers.size() : 1;
      exeInfo.streams.resize(numStreamsToUse);
      exeInfo.startEvents.resize(numStreamsToUse);
      exeInfo.stopEvents.resize(numStreamsToUse);
      for (int i = 0; i < numStreamsToUse; ++i)
Gilbert Lee's avatar
Gilbert Lee committed
330
      {
331
332
333
334
335
336
337
338
339
340
        if (ev.cuMask.size())
        {
#if !defined(__NVCC__)
          HIP_CALL(hipExtStreamCreateWithCUMask(&exeInfo.streams[i], ev.cuMask.size(), ev.cuMask.data()));
#endif
        }
        else
        {
          HIP_CALL(hipStreamCreate(&exeInfo.streams[i]));
        }
Gilbert Lee's avatar
Gilbert Lee committed
341
342
343
        HIP_CALL(hipEventCreate(&exeInfo.startEvents[i]));
        HIP_CALL(hipEventCreate(&exeInfo.stopEvents[i]));
      }
Gilbert Lee's avatar
Gilbert Lee committed
344

gilbertlee-amd's avatar
gilbertlee-amd committed
345
      if (exeType == EXE_GPU_GFX)
Gilbert Lee's avatar
Gilbert Lee committed
346
      {
gilbertlee-amd's avatar
gilbertlee-amd committed
347
348
        // Allocate one contiguous chunk of GPU memory for threadblock parameters
        // This allows support for executing one transfer per stream, or all transfers in a single stream
349
#if !defined(__NVCC__)
gilbertlee-amd's avatar
gilbertlee-amd committed
350
351
        AllocateMemory(MEM_GPU, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam),
                       (void**)&exeInfo.subExecParamGpu);
352
353
354
355
#else
        AllocateMemory(MEM_CPU, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam),
                       (void**)&exeInfo.subExecParamGpu);
#endif
Gilbert Lee's avatar
Gilbert Lee committed
356
357
358
      }
    }
  }
Gilbert Lee's avatar
Gilbert Lee committed
359

gilbertlee-amd's avatar
gilbertlee-amd committed
360
361
362
  if (verbose && !ev.outputToCsv) printf("Test %d:\n", testNum);

  // Prepare input memory and block parameters for current N
363
  bool isSrcCorrect = true;
gilbertlee-amd's avatar
gilbertlee-amd committed
364
  for (auto& exeInfoPair : transferMap)
Gilbert Lee's avatar
Gilbert Lee committed
365
  {
366
367
368
369
370
    Executor const& executor = exeInfoPair.first;
    ExecutorInfo& exeInfo    = exeInfoPair.second;
    ExeType const exeType    = executor.first;
    int     const exeIndex   = RemappedIndex(executor.second, IsCpuType(exeType));

gilbertlee-amd's avatar
gilbertlee-amd committed
371
372
    exeInfo.totalBytes = 0;
    for (int i = 0; i < exeInfo.transfers.size(); ++i)
Gilbert Lee's avatar
Gilbert Lee committed
373
    {
gilbertlee-amd's avatar
gilbertlee-amd committed
374
375
      // Prepare subarrays each threadblock works on and fill src memory with patterned data
      Transfer* transfer = exeInfo.transfers[i];
gilbertlee-amd's avatar
gilbertlee-amd committed
376
      transfer->PrepareSubExecParams(ev);
377
      isSrcCorrect &= transfer->PrepareSrc(ev);
gilbertlee-amd's avatar
gilbertlee-amd committed
378
      exeInfo.totalBytes += transfer->numBytesActual;
379
380
381
382
383
384
    }

    // Copy block parameters to GPU for GPU executors
    if (exeType == EXE_GPU_GFX)
    {
      std::vector<SubExecParam> tempSubExecParam;
Gilbert Lee's avatar
Gilbert Lee committed
385

386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
      if (!ev.useSingleStream || (ev.blockOrder == ORDER_SEQUENTIAL))
      {
        // Assign Transfers to sequentual threadblocks
        int transferOffset = 0;
        for (Transfer* transfer : exeInfo.transfers)
        {
          transfer->subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset;

          transfer->subExecIdx.clear();
          for (int subExecIdx = 0; subExecIdx < transfer->subExecParam.size(); subExecIdx++)
          {
            transfer->subExecIdx.push_back(transferOffset + subExecIdx);
            tempSubExecParam.push_back(transfer->subExecParam[subExecIdx]);
          }
          transferOffset += transfer->numSubExecs;
        }
      }
      else if (ev.blockOrder == ORDER_INTERLEAVED)
      {
        // Interleave threadblocks of different Transfers
        exeInfo.transfers[0]->subExecParamGpuPtr = exeInfo.subExecParamGpu;
        for (int subExecIdx = 0; tempSubExecParam.size() < exeInfo.totalSubExecs; ++subExecIdx)
        {
          for (Transfer* transfer : exeInfo.transfers)
          {
            if (subExecIdx < transfer->numSubExecs)
            {
              transfer->subExecIdx.push_back(tempSubExecParam.size());
              tempSubExecParam.push_back(transfer->subExecParam[subExecIdx]);
            }
          }
        }
      }
      else if (ev.blockOrder == ORDER_RANDOM)
Gilbert Lee's avatar
Gilbert Lee committed
420
      {
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
        std::vector<std::pair<int,int>> indices;
        exeInfo.transfers[0]->subExecParamGpuPtr = exeInfo.subExecParamGpu;

        // Build up a list of (transfer,subExecParam) indices, then randomly sort them
        for (int i = 0; i < exeInfo.transfers.size(); i++)
        {
          Transfer* transfer = exeInfo.transfers[i];
          for (int subExecIdx = 0; subExecIdx < transfer->numSubExecs; subExecIdx++)
            indices.push_back(std::make_pair(i, subExecIdx));
        }
        std::shuffle(indices.begin(), indices.end(), *ev.generator);

        // Build randomized threadblock list
        for (auto p : indices)
        {
          Transfer* transfer = exeInfo.transfers[p.first];
          transfer->subExecIdx.push_back(tempSubExecParam.size());
          tempSubExecParam.push_back(transfer->subExecParam[p.second]);
        }
Gilbert Lee's avatar
Gilbert Lee committed
440
      }
441
442
443
444
445
446
447

      HIP_CALL(hipSetDevice(exeIndex));
      HIP_CALL(hipMemcpy(exeInfo.subExecParamGpu,
                         tempSubExecParam.data(),
                         tempSubExecParam.size() * sizeof(SubExecParam),
                         hipMemcpyDefault));
      HIP_CALL(hipDeviceSynchronize());
Gilbert Lee's avatar
Gilbert Lee committed
448
    }
gilbertlee-amd's avatar
gilbertlee-amd committed
449
  }
Gilbert Lee's avatar
Gilbert Lee committed
450

gilbertlee-amd's avatar
gilbertlee-amd committed
451
452
453
454
  // Launch kernels (warmup iterations are not counted)
  double totalCpuTime = 0;
  size_t numTimedIterations = 0;
  std::stack<std::thread> threads;
455
  for (int iteration = -ev.numWarmups; isSrcCorrect; iteration++)
gilbertlee-amd's avatar
gilbertlee-amd committed
456
  {
gilbertlee-amd's avatar
gilbertlee-amd committed
457
    if (ev.numIterations > 0 && iteration    >= ev.numIterations) break;
gilbertlee-amd's avatar
gilbertlee-amd committed
458
    if (ev.numIterations < 0 && totalCpuTime > -ev.numIterations) break;
Gilbert Lee's avatar
Gilbert Lee committed
459

gilbertlee-amd's avatar
gilbertlee-amd committed
460
461
    // Pause before starting first timed iteration in interactive mode
    if (verbose && ev.useInteractive && iteration == 0)
Gilbert Lee's avatar
Gilbert Lee committed
462
    {
463
464
465
466
      printf("Memory prepared:\n");

      for (Transfer& transfer : transfers)
      {
gilbertlee-amd's avatar
gilbertlee-amd committed
467
468
469
470
471
        printf("Transfer %03d:\n", transfer.transferIndex);
        for (int iSrc = 0; iSrc < transfer.numSrcs; ++iSrc)
          printf("  SRC %0d: %p\n", iSrc, transfer.srcMem[iSrc]);
        for (int iDst = 0; iDst < transfer.numDsts; ++iDst)
          printf("  DST %0d: %p\n", iDst, transfer.dstMem[iDst]);
472
      }
gilbertlee-amd's avatar
gilbertlee-amd committed
473
      printf("Hit <Enter> to continue: ");
474
475
476
477
478
      if (scanf("%*c") != 0)
      {
        printf("[ERROR] Unexpected input\n");
        exit(1);
      }
Gilbert Lee's avatar
Gilbert Lee committed
479
480
      printf("\n");
    }
Gilbert Lee's avatar
Gilbert Lee committed
481

gilbertlee-amd's avatar
gilbertlee-amd committed
482
483
484
485
486
    // Start CPU timing for this iteration
    auto cpuStart = std::chrono::high_resolution_clock::now();

    // Execute all Transfers in parallel
    for (auto& exeInfoPair : transferMap)
487
    {
gilbertlee-amd's avatar
gilbertlee-amd committed
488
      ExecutorInfo& exeInfo = exeInfoPair.second;
gilbertlee-amd's avatar
gilbertlee-amd committed
489
490
491
      ExeType       exeType = exeInfoPair.first.first;
      int const numTransfersToRun = (exeType == EXE_GPU_GFX && ev.useSingleStream) ? 1 : exeInfo.transfers.size();

gilbertlee-amd's avatar
gilbertlee-amd committed
492
493
      for (int i = 0; i < numTransfersToRun; ++i)
        threads.push(std::thread(RunTransfer, std::ref(ev), iteration, std::ref(exeInfo), i));
494
    }
Gilbert Lee's avatar
Gilbert Lee committed
495

gilbertlee-amd's avatar
gilbertlee-amd committed
496
497
498
499
500
501
502
    // Wait for all threads to finish
    int const numTransfers = threads.size();
    for (int i = 0; i < numTransfers; i++)
    {
      threads.top().join();
      threads.pop();
    }
Gilbert Lee's avatar
Gilbert Lee committed
503

gilbertlee-amd's avatar
gilbertlee-amd committed
504
505
506
507
    // Stop CPU timing for this iteration
    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
    double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();

508
509
510
511
512
513
514
515
516
    if (ev.alwaysValidate)
    {
      for (auto transferPair : transferList)
      {
        Transfer* transfer = transferPair.second;
        transfer->ValidateDst(ev);
      }
    }

gilbertlee-amd's avatar
gilbertlee-amd committed
517
    if (iteration >= 0)
Gilbert Lee's avatar
Gilbert Lee committed
518
    {
gilbertlee-amd's avatar
gilbertlee-amd committed
519
520
521
522
      ++numTimedIterations;
      totalCpuTime += deltaSec;
    }
  }
Gilbert Lee's avatar
Gilbert Lee committed
523

gilbertlee-amd's avatar
gilbertlee-amd committed
524
  // Pause for interactive mode
525
  if (verbose && isSrcCorrect && ev.useInteractive)
gilbertlee-amd's avatar
gilbertlee-amd committed
526
527
  {
    printf("Transfers complete. Hit <Enter> to continue: ");
528
529
530
531
532
    if (scanf("%*c") != 0)
    {
      printf("[ERROR] Unexpected input\n");
      exit(1);
    }
gilbertlee-amd's avatar
gilbertlee-amd committed
533
534
    printf("\n");
  }
Gilbert Lee's avatar
Gilbert Lee committed
535

gilbertlee-amd's avatar
gilbertlee-amd committed
536
537
538
539
540
541
  // Validate that each transfer has transferred correctly
  size_t totalBytesTransferred = 0;
  int const numTransfers = transferList.size();
  for (auto transferPair : transferList)
  {
    Transfer* transfer = transferPair.second;
gilbertlee-amd's avatar
gilbertlee-amd committed
542
543
    transfer->ValidateDst(ev);
    totalBytesTransferred += transfer->numBytesActual;
gilbertlee-amd's avatar
gilbertlee-amd committed
544
  }
Gilbert Lee's avatar
Gilbert Lee committed
545

gilbertlee-amd's avatar
gilbertlee-amd committed
546
547
548
  // Report timings
  totalCpuTime = totalCpuTime / (1.0 * numTimedIterations) * 1000;
  double totalBandwidthGbs = (totalBytesTransferred / 1.0E6) / totalCpuTime;
gilbertlee-amd's avatar
gilbertlee-amd committed
549
550
  if (totalBandwidthCpu) *totalBandwidthCpu = totalBandwidthGbs;

gilbertlee-amd's avatar
gilbertlee-amd committed
551
  double maxGpuTime = 0;
Gilbert Lee's avatar
Gilbert Lee committed
552

553
  if (!isSrcCorrect) goto cleanup;
gilbertlee-amd's avatar
gilbertlee-amd committed
554
555
556
557
  if (ev.useSingleStream)
  {
    for (auto& exeInfoPair : transferMap)
    {
gilbertlee-amd's avatar
gilbertlee-amd committed
558
559
560
      ExecutorInfo  exeInfo  = exeInfoPair.second;
      ExeType const exeType  = exeInfoPair.first.first;
      int     const exeIndex = exeInfoPair.first.second;
Gilbert Lee's avatar
Gilbert Lee committed
561

gilbertlee-amd's avatar
gilbertlee-amd committed
562
563
      // Compute total time for non GPU executors
      if (exeType != EXE_GPU_GFX)
gilbertlee-amd's avatar
gilbertlee-amd committed
564
565
566
567
568
      {
        exeInfo.totalTime = 0;
        for (auto const& transfer : exeInfo.transfers)
          exeInfo.totalTime = std::max(exeInfo.totalTime, transfer->transferTime);
      }
569

gilbertlee-amd's avatar
gilbertlee-amd committed
570
571
572
      double exeDurationMsec = exeInfo.totalTime / (1.0 * numTimedIterations);
      double exeBandwidthGbs = (exeInfo.totalBytes / 1.0E9) / exeDurationMsec * 1000.0f;
      maxGpuTime = std::max(maxGpuTime, exeDurationMsec);
Gilbert Lee's avatar
Gilbert Lee committed
573

gilbertlee-amd's avatar
gilbertlee-amd committed
574
575
      if (verbose && !ev.outputToCsv)
      {
gilbertlee-amd's avatar
gilbertlee-amd committed
576
577
        printf(" Executor: %3s %02d | %7.3f GB/s | %8.3f ms | %12lu bytes\n",
               ExeTypeName[exeType], exeIndex, exeBandwidthGbs, exeDurationMsec, exeInfo.totalBytes);
Gilbert Lee's avatar
Gilbert Lee committed
578
      }
gilbertlee-amd's avatar
gilbertlee-amd committed
579
580
581

      int totalCUs = 0;
      for (auto const& transfer : exeInfo.transfers)
Gilbert Lee's avatar
Gilbert Lee committed
582
      {
583
        transfer->transferTime /= (1.0 * numTimedIterations);
584
585
        transfer->transferBandwidth = (transfer->numBytesActual / 1.0E9) / transfer->transferTime * 1000.0f;
        transfer->executorBandwidth = exeBandwidthGbs;
gilbertlee-amd's avatar
gilbertlee-amd committed
586
        totalCUs += transfer->numSubExecs;
gilbertlee-amd's avatar
gilbertlee-amd committed
587
588

        if (!verbose) continue;
Gilbert Lee's avatar
Gilbert Lee committed
589
590
        if (!ev.outputToCsv)
        {
gilbertlee-amd's avatar
gilbertlee-amd committed
591
          printf("     Transfer %02d  | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n",
Gilbert Lee's avatar
Gilbert Lee committed
592
                 transfer->transferIndex,
593
                 transfer->transferBandwidth,
594
                 transfer->transferTime,
gilbertlee-amd's avatar
gilbertlee-amd committed
595
596
597
598
599
                 transfer->numBytesActual,
                 transfer->SrcToStr().c_str(),
                 ExeTypeName[transfer->exeType], transfer->exeIndex,
                 transfer->numSubExecs,
                 transfer->DstToStr().c_str());
600
601
602
603
604
605
606
607
608

          if (ev.showIterations)
          {
            std::set<std::pair<double, int>> times;
            double stdDevTime = 0;
            double stdDevBw = 0;
            for (int i = 0; i < numTimedIterations; i++)
            {
              times.insert(std::make_pair(transfer->perIterationTime[i], i+1));
609
              double const varTime = fabs(transfer->transferTime - transfer->perIterationTime[i]);
610
611
612
              stdDevTime += varTime * varTime;

              double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transfer->perIterationTime[i] * 1000.0f;
613
              double const varBw = fabs(iterBandwidthGbs - transfer->transferBandwidth);
614
615
616
617
618
619
620
621
622
              stdDevBw += varBw * varBw;
            }
            stdDevTime = sqrt(stdDevTime / numTimedIterations);
            stdDevBw = sqrt(stdDevBw / numTimedIterations);

            for (auto t : times)
            {
              double iterDurationMsec = t.first;
              double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f;
gilbertlee-amd's avatar
gilbertlee-amd committed
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
              printf("      Iter %03d    | %7.3f GB/s | %8.3f ms |", t.second, iterBandwidthGbs, iterDurationMsec);

              std::set<int> usedXccs;
              if (t.second - 1 < transfer->perIterationCUs.size())
              {
                printf(" CUs:");
                for (auto x : transfer->perIterationCUs[t.second - 1])
                {
                  printf(" %02d:%02d", x.first, x.second);
                  usedXccs.insert(x.first);
                }
              }
              printf(" XCCs:");
              for (auto x : usedXccs)
                printf(" %02d", x);
              printf("\n");
639
640
641
            }
            printf("      StandardDev | %7.3f GB/s | %8.3f ms |\n", stdDevBw, stdDevTime);
          }
Gilbert Lee's avatar
Gilbert Lee committed
642
643
644
        }
        else
        {
gilbertlee-amd's avatar
gilbertlee-amd committed
645
646
647
648
649
650
          printf("%d,%d,%lu,%s,%c%02d,%s,%d,%.3f,%.3f,%s,%s\n",
                 testNum, transfer->transferIndex, transfer->numBytesActual,
                 transfer->SrcToStr().c_str(),
                 MemTypeStr[transfer->exeType], transfer->exeIndex,
                 transfer->DstToStr().c_str(),
                 transfer->numSubExecs,
651
                 transfer->transferBandwidth, transfer->transferTime,
gilbertlee-amd's avatar
gilbertlee-amd committed
652
653
                 PtrVectorToStr(transfer->srcMem, initOffset).c_str(),
                 PtrVectorToStr(transfer->dstMem, initOffset).c_str());
Gilbert Lee's avatar
Gilbert Lee committed
654
        }
Gilbert Lee's avatar
Gilbert Lee committed
655
      }
gilbertlee-amd's avatar
gilbertlee-amd committed
656
657
658

      if (verbose && ev.outputToCsv)
      {
gilbertlee-amd's avatar
gilbertlee-amd committed
659
        printf("%d,ALL,%lu,ALL,%c%02d,ALL,%d,%.3f,%.3f,ALL,ALL\n",
gilbertlee-amd's avatar
gilbertlee-amd committed
660
               testNum, totalBytesTransferred,
gilbertlee-amd's avatar
gilbertlee-amd committed
661
               MemTypeStr[exeType], exeIndex, totalCUs,
gilbertlee-amd's avatar
gilbertlee-amd committed
662
663
               exeBandwidthGbs, exeDurationMsec);
      }
Gilbert Lee's avatar
Gilbert Lee committed
664
    }
gilbertlee-amd's avatar
gilbertlee-amd committed
665
666
667
668
669
670
  }
  else
  {
    for (auto const& transferPair : transferList)
    {
      Transfer* transfer = transferPair.second;
671
      transfer->transferTime /= (1.0 * numTimedIterations);
672
673
      transfer->transferBandwidth = (transfer->numBytesActual / 1.0E9) / transfer->transferTime * 1000.0f;
      transfer->executorBandwidth = transfer->transferBandwidth;
674
      maxGpuTime = std::max(maxGpuTime, transfer->transferTime);
gilbertlee-amd's avatar
gilbertlee-amd committed
675
676
677
      if (!verbose) continue;
      if (!ev.outputToCsv)
      {
gilbertlee-amd's avatar
gilbertlee-amd committed
678
        printf(" Transfer %02d      | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n",
gilbertlee-amd's avatar
gilbertlee-amd committed
679
               transfer->transferIndex,
680
               transfer->transferBandwidth, transfer->transferTime,
gilbertlee-amd's avatar
gilbertlee-amd committed
681
682
683
684
685
               transfer->numBytesActual,
               transfer->SrcToStr().c_str(),
               ExeTypeName[transfer->exeType], transfer->exeIndex,
               transfer->numSubExecs,
               transfer->DstToStr().c_str());
686
687
688
689
690
691
692
693
694

        if (ev.showIterations)
        {
            std::set<std::pair<double, int>> times;
            double stdDevTime = 0;
            double stdDevBw = 0;
            for (int i = 0; i < numTimedIterations; i++)
            {
              times.insert(std::make_pair(transfer->perIterationTime[i], i+1));
695
              double const varTime = fabs(transfer->transferTime - transfer->perIterationTime[i]);
696
697
698
              stdDevTime += varTime * varTime;

              double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transfer->perIterationTime[i] * 1000.0f;
699
              double const varBw = fabs(iterBandwidthGbs - transfer->transferBandwidth);
700
701
702
703
704
705
706
707
708
              stdDevBw += varBw * varBw;
            }
            stdDevTime = sqrt(stdDevTime / numTimedIterations);
            stdDevBw = sqrt(stdDevBw / numTimedIterations);

            for (auto t : times)
            {
              double iterDurationMsec = t.first;
              double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f;
709
              printf("      Iter %03d    | %7.3f GB/s | %8.3f ms |", t.second, iterBandwidthGbs, iterDurationMsec);
gilbertlee-amd's avatar
gilbertlee-amd committed
710
              std::set<int> usedXccs;
711
712
713
714
              if (t.second - 1 < transfer->perIterationCUs.size())
              {
                printf(" CUs:");
                for (auto x : transfer->perIterationCUs[t.second - 1])
gilbertlee-amd's avatar
gilbertlee-amd committed
715
716
717
718
                {
                  printf(" %02d:%02d", x.first, x.second);
                  usedXccs.insert(x.first);
                }
719
              }
gilbertlee-amd's avatar
gilbertlee-amd committed
720
721
722
              printf(" XCCs:");
              for (auto x : usedXccs)
                printf(" %d", x);
723
              printf("\n");
724
725
726
            }
            printf("      StandardDev | %7.3f GB/s | %8.3f ms |\n", stdDevBw, stdDevTime);
        }
gilbertlee-amd's avatar
gilbertlee-amd committed
727
728
729
      }
      else
      {
gilbertlee-amd's avatar
gilbertlee-amd committed
730
731
732
733
734
735
        printf("%d,%d,%lu,%s,%s%02d,%s,%d,%.3f,%.3f,%s,%s\n",
               testNum, transfer->transferIndex, transfer->numBytesActual,
               transfer->SrcToStr().c_str(),
               ExeTypeName[transfer->exeType], transfer->exeIndex,
               transfer->DstToStr().c_str(),
               transfer->numSubExecs,
736
               transfer->transferBandwidth, transfer->transferTime,
gilbertlee-amd's avatar
gilbertlee-amd committed
737
738
               PtrVectorToStr(transfer->srcMem, initOffset).c_str(),
               PtrVectorToStr(transfer->dstMem, initOffset).c_str());
gilbertlee-amd's avatar
gilbertlee-amd committed
739
740
741
      }
    }
  }
Gilbert Lee's avatar
Gilbert Lee committed
742

gilbertlee-amd's avatar
gilbertlee-amd committed
743
744
745
  // Display aggregate statistics
  if (verbose)
  {
Gilbert Lee's avatar
Gilbert Lee committed
746
    if (!ev.outputToCsv)
Gilbert Lee's avatar
Gilbert Lee committed
747
    {
gilbertlee-amd's avatar
gilbertlee-amd committed
748
      printf(" Aggregate (CPU)  | %7.3f GB/s | %8.3f ms | %12lu bytes | Overhead: %.3f ms\n",
749
             totalBandwidthGbs, totalCpuTime, totalBytesTransferred, totalCpuTime - maxGpuTime);
Gilbert Lee's avatar
Gilbert Lee committed
750
751
752
    }
    else
    {
gilbertlee-amd's avatar
gilbertlee-amd committed
753
      printf("%d,ALL,%lu,ALL,ALL,ALL,ALL,%.3f,%.3f,ALL,ALL\n",
754
             testNum, totalBytesTransferred, totalBandwidthGbs, totalCpuTime);
Gilbert Lee's avatar
Gilbert Lee committed
755
756
    }
  }
Gilbert Lee's avatar
Gilbert Lee committed
757

Gilbert Lee's avatar
Gilbert Lee committed
758
  // Release GPU memory
759
cleanup:
Gilbert Lee's avatar
Gilbert Lee committed
760
761
  for (auto exeInfoPair : transferMap)
  {
gilbertlee-amd's avatar
gilbertlee-amd committed
762
763
764
765
    ExecutorInfo& exeInfo  = exeInfoPair.second;
    ExeType const exeType  = exeInfoPair.first.first;
    int     const exeIndex = RemappedIndex(exeInfoPair.first.second, IsCpuType(exeType));

Gilbert Lee's avatar
Gilbert Lee committed
766
767
    for (auto& transfer : exeInfo.transfers)
    {
gilbertlee-amd's avatar
gilbertlee-amd committed
768
769
770
771
772
773
774
775
776
777
778
      for (int iSrc = 0; iSrc < transfer->numSrcs; ++iSrc)
      {
        MemType const& srcType = transfer->srcType[iSrc];
        DeallocateMemory(srcType, transfer->srcMem[iSrc], transfer->numBytesActual + ev.byteOffset);
      }
      for (int iDst = 0; iDst < transfer->numDsts; ++iDst)
      {
        MemType const& dstType = transfer->dstType[iDst];
        DeallocateMemory(dstType, transfer->dstMem[iDst], transfer->numBytesActual + ev.byteOffset);
      }
      transfer->subExecParam.clear();
Gilbert Lee's avatar
Gilbert Lee committed
779
780
    }

gilbertlee-amd's avatar
gilbertlee-amd committed
781
    if (IsGpuType(exeType))
Gilbert Lee's avatar
Gilbert Lee committed
782
    {
gilbertlee-amd's avatar
gilbertlee-amd committed
783
784
      int const numStreams = (int)exeInfo.streams.size();
      for (int i = 0; i < numStreams; ++i)
Gilbert Lee's avatar
Gilbert Lee committed
785
      {
Gilbert Lee's avatar
Gilbert Lee committed
786
787
788
        HIP_CALL(hipEventDestroy(exeInfo.startEvents[i]));
        HIP_CALL(hipEventDestroy(exeInfo.stopEvents[i]));
        HIP_CALL(hipStreamDestroy(exeInfo.streams[i]));
Gilbert Lee's avatar
Gilbert Lee committed
789
      }
gilbertlee-amd's avatar
gilbertlee-amd committed
790
791
792

      if (exeType == EXE_GPU_GFX)
      {
793
#if !defined(__NVCC__)
gilbertlee-amd's avatar
gilbertlee-amd committed
794
        DeallocateMemory(MEM_GPU, exeInfo.subExecParamGpu);
795
796
797
#else
        DeallocateMemory(MEM_CPU, exeInfo.subExecParamGpu);
#endif
gilbertlee-amd's avatar
gilbertlee-amd committed
798
      }
Gilbert Lee's avatar
Gilbert Lee committed
799
800
801
802
803
804
    }
  }
}

void DisplayUsage(char const* cmdName)
{
Gilbert Lee's avatar
Gilbert Lee committed
805
  printf("TransferBench v%s\n", TB_VERSION);
Gilbert Lee's avatar
Gilbert Lee committed
806
807
808
809
810
811
812
813
814
815
816
817
818
  printf("========================================\n");

  if (numa_available() == -1)
  {
    printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
    exit(1);
  }
  int numGpuDevices;
  HIP_CALL(hipGetDeviceCount(&numGpuDevices));
  int const numCpuDevices = numa_num_configured_nodes();

  printf("Usage: %s config <N>\n", cmdName);
  printf("  config: Either:\n");
Gilbert Lee's avatar
Gilbert Lee committed
819
  printf("          - Filename of configFile containing Transfers to execute (see example.cfg for format)\n");
gilbertlee-amd's avatar
gilbertlee-amd committed
820
821
822
  printf("          - Name of preset config:\n");
  printf("              p2p          - Peer-to-peer benchmark tests\n");
  printf("              sweep/rsweep - Sweep/random sweep across possible sets of Transfers\n");
823
824
825
826
827
  printf("                             - 3rd optional arg: # GPU SubExecs per Transfer\n");
  printf("                             - 4th optional arg: # CPU SubExecs per Transfer\n");
  printf("              scaling      - GPU SubExec scaling copy test\n");
  printf("                             - 3th optional arg: Max # of SubExecs to use\n");
  printf("                             - 4rd optional arg: GPU index to use as executor\n");
gilbertlee-amd's avatar
gilbertlee-amd committed
828
829
  printf("              a2a          - GPU All-To-All benchmark\n");
  printf("                             - 3rd optional arg: # of SubExecs to use\n");
830
  printf("              cmdline      - Read Transfers from command line arguments (after N)\n");
Gilbert Lee's avatar
Gilbert Lee committed
831
  printf("  N     : (Optional) Number of bytes to copy per Transfer.\n");
Gilbert Lee's avatar
Gilbert Lee committed
832
  printf("          If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
Gilbert Lee's avatar
Gilbert Lee committed
833
         DEFAULT_BYTES_PER_TRANSFER);
Gilbert Lee's avatar
Gilbert Lee committed
834
835
836
837
838
839
840
  printf("          If 0 is specified, a range of Ns will be benchmarked\n");
  printf("          May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n");
  printf("\n");

  EnvVars::DisplayUsage();
}

gilbertlee-amd's avatar
gilbertlee-amd committed
841
int RemappedIndex(int const origIdx, bool const isCpuType)
Gilbert Lee's avatar
Gilbert Lee committed
842
{
843
844
  static std::vector<int> remappingCpu;
  static std::vector<int> remappingGpu;
Gilbert Lee's avatar
Gilbert Lee committed
845

846
847
848
849
850
851
852
853
  // Build CPU remapping on first use
  // Skip numa nodes that are not configured
  if (remappingCpu.empty())
  {
    for (int node = 0; node <= numa_max_node(); node++)
      if (numa_bitmask_isbitset(numa_get_mems_allowed(), node))
        remappingCpu.push_back(node);
  }
Gilbert Lee's avatar
Gilbert Lee committed
854

855
856
  // Build remappingGpu on first use
  if (remappingGpu.empty())
Gilbert Lee's avatar
Gilbert Lee committed
857
858
859
  {
    int numGpuDevices;
    HIP_CALL(hipGetDeviceCount(&numGpuDevices));
860
    remappingGpu.resize(numGpuDevices);
Gilbert Lee's avatar
Gilbert Lee committed
861
862
863
864

    int const usePcieIndexing = getenv("USE_PCIE_INDEX") ? atoi(getenv("USE_PCIE_INDEX")) : 0;
    if (!usePcieIndexing)
    {
865
      // For HIP-based indexing no remappingGpu is necessary
Gilbert Lee's avatar
Gilbert Lee committed
866
      for (int i = 0; i < numGpuDevices; ++i)
867
        remappingGpu[i] = i;
Gilbert Lee's avatar
Gilbert Lee committed
868
869
870
871
872
873
874
875
876
877
878
879
880
881
    }
    else
    {
      // Collect PCIe address for each GPU
      std::vector<std::pair<std::string, int>> mapping;
      char pciBusId[20];
      for (int i = 0; i < numGpuDevices; ++i)
      {
        HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i));
        mapping.push_back(std::make_pair(pciBusId, i));
      }
      // Sort GPUs by PCIe address then use that as mapping
      std::sort(mapping.begin(), mapping.end());
      for (int i = 0; i < numGpuDevices; ++i)
882
        remappingGpu[i] = mapping[i].second;
Gilbert Lee's avatar
Gilbert Lee committed
883
884
    }
  }
gilbertlee-amd's avatar
gilbertlee-amd committed
885
  return isCpuType ? remappingCpu[origIdx] : remappingGpu[origIdx];
Gilbert Lee's avatar
Gilbert Lee committed
886
887
888
889
}

void DisplayTopology(bool const outputToCsv)
{
890

891
  int numCpuDevices = numa_num_configured_nodes();
Gilbert Lee's avatar
Gilbert Lee committed
892
893
894
895
896
  int numGpuDevices;
  HIP_CALL(hipGetDeviceCount(&numGpuDevices));

  if (outputToCsv)
  {
897
    printf("NumCpus,%d\n", numCpuDevices);
Gilbert Lee's avatar
Gilbert Lee committed
898
    printf("NumGpus,%d\n", numGpuDevices);
899
900
901
  }
  else
  {
902
903
    printf("\nDetected topology: %d configured CPU NUMA node(s) [%d total]   %d GPU device(s)\n",
           numa_num_configured_nodes(), numa_max_node() + 1, numGpuDevices);
904
905
906
907
908
909
910
911
  }

  // Print out detected CPU topology
  if (outputToCsv)
  {
    printf("NUMA");
    for (int j = 0; j < numCpuDevices; j++)
      printf(",NUMA%02d", j);
912
    printf(",# CPUs,ClosestGPUs,ActualNode\n");
913
914
915
  }
  else
  {
916
    printf("            |");
917
    for (int j = 0; j < numCpuDevices; j++)
918
919
920
921
      printf("NUMA %02d|", j);
    printf(" #Cpus | Closest GPU(s)\n");

    printf("------------+");
922
    for (int j = 0; j <= numCpuDevices; j++)
923
924
      printf("-------+");
    printf("---------------\n");
925
926
927
928
  }

  for (int i = 0; i < numCpuDevices; i++)
  {
gilbertlee-amd's avatar
gilbertlee-amd committed
929
    int nodeI = RemappedIndex(i, true);
930
    printf("NUMA %02d (%02d)%s", i, nodeI, outputToCsv ? "," : "|");
931
932
    for (int j = 0; j < numCpuDevices; j++)
    {
gilbertlee-amd's avatar
gilbertlee-amd committed
933
      int nodeJ = RemappedIndex(j, true);
934
      int numaDist = numa_distance(nodeI, nodeJ);
935
      if (outputToCsv)
gilbertlee-amd's avatar
gilbertlee-amd committed
936
        printf("%d,", numaDist);
937
      else
938
        printf(" %5d |", numaDist);
939
940
941
942
    }

    int numCpus = 0;
    for (int j = 0; j < numa_num_configured_cpus(); j++)
943
      if (numa_node_of_cpu(j) == nodeI) numCpus++;
944
945
946
    if (outputToCsv)
      printf("%d,", numCpus);
    else
947
      printf(" %5d | ", numCpus);
948

949
#if !defined(__NVCC__)
950
951
952
    bool isFirst = true;
    for (int j = 0; j < numGpuDevices; j++)
    {
gilbertlee-amd's avatar
gilbertlee-amd committed
953
      if (GetClosestNumaNode(RemappedIndex(j, false)) == i)
954
955
      {
        if (isFirst) isFirst = false;
gilbertlee-amd's avatar
gilbertlee-amd committed
956
957
        else printf(",");
        printf("%d", j);
958
959
      }
    }
960
#endif
961
962
963
964
    printf("\n");
  }
  printf("\n");

965
966
967
968
969
#if defined(__NVCC__)
  // No further topology detection done for NVIDIA platforms
  return;
#endif

970
971
972
  // Print out detected GPU topology
  if (outputToCsv)
  {
Gilbert Lee's avatar
Gilbert Lee committed
973
974
975
976
977
978
979
    printf("GPU");
    for (int j = 0; j < numGpuDevices; j++)
      printf(",GPU %02d", j);
    printf(",PCIe Bus ID,ClosestNUMA\n");
  }
  else
  {
gilbertlee-amd's avatar
gilbertlee-amd committed
980
981
982
983
984
985
986
987
988
989
    printf("        |");
    for (int j = 0; j < numGpuDevices; j++)
    {
      hipDeviceProp_t prop;
      HIP_CALL(hipGetDeviceProperties(&prop, j));
      std::string fullName = prop.gcnArchName;
      std::string archName = fullName.substr(0, fullName.find(':'));
      printf(" %6s |", archName.c_str());
    }
    printf("\n");
Gilbert Lee's avatar
Gilbert Lee committed
990
991
992
    printf("        |");
    for (int j = 0; j < numGpuDevices; j++)
      printf(" GPU %02d |", j);
gilbertlee-amd's avatar
gilbertlee-amd committed
993
    printf(" PCIe Bus ID  | #CUs | Closest NUMA\n");
Gilbert Lee's avatar
Gilbert Lee committed
994
995
    for (int j = 0; j <= numGpuDevices; j++)
      printf("--------+");
gilbertlee-amd's avatar
gilbertlee-amd committed
996
    printf("--------------+------+-------------\n");
Gilbert Lee's avatar
Gilbert Lee committed
997
998
  }

999
#if !defined(__NVCC__)
Gilbert Lee's avatar
Gilbert Lee committed
1000
1001
1002
  char pciBusId[20];
  for (int i = 0; i < numGpuDevices; i++)
  {
gilbertlee-amd's avatar
gilbertlee-amd committed
1003
    int const deviceIdx = RemappedIndex(i, false);
Gilbert Lee's avatar
Gilbert Lee committed
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
    printf("%sGPU %02d%s", outputToCsv ? "" : " ", i, outputToCsv ? "," : " |");
    for (int j = 0; j < numGpuDevices; j++)
    {
      if (i == j)
      {
        if (outputToCsv)
          printf("-,");
        else
          printf("    -   |");
      }
      else
      {
        uint32_t linkType, hopCount;
gilbertlee-amd's avatar
gilbertlee-amd committed
1017
1018
        HIP_CALL(hipExtGetLinkTypeAndHopCount(deviceIdx,
                                              RemappedIndex(j, false),
Gilbert Lee's avatar
Gilbert Lee committed
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
                                              &linkType, &hopCount));
        printf("%s%s-%d%s",
               outputToCsv ? "" : " ",
               linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? "  HT" :
               linkType == HSA_AMD_LINK_INFO_TYPE_QPI            ? " QPI" :
               linkType == HSA_AMD_LINK_INFO_TYPE_PCIE           ? "PCIE" :
               linkType == HSA_AMD_LINK_INFO_TYPE_INFINBAND      ? "INFB" :
               linkType == HSA_AMD_LINK_INFO_TYPE_XGMI           ? "XGMI" : "????",
               hopCount, outputToCsv ? "," : " |");
      }
    }
gilbertlee-amd's avatar
gilbertlee-amd committed
1030
1031
1032
1033
1034
    HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, deviceIdx));

    int numDeviceCUs = 0;
    HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, deviceIdx));

Gilbert Lee's avatar
Gilbert Lee committed
1035
    if (outputToCsv)
gilbertlee-amd's avatar
gilbertlee-amd committed
1036
      printf("%s,%d,%d\n", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx));
Gilbert Lee's avatar
Gilbert Lee committed
1037
    else
gilbertlee-amd's avatar
gilbertlee-amd committed
1038
      printf(" %11s | %4d | %d\n", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx));
Gilbert Lee's avatar
Gilbert Lee committed
1039
  }
1040
#endif
Gilbert Lee's avatar
Gilbert Lee committed
1041
1042
}

1043
void ParseMemType(EnvVars const& ev, std::string const& token,
gilbertlee-amd's avatar
gilbertlee-amd committed
1044
                  std::vector<MemType>& memTypes, std::vector<int>& memIndices)
Gilbert Lee's avatar
Gilbert Lee committed
1045
1046
{
  char typeChar;
gilbertlee-amd's avatar
gilbertlee-amd committed
1047
1048
  int offset = 0, devIndex, inc;
  bool found = false;
Gilbert Lee's avatar
Gilbert Lee committed
1049

gilbertlee-amd's avatar
gilbertlee-amd committed
1050
1051
1052
  memTypes.clear();
  memIndices.clear();
  while (sscanf(token.c_str() + offset, " %c %d%n", &typeChar, &devIndex, &inc) == 2)
Gilbert Lee's avatar
Gilbert Lee committed
1053
  {
gilbertlee-amd's avatar
gilbertlee-amd committed
1054
1055
1056
    offset += inc;
    MemType memType = CharToMemType(typeChar);

1057
    if (IsCpuType(memType) && (devIndex < 0 || devIndex >= ev.numCpuDevices))
Gilbert Lee's avatar
Gilbert Lee committed
1058
    {
1059
      printf("[ERROR] CPU index must be between 0 and %d (instead of %d)\n", ev.numCpuDevices-1, devIndex);
Gilbert Lee's avatar
Gilbert Lee committed
1060
1061
      exit(1);
    }
1062
    if (IsGpuType(memType) && (devIndex < 0 || devIndex >= ev.numGpuDevices))
Gilbert Lee's avatar
Gilbert Lee committed
1063
    {
1064
      printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", ev.numGpuDevices-1, devIndex);
Gilbert Lee's avatar
Gilbert Lee committed
1065
1066
      exit(1);
    }
gilbertlee-amd's avatar
gilbertlee-amd committed
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082

    found = true;
    if (memType != MEM_NULL)
    {
      memTypes.push_back(memType);
      memIndices.push_back(devIndex);
    }
  }
  if (!found)
  {
    printf("[ERROR] Unable to parse memory type token %s.  Expected one of %s followed by an index\n",
           token.c_str(), MemTypeStr);
    exit(1);
  }
}

1083
1084
void ParseExeType(EnvVars const& ev, std::string const& token,
                  ExeType &exeType, int& exeIndex, int& exeSubIndex)
gilbertlee-amd's avatar
gilbertlee-amd committed
1085
1086
{
  char typeChar;
1087
1088
1089
  exeSubIndex = -1;
  int numTokensParsed = sscanf(token.c_str(), " %c%d.%d", &typeChar, &exeIndex, &exeSubIndex);
  if (numTokensParsed < 2)
gilbertlee-amd's avatar
gilbertlee-amd committed
1090
1091
1092
1093
1094
1095
1096
  {
    printf("[ERROR] Unable to parse valid executor token (%s).  Exepected one of %s followed by an index\n",
           token.c_str(), ExeTypeStr);
    exit(1);
  }
  exeType = CharToExeType(typeChar);

1097
  if (IsCpuType(exeType) && (exeIndex < 0 || exeIndex >= ev.numCpuDevices))
gilbertlee-amd's avatar
gilbertlee-amd committed
1098
  {
1099
    printf("[ERROR] CPU index must be between 0 and %d (instead of %d)\n", ev.numCpuDevices-1, exeIndex);
gilbertlee-amd's avatar
gilbertlee-amd committed
1100
1101
    exit(1);
  }
1102
  if (IsGpuType(exeType) && (exeIndex < 0 || exeIndex >= ev.numGpuDevices))
gilbertlee-amd's avatar
gilbertlee-amd committed
1103
  {
1104
    printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", ev.numGpuDevices-1, exeIndex);
Gilbert Lee's avatar
Gilbert Lee committed
1105
1106
    exit(1);
  }
1107
1108
1109
1110
1111
1112
1113
1114
1115
  if (exeType == EXE_GPU_GFX && exeSubIndex != -1)
  {
    int const idx = RemappedIndex(exeIndex, false);
    if (ev.xccIdsPerDevice[idx].count(exeSubIndex) == 0)
    {
      printf("[ERROR] GPU %d does not have subIndex %d\n", exeIndex, exeSubIndex);
      exit(1);
    }
  }
Gilbert Lee's avatar
Gilbert Lee committed
1116
1117
}

Gilbert Lee's avatar
Gilbert Lee committed
1118
// Helper function to parse a list of Transfer definitions
1119
void ParseTransfers(EnvVars const& ev, char* line, std::vector<Transfer>& transfers)
Gilbert Lee's avatar
Gilbert Lee committed
1120
1121
1122
1123
1124
{
  // Replace any round brackets or '->' with spaces,
  for (int i = 1; line[i]; i++)
    if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == '>' ) line[i] = ' ';

Gilbert Lee's avatar
Gilbert Lee committed
1125
  transfers.clear();
Gilbert Lee's avatar
Gilbert Lee committed
1126

Gilbert Lee's avatar
Gilbert Lee committed
1127
  int numTransfers = 0;
Gilbert Lee's avatar
Gilbert Lee committed
1128
  std::istringstream iss(line);
Gilbert Lee's avatar
Gilbert Lee committed
1129
  iss >> numTransfers;
Gilbert Lee's avatar
Gilbert Lee committed
1130
1131
1132
1133
1134
  if (iss.fail()) return;

  std::string exeMem;
  std::string srcMem;
  std::string dstMem;
Gilbert Lee's avatar
Gilbert Lee committed
1135

gilbertlee-amd's avatar
gilbertlee-amd committed
1136
  // If numTransfers < 0, read 5-tuple (srcMem, exeMem, dstMem, #CUs, #Bytes)
Gilbert Lee's avatar
Gilbert Lee committed
1137
  // otherwise read triples (srcMem, exeMem, dstMem)
gilbertlee-amd's avatar
gilbertlee-amd committed
1138
  bool const advancedMode = (numTransfers < 0);
Gilbert Lee's avatar
Gilbert Lee committed
1139
1140
  numTransfers = abs(numTransfers);

gilbertlee-amd's avatar
gilbertlee-amd committed
1141
  int numSubExecs;
gilbertlee-amd's avatar
gilbertlee-amd committed
1142
  if (!advancedMode)
Gilbert Lee's avatar
Gilbert Lee committed
1143
  {
gilbertlee-amd's avatar
gilbertlee-amd committed
1144
1145
    iss >> numSubExecs;
    if (numSubExecs <= 0 || iss.fail())
Gilbert Lee's avatar
Gilbert Lee committed
1146
    {
gilbertlee-amd's avatar
gilbertlee-amd committed
1147
      printf("Parsing error: Number of blocks to use (%d) must be greater than 0\n", numSubExecs);
Gilbert Lee's avatar
Gilbert Lee committed
1148
1149
1150
1151
      exit(1);
    }
  }

gilbertlee-amd's avatar
gilbertlee-amd committed
1152
  size_t numBytes = 0;
Gilbert Lee's avatar
Gilbert Lee committed
1153
1154
1155
  for (int i = 0; i < numTransfers; i++)
  {
    Transfer transfer;
gilbertlee-amd's avatar
gilbertlee-amd committed
1156
    transfer.numBytes = 0;
gilbertlee-amd's avatar
gilbertlee-amd committed
1157
    transfer.numBytesActual = 0;
gilbertlee-amd's avatar
gilbertlee-amd committed
1158
    if (!advancedMode)
Gilbert Lee's avatar
Gilbert Lee committed
1159
    {
gilbertlee-amd's avatar
gilbertlee-amd committed
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
      iss >> srcMem >> exeMem >> dstMem;
      if (iss.fail())
      {
        printf("Parsing error: Unable to read valid Transfer %d (SRC EXE DST) triplet\n", i+1);
        exit(1);
      }
    }
    else
    {
      std::string numBytesToken;
gilbertlee-amd's avatar
gilbertlee-amd committed
1170
      iss >> srcMem >> exeMem >> dstMem >> numSubExecs >> numBytesToken;
gilbertlee-amd's avatar
gilbertlee-amd committed
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
      if (iss.fail())
      {
        printf("Parsing error: Unable to read valid Transfer %d (SRC EXE DST #CU #Bytes) tuple\n", i+1);
        exit(1);
      }
      if (sscanf(numBytesToken.c_str(), "%lu", &numBytes) != 1)
      {
        printf("Parsing error: '%s' is not a valid expression of numBytes for Transfer %d\n", numBytesToken.c_str(), i+1);
        exit(1);
      }
      char units = numBytesToken.back();
gilbertlee-amd's avatar
gilbertlee-amd committed
1182
      switch (toupper(units))
gilbertlee-amd's avatar
gilbertlee-amd committed
1183
      {
gilbertlee-amd's avatar
gilbertlee-amd committed
1184
1185
1186
      case 'K': numBytes *= 1024; break;
      case 'M': numBytes *= 1024*1024; break;
      case 'G': numBytes *= 1024*1024*1024; break;
gilbertlee-amd's avatar
gilbertlee-amd committed
1187
      }
Gilbert Lee's avatar
Gilbert Lee committed
1188
    }
Gilbert Lee's avatar
Gilbert Lee committed
1189

1190
1191
1192
    ParseMemType(ev, srcMem, transfer.srcType, transfer.srcIndex);
    ParseMemType(ev, dstMem, transfer.dstType, transfer.dstIndex);
    ParseExeType(ev, exeMem, transfer.exeType, transfer.exeIndex, transfer.exeSubIndex);
gilbertlee-amd's avatar
gilbertlee-amd committed
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208

    transfer.numSrcs = (int)transfer.srcType.size();
    transfer.numDsts = (int)transfer.dstType.size();
    if (transfer.numSrcs == 0 && transfer.numDsts == 0)
    {
      printf("[ERROR] Transfer must have at least one src or dst\n");
      exit(1);
    }

    if (transfer.exeType == EXE_GPU_DMA && (transfer.numSrcs > 1 || transfer.numDsts > 1))
    {
      printf("[ERROR] GPU DMA executor can only be used for single source / single dst Transfers\n");
      exit(1);
    }

    transfer.numSubExecs = numSubExecs;
gilbertlee-amd's avatar
gilbertlee-amd committed
1209
    transfer.numBytes = numBytes;
Gilbert Lee's avatar
Gilbert Lee committed
1210
    transfers.push_back(transfer);
Gilbert Lee's avatar
Gilbert Lee committed
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
  }
}

void EnablePeerAccess(int const deviceId, int const peerDeviceId)
{
  int canAccess;
  HIP_CALL(hipDeviceCanAccessPeer(&canAccess, deviceId, peerDeviceId));
  if (!canAccess)
  {
    printf("[ERROR] Unable to enable peer access from GPU devices %d to %d\n", peerDeviceId, deviceId);
    exit(1);
  }
  HIP_CALL(hipSetDevice(deviceId));
Gilbert Lee's avatar
Gilbert Lee committed
1224
1225
1226
1227
1228
1229
1230
  hipError_t error = hipDeviceEnablePeerAccess(peerDeviceId, 0);
  if (error != hipSuccess && error != hipErrorPeerAccessAlreadyEnabled)
  {
    printf("[ERROR] Unable to enable peer to peer access from %d to %d (%s)\n",
           deviceId, peerDeviceId, hipGetErrorString(error));
    exit(1);
  }
Gilbert Lee's avatar
Gilbert Lee committed
1231
1232
1233
1234
1235
1236
1237
1238
1239
}

void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr)
{
  if (numBytes == 0)
  {
    printf("[ERROR] Unable to allocate 0 bytes\n");
    exit(1);
  }
gilbertlee-amd's avatar
gilbertlee-amd committed
1240
  *memPtr = nullptr;
gilbertlee-amd's avatar
gilbertlee-amd committed
1241
  if (IsCpuType(memType))
Gilbert Lee's avatar
Gilbert Lee committed
1242
1243
  {
    // Set numa policy prior to call to hipHostMalloc
1244
    numa_set_preferred(devIndex);
Gilbert Lee's avatar
Gilbert Lee committed
1245
1246
1247
1248

    // Allocate host-pinned memory (should respect NUMA mem policy)
    if (memType == MEM_CPU_FINE)
    {
1249
1250
1251
1252
#if defined (__NVCC__)
      printf("[ERROR] Fine-grained CPU memory not supported on NVIDIA platform\n");
      exit(1);
#else
Gilbert Lee's avatar
Gilbert Lee committed
1253
      HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser));
1254
#endif
Gilbert Lee's avatar
Gilbert Lee committed
1255
    }
gilbertlee-amd's avatar
gilbertlee-amd committed
1256
    else if (memType == MEM_CPU)
Gilbert Lee's avatar
Gilbert Lee committed
1257
    {
1258
1259
1260
#if defined (__NVCC__)
      if (hipHostMalloc((void **)memPtr, numBytes, 0) != hipSuccess)
#else
1261
      if (hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent) != hipSuccess)
1262
#endif
1263
1264
1265
1266
      {
        printf("[ERROR] Unable to allocate non-coherent host memory on NUMA node %d\n", devIndex);
        exit(1);
      }
Gilbert Lee's avatar
Gilbert Lee committed
1267
    }
gilbertlee-amd's avatar
gilbertlee-amd committed
1268
1269
1270
1271
    else if (memType == MEM_CPU_UNPINNED)
    {
      *memPtr = numa_alloc_onnode(numBytes, devIndex);
    }
Gilbert Lee's avatar
Gilbert Lee committed
1272
1273

    // Check that the allocated pages are actually on the correct NUMA node
gilbertlee-amd's avatar
gilbertlee-amd committed
1274
1275
    memset(*memPtr, 0, numBytes);
    CheckPages((char*)*memPtr, numBytes, devIndex);
Gilbert Lee's avatar
Gilbert Lee committed
1276
1277

    // Reset to default numa mem policy
1278
    numa_set_preferred(-1);
Gilbert Lee's avatar
Gilbert Lee committed
1279
  }
gilbertlee-amd's avatar
gilbertlee-amd committed
1280
  else if (IsGpuType(memType))
Gilbert Lee's avatar
Gilbert Lee committed
1281
  {
1282
1283
1284
1285
1286
1287
1288
1289
    if (memType == MEM_GPU)
    {
      // Allocate GPU memory on appropriate device
      HIP_CALL(hipSetDevice(devIndex));
      HIP_CALL(hipMalloc((void**)memPtr, numBytes));
    }
    else if (memType == MEM_GPU_FINE)
    {
1290
#if defined (__NVCC__)
1291
1292
      printf("[ERROR] Fine-grained GPU memory not supported on NVIDIA platform\n");
      exit(1);
1293
#else
1294
1295
      HIP_CALL(hipSetDevice(devIndex));

gilbertlee-amd's avatar
gilbertlee-amd committed
1296
1297
      hipDeviceProp_t prop;
      HIP_CALL(hipGetDeviceProperties(&prop, 0));
1298
      int flag = hipDeviceMallocUncached;
gilbertlee-amd's avatar
gilbertlee-amd committed
1299
      HIP_CALL(hipExtMallocWithFlags((void**)memPtr, numBytes, flag));
1300
#endif
1301
1302
    }
    HIP_CALL(hipMemset(*memPtr, 0, numBytes));
gilbertlee-amd's avatar
gilbertlee-amd committed
1303
    HIP_CALL(hipDeviceSynchronize());
Gilbert Lee's avatar
Gilbert Lee committed
1304
1305
1306
1307
1308
1309
1310
1311
  }
  else
  {
    printf("[ERROR] Unsupported memory type %d\n", memType);
    exit(1);
  }
}

gilbertlee-amd's avatar
gilbertlee-amd committed
1312
void DeallocateMemory(MemType memType, void* memPtr, size_t const bytes)
Gilbert Lee's avatar
Gilbert Lee committed
1313
1314
1315
{
  if (memType == MEM_CPU || memType == MEM_CPU_FINE)
  {
gilbertlee-amd's avatar
gilbertlee-amd committed
1316
1317
1318
1319
1320
    if (memPtr == nullptr)
    {
      printf("[ERROR] Attempting to free null CPU pointer for %lu bytes.  Skipping hipHostFree\n", bytes);
      return;
    }
Gilbert Lee's avatar
Gilbert Lee committed
1321
1322
    HIP_CALL(hipHostFree(memPtr));
  }
gilbertlee-amd's avatar
gilbertlee-amd committed
1323
1324
  else if (memType == MEM_CPU_UNPINNED)
  {
gilbertlee-amd's avatar
gilbertlee-amd committed
1325
1326
1327
1328
1329
    if (memPtr == nullptr)
    {
      printf("[ERROR] Attempting to free null unpinned CPU pointer for %lu bytes.  Skipping numa_free\n", bytes);
      return;
    }
gilbertlee-amd's avatar
gilbertlee-amd committed
1330
1331
    numa_free(memPtr, bytes);
  }
Gilbert Lee's avatar
Gilbert Lee committed
1332
1333
  else if (memType == MEM_GPU || memType == MEM_GPU_FINE)
  {
gilbertlee-amd's avatar
gilbertlee-amd committed
1334
1335
1336
1337
1338
    if (memPtr == nullptr)
    {
      printf("[ERROR] Attempting to free null GPU pointer for %lu bytes. Skipping hipFree\n", bytes);
      return;
    }
Gilbert Lee's avatar
Gilbert Lee committed
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
    HIP_CALL(hipFree(memPtr));
  }
}

void CheckPages(char* array, size_t numBytes, int targetId)
{
  unsigned long const pageSize = getpagesize();
  unsigned long const numPages = (numBytes + pageSize - 1) / pageSize;

  std::vector<void *> pages(numPages);
  std::vector<int> status(numPages);

  pages[0] = array;
  for (int i = 1; i < numPages; i++)
  {
    pages[i] = (char*)pages[i-1] + pageSize;
  }

  long const retCode = move_pages(0, numPages, pages.data(), NULL, status.data(), 0);
  if (retCode)
  {
    printf("[ERROR] Unable to collect page info\n");
    exit(1);
  }

  size_t mistakeCount = 0;
  for (int i = 0; i < numPages; i++)
  {
    if (status[i] < 0)
    {
      printf("[ERROR] Unexpected page status %d for page %d\n", status[i], i);
      exit(1);
    }
    if (status[i] != targetId) mistakeCount++;
  }
  if (mistakeCount > 0)
  {
    printf("[ERROR] %lu out of %lu pages for memory allocation were not on NUMA node %d\n", mistakeCount, numPages, targetId);
    exit(1);
  }
}

1381
1382
1383
uint32_t GetId(uint32_t hwId)
{
  // Based on instinct-mi200-cdna2-instruction-set-architecture.pdf
1384
1385
1386
  int const shId = (hwId >> 12) &  1;
  int const cuId = (hwId >>  8) & 15;
  int const seId = (hwId >> 13) &  3;
1387
1388
1389
  return (shId << 5) + (cuId << 2) + seId;
}

1390
void RunTransfer(EnvVars const& ev, int const iteration,
Gilbert Lee's avatar
Gilbert Lee committed
1391
                 ExecutorInfo& exeInfo, int const transferIdx)
Gilbert Lee's avatar
Gilbert Lee committed
1392
{
gilbertlee-amd's avatar
gilbertlee-amd committed
1393
  Transfer* transfer = exeInfo.transfers[transferIdx];
Gilbert Lee's avatar
Gilbert Lee committed
1394

gilbertlee-amd's avatar
gilbertlee-amd committed
1395
  if (transfer->exeType == EXE_GPU_GFX)
Gilbert Lee's avatar
Gilbert Lee committed
1396
1397
  {
    // Switch to executing GPU
gilbertlee-amd's avatar
gilbertlee-amd committed
1398
    int const exeIndex = RemappedIndex(transfer->exeIndex, false);
Gilbert Lee's avatar
Gilbert Lee committed
1399
1400
    HIP_CALL(hipSetDevice(exeIndex));

Gilbert Lee's avatar
Gilbert Lee committed
1401
1402
1403
    hipStream_t& stream     = exeInfo.streams[transferIdx];
    hipEvent_t&  startEvent = exeInfo.startEvents[transferIdx];
    hipEvent_t&  stopEvent  = exeInfo.stopEvents[transferIdx];
Gilbert Lee's avatar
Gilbert Lee committed
1404

gilbertlee-amd's avatar
gilbertlee-amd committed
1405
1406
1407
1408
    // Figure out how many threadblocks to use.
    // In single stream mode, all the threadblocks for this GPU are launched
    // Otherwise, just launch the threadblocks associated with this single Transfer
    int const numBlocksToRun = ev.useSingleStream ? exeInfo.totalSubExecs : transfer->numSubExecs;
1409
1410
    int const numXCCs = (ev.useXccFilter ? ev.xccIdsPerDevice[exeIndex].size() : 1);

1411
1412
#if defined(__NVCC__)
    HIP_CALL(hipEventRecord(startEvent, stream));
1413
    GpuKernelTable[ev.gpuKernel]<<<numBlocksToRun, ev.blockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr);
1414
1415
    HIP_CALL(hipEventRecord(stopEvent, stream));
#else
gilbertlee-amd's avatar
gilbertlee-amd committed
1416
    hipExtLaunchKernelGGL(GpuKernelTable[ev.gpuKernel],
1417
                          dim3(numXCCs, numBlocksToRun, 1),
1418
                          dim3(ev.blockSize, 1, 1),
gilbertlee-amd's avatar
gilbertlee-amd committed
1419
1420
1421
                          ev.sharedMemBytes, stream,
                          startEvent, stopEvent,
                          0, transfer->subExecParamGpuPtr);
1422
#endif
Gilbert Lee's avatar
Gilbert Lee committed
1423
1424
    // Synchronize per iteration, unless in single sync mode, in which case
    // synchronize during last warmup / last actual iteration
Gilbert Lee's avatar
Gilbert Lee committed
1425
    HIP_CALL(hipStreamSynchronize(stream));
Gilbert Lee's avatar
Gilbert Lee committed
1426
1427
1428
1429

    if (iteration >= 0)
    {
      // Record GPU timing
Gilbert Lee's avatar
Gilbert Lee committed
1430
1431
      float gpuDeltaMsec;
      HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
Gilbert Lee's avatar
Gilbert Lee committed
1432

Gilbert Lee's avatar
Gilbert Lee committed
1433
1434
      if (ev.useSingleStream)
      {
gilbertlee-amd's avatar
gilbertlee-amd committed
1435
        // Figure out individual timings for Transfers that were all launched together
gilbertlee-amd's avatar
gilbertlee-amd committed
1436
        for (Transfer* currTransfer : exeInfo.transfers)
Gilbert Lee's avatar
Gilbert Lee committed
1437
        {
1438
1439
1440
          long long minStartCycle = std::numeric_limits<long long>::max();
          long long maxStopCycle  = std::numeric_limits<long long>::min();

gilbertlee-amd's avatar
gilbertlee-amd committed
1441
          std::set<std::pair<int,int>> CUs;
1442
          for (auto subExecIdx : currTransfer->subExecIdx)
Gilbert Lee's avatar
Gilbert Lee committed
1443
          {
1444
1445
1446
            minStartCycle = std::min(minStartCycle, exeInfo.subExecParamGpu[subExecIdx].startCycle);
            maxStopCycle  = std::max(maxStopCycle,  exeInfo.subExecParamGpu[subExecIdx].stopCycle);
            if (ev.showIterations)
gilbertlee-amd's avatar
gilbertlee-amd committed
1447
1448
              CUs.insert(std::make_pair(exeInfo.subExecParamGpu[subExecIdx].xccId,
                                        GetId(exeInfo.subExecParamGpu[subExecIdx].hwId)));
Gilbert Lee's avatar
Gilbert Lee committed
1449
          }
1450
          int const wallClockRate = ev.wallClockPerDeviceMhz[exeIndex];
Gilbert Lee's avatar
Gilbert Lee committed
1451
          double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate);
gilbertlee-amd's avatar
gilbertlee-amd committed
1452
          currTransfer->transferTime += iterationTimeMs;
1453
          if (ev.showIterations)
1454
          {
1455
            currTransfer->perIterationTime.push_back(iterationTimeMs);
1456
1457
            currTransfer->perIterationCUs.push_back(CUs);
          }
Gilbert Lee's avatar
Gilbert Lee committed
1458
        }
Gilbert Lee's avatar
Gilbert Lee committed
1459
1460
1461
1462
        exeInfo.totalTime += gpuDeltaMsec;
      }
      else
      {
gilbertlee-amd's avatar
gilbertlee-amd committed
1463
        transfer->transferTime += gpuDeltaMsec;
1464
        if (ev.showIterations)
1465
        {
1466
          transfer->perIterationTime.push_back(gpuDeltaMsec);
gilbertlee-amd's avatar
gilbertlee-amd committed
1467
          std::set<std::pair<int,int>> CUs;
1468
          for (int i = 0; i < transfer->numSubExecs; i++)
gilbertlee-amd's avatar
gilbertlee-amd committed
1469
1470
            CUs.insert(std::make_pair(transfer->subExecParamGpuPtr[i].xccId,
                                      GetId(transfer->subExecParamGpuPtr[i].hwId)));
1471
1472
          transfer->perIterationCUs.push_back(CUs);
        }
Gilbert Lee's avatar
Gilbert Lee committed
1473
1474
1475
      }
    }
  }
gilbertlee-amd's avatar
gilbertlee-amd committed
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
  else if (transfer->exeType == EXE_GPU_DMA)
  {
    // Switch to executing GPU
    int const exeIndex = RemappedIndex(transfer->exeIndex, false);
    HIP_CALL(hipSetDevice(exeIndex));

    hipStream_t& stream     = exeInfo.streams[transferIdx];
    hipEvent_t&  startEvent = exeInfo.startEvents[transferIdx];
    hipEvent_t&  stopEvent  = exeInfo.stopEvents[transferIdx];

    HIP_CALL(hipEventRecord(startEvent, stream));
    if (transfer->numSrcs == 0 && transfer->numDsts == 1)
    {
      HIP_CALL(hipMemsetAsync(transfer->dstMem[0],
                              MEMSET_CHAR, transfer->numBytesActual, stream));
    }
    else if (transfer->numSrcs == 1 && transfer->numDsts == 1)
    {
      HIP_CALL(hipMemcpyAsync(transfer->dstMem[0], transfer->srcMem[0],
                              transfer->numBytesActual, hipMemcpyDefault,
                              stream));
    }
    HIP_CALL(hipEventRecord(stopEvent, stream));
    HIP_CALL(hipStreamSynchronize(stream));

    if (iteration >= 0)
    {
      // Record GPU timing
      float gpuDeltaMsec;
      HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
      transfer->transferTime += gpuDeltaMsec;
1507
1508
      if (ev.showIterations)
        transfer->perIterationTime.push_back(gpuDeltaMsec);
gilbertlee-amd's avatar
gilbertlee-amd committed
1509
1510
1511
    }
  }
  else if (transfer->exeType == EXE_CPU) // CPU execution agent
Gilbert Lee's avatar
Gilbert Lee committed
1512
1513
  {
    // Force this thread and all child threads onto correct NUMA node
gilbertlee-amd's avatar
gilbertlee-amd committed
1514
    int const exeIndex = RemappedIndex(transfer->exeIndex, true);
1515
    if (numa_run_on_node(exeIndex))
Gilbert Lee's avatar
Gilbert Lee committed
1516
    {
1517
      printf("[ERROR] Unable to set CPU to NUMA node %d\n", exeIndex);
Gilbert Lee's avatar
Gilbert Lee committed
1518
1519
1520
1521
1522
1523
1524
      exit(1);
    }

    std::vector<std::thread> childThreads;

    auto cpuStart = std::chrono::high_resolution_clock::now();

gilbertlee-amd's avatar
gilbertlee-amd committed
1525
1526
1527
    // Launch each subExecutor in child-threads to perform memcopies
    for (int i = 0; i < transfer->numSubExecs; ++i)
      childThreads.push_back(std::thread(CpuReduceKernel, std::ref(transfer->subExecParam[i])));
Gilbert Lee's avatar
Gilbert Lee committed
1528
1529

    // Wait for child-threads to finish
gilbertlee-amd's avatar
gilbertlee-amd committed
1530
    for (int i = 0; i < transfer->numSubExecs; ++i)
Gilbert Lee's avatar
Gilbert Lee committed
1531
1532
1533
1534
1535
1536
      childThreads[i].join();

    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;

    // Record time if not a warmup iteration
    if (iteration >= 0)
1537
1538
1539
1540
1541
1542
    {
      double const delta = (std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0);
      transfer->transferTime += delta;
      if (ev.showIterations)
        transfer->perIterationTime.push_back(delta);
    }
Gilbert Lee's avatar
Gilbert Lee committed
1543
1544
1545
  }
}

gilbertlee-amd's avatar
gilbertlee-amd committed
1546
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
Gilbert Lee's avatar
Gilbert Lee committed
1547
{
gilbertlee-amd's avatar
gilbertlee-amd committed
1548
1549
  ev.DisplayP2PBenchmarkEnvVars();

1550
1551
1552
  char const separator = ev.outputToCsv ? ',' : ' ';
  printf("Bytes Per Direction%c%lu\n", separator, N * sizeof(float));

Gilbert Lee's avatar
Gilbert Lee committed
1553
  // Collect the number of available CPUs/GPUs on this machine
gilbertlee-amd's avatar
gilbertlee-amd committed
1554
1555
  int const numCpus    = ev.numCpuDevices;
  int const numGpus    = ev.numGpuDevices;
Gilbert Lee's avatar
Gilbert Lee committed
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
  int const numDevices = numCpus + numGpus;

  // Enable peer to peer for each GPU
  for (int i = 0; i < numGpus; i++)
    for (int j = 0; j < numGpus; j++)
      if (i != j) EnablePeerAccess(i, j);

  // Perform unidirectional / bidirectional
  for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++)
  {
1566
1567
1568
    if (ev.p2pMode == 1 && isBidirectional == 1 ||
        ev.p2pMode == 2 && isBidirectional == 0) continue;

1569
1570
1571
1572
1573
    printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n", isBidirectional ? "Bi" : "Uni",
           ev.useRemoteRead ? "Remote" : "Local",
           ev.useRemoteRead ? "Local" : "Remote",
           ev.useDmaCopy    ? "DMA"   : "GFX");

Gilbert Lee's avatar
Gilbert Lee committed
1574
    // Print header
1575
    if (isBidirectional)
Gilbert Lee's avatar
Gilbert Lee committed
1576
    {
1577
1578
1579
1580
1581
1582
      printf("%12s", "SRC\\DST");
    }
    else
    {
      if (ev.useRemoteRead)
        printf("%12s", "SRC\\EXE+DST");
1583
      else
1584
1585
1586
1587
1588
1589
1590
1591
        printf("%12s", "SRC+EXE\\DST");
    }
    if (ev.outputToCsv) printf(",");
    for (int i = 0; i < numCpus; i++)
    {
      printf("%7s %02d", "CPU", i);
      if (ev.outputToCsv) printf(",");
    }
1592
    if (numCpus > 0) printf("   ");
1593
1594
1595
1596
    for (int i = 0; i < numGpus; i++)
    {
      printf("%7s %02d", "GPU", i);
      if (ev.outputToCsv) printf(",");
Gilbert Lee's avatar
Gilbert Lee committed
1597
    }
1598
1599
    printf("\n");

1600
1601
1602
    double avgBwSum[2][2] = {};
    int    avgCount[2][2] = {};

1603
    ExeType const gpuExeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
Gilbert Lee's avatar
Gilbert Lee committed
1604
1605
1606
    // Loop over all possible src/dst pairs
    for (int src = 0; src < numDevices; src++)
    {
gilbertlee-amd's avatar
gilbertlee-amd committed
1607
1608
      MemType const srcType  = (src < numCpus ? MEM_CPU : MEM_GPU);
      int     const srcIndex = (srcType == MEM_CPU ? src : src - numCpus);
1609
1610
1611
      MemType const srcTypeActual = ((ev.useFineGrain && srcType == MEM_CPU) ? MEM_CPU_FINE :
                                     (ev.useFineGrain && srcType == MEM_GPU) ? MEM_GPU_FINE :
                                                                               srcType);
1612
1613
1614
1615
      std::vector<std::vector<double>> avgBandwidth(isBidirectional + 1);
      std::vector<std::vector<double>> minBandwidth(isBidirectional + 1);
      std::vector<std::vector<double>> maxBandwidth(isBidirectional + 1);
      std::vector<std::vector<double>> stdDev(isBidirectional + 1);
gilbertlee-amd's avatar
gilbertlee-amd committed
1616

1617
      if (src == numCpus && src != 0) printf("\n");
Gilbert Lee's avatar
Gilbert Lee committed
1618
1619
      for (int dst = 0; dst < numDevices; dst++)
      {
gilbertlee-amd's avatar
gilbertlee-amd committed
1620
1621
        MemType const dstType  = (dst < numCpus ? MEM_CPU : MEM_GPU);
        int     const dstIndex = (dstType == MEM_CPU ? dst : dst - numCpus);
1622
1623
1624
        MemType const dstTypeActual = ((ev.useFineGrain && dstType == MEM_CPU) ? MEM_CPU_FINE :
                                       (ev.useFineGrain && dstType == MEM_GPU) ? MEM_GPU_FINE :
                                                                                 dstType);
1625
1626
1627
1628
1629
        // Prepare Transfers
        std::vector<Transfer> transfers(isBidirectional + 1);

        // SRC -> DST
        transfers[0].numBytes = N * sizeof(float);
1630
1631
        transfers[0].srcType.push_back(srcTypeActual);
        transfers[0].dstType.push_back(dstTypeActual);
1632
1633
1634
1635
1636
        transfers[0].srcIndex.push_back(srcIndex);
        transfers[0].dstIndex.push_back(dstIndex);
        transfers[0].numSrcs = transfers[0].numDsts = 1;
        transfers[0].exeType = IsGpuType(ev.useRemoteRead ? dstType : srcType) ? gpuExeType : EXE_CPU;
        transfers[0].exeIndex = (ev.useRemoteRead ? dstIndex : srcIndex);
1637
        transfers[0].exeSubIndex = -1;
1638
1639
1640
1641
1642
1643
1644
        transfers[0].numSubExecs = IsGpuType(transfers[0].exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs;

        // DST -> SRC
        if (isBidirectional)
        {
          transfers[1].numBytes = N * sizeof(float);
          transfers[1].numSrcs = transfers[1].numDsts = 1;
1645
1646
          transfers[1].srcType.push_back(dstTypeActual);
          transfers[1].dstType.push_back(srcTypeActual);
1647
1648
1649
1650
          transfers[1].srcIndex.push_back(dstIndex);
          transfers[1].dstIndex.push_back(srcIndex);
          transfers[1].exeType = IsGpuType(ev.useRemoteRead ? srcType : dstType) ? gpuExeType : EXE_CPU;
          transfers[1].exeIndex = (ev.useRemoteRead ? srcIndex : dstIndex);
1651
          transfers[1].exeSubIndex = -1;
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
          transfers[1].numSubExecs = IsGpuType(transfers[1].exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs;
        }

        bool skipTest = false;

        // Abort if executing on NUMA node with no CPUs
        for (int i = 0; i <= isBidirectional; i++)
        {
          if (transfers[i].exeType == EXE_CPU && ev.numCpusPerNuma[transfers[i].exeIndex] == 0)
          {
            skipTest = true;
            break;
          }

#if defined(__NVCC__)
          // NVIDIA platform cannot access GPU memory directly from CPU executors
          if (transfers[i].exeType == EXE_CPU && (IsGpuType(srcType) || IsGpuType(dstType)))
          {
            skipTest = true;
            break;
          }
#endif
        }

        if (isBidirectional && srcType == dstType && srcIndex == dstIndex) skipTest = true;

        if (!skipTest)
        {
          ExecuteTransfers(ev, 0, N, transfers, false);

          for (int dir = 0; dir <= isBidirectional; dir++)
          {
1684
            double const avgTime = transfers[dir].transferTime;
1685
1686
1687
            double const avgBw   = (transfers[dir].numBytesActual / 1.0E9) / avgTime * 1000.0f;
            avgBandwidth[dir].push_back(avgBw);

1688
1689
1690
1691
1692
1693
            if (!(srcType == dstType && srcIndex == dstIndex))
            {
              avgBwSum[srcType][dstType] += avgBw;
              avgCount[srcType][dstType]++;
            }

1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
            if (ev.showIterations)
            {
              double minTime = transfers[dir].perIterationTime[0];
              double maxTime = transfers[dir].perIterationTime[0];
              double varSum  = 0;
              for (int i = 0; i < transfers[dir].perIterationTime.size(); i++)
              {
                minTime = std::min(minTime, transfers[dir].perIterationTime[i]);
                maxTime = std::max(maxTime, transfers[dir].perIterationTime[i]);
                double const bw  = (transfers[dir].numBytesActual / 1.0E9) / transfers[dir].perIterationTime[i] * 1000.0f;
                double const delta = (avgBw - bw);
                varSum += delta * delta;
              }
              double const minBw = (transfers[dir].numBytesActual / 1.0E9) / maxTime * 1000.0f;
              double const maxBw = (transfers[dir].numBytesActual / 1.0E9) / minTime * 1000.0f;
              double const stdev = sqrt(varSum / transfers[dir].perIterationTime.size());
              minBandwidth[dir].push_back(minBw);
              maxBandwidth[dir].push_back(maxBw);
              stdDev[dir].push_back(stdev);
            }
          }
        }
        else
Gilbert Lee's avatar
Gilbert Lee committed
1717
        {
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
          for (int dir = 0; dir <= isBidirectional; dir++)
          {
            avgBandwidth[dir].push_back(0);
            minBandwidth[dir].push_back(0);
            maxBandwidth[dir].push_back(0);
            stdDev[dir].push_back(-1.0);
          }
        }
      }

      for (int dir = 0; dir <= isBidirectional; dir++)
      {
        printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, dir ? "<- " : " ->");
        if (ev.outputToCsv) printf(",");

        for (int dst = 0; dst < numDevices; dst++)
        {
1735
          if (dst == numCpus && dst != 0) printf("   ");
1736
1737
1738
          double const avgBw = avgBandwidth[dir][dst];

          if (avgBw == 0.0)
Gilbert Lee's avatar
Gilbert Lee committed
1739
1740
            printf("%10s", "N/A");
          else
1741
1742
            printf("%10.2f", avgBw);
          if (ev.outputToCsv) printf(",");
Gilbert Lee's avatar
Gilbert Lee committed
1743
        }
1744
1745
1746
        printf("\n");

        if (ev.showIterations)
Gilbert Lee's avatar
Gilbert Lee committed
1747
        {
1748
1749
1750
1751
1752
1753
          // minBw
          printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "min");
          if (ev.outputToCsv) printf(",");
          for (int i = 0; i < numDevices; i++)
          {
            double const minBw = minBandwidth[dir][i];
1754
            if (i == numCpus && i != 0) printf("   ");
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
            if (minBw == 0.0)
              printf("%10s", "N/A");
            else
              printf("%10.2f", minBw);
            if (ev.outputToCsv) printf(",");
          }
          printf("\n");

          // maxBw
          printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "max");
          if (ev.outputToCsv) printf(",");
          for (int i = 0; i < numDevices; i++)
          {
            double const maxBw = maxBandwidth[dir][i];
1769
            if (i == numCpus && i != 0) printf("   ");
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
            if (maxBw == 0.0)
              printf("%10s", "N/A");
            else
              printf("%10.2f", maxBw);
            if (ev.outputToCsv) printf(",");
          }
          printf("\n");

          // stddev
          printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, " sd");
          if (ev.outputToCsv) printf(",");
          for (int i = 0; i < numDevices; i++)
          {
            double const sd = stdDev[dir][i];
1784
            if (i == numCpus && i != 0) printf("   ");
1785
1786
1787
1788
1789
1790
1791
            if (sd == -1.0)
              printf("%10s", "N/A");
            else
              printf("%10.2f", sd);
            if (ev.outputToCsv) printf(",");
          }
          printf("\n");
Gilbert Lee's avatar
Gilbert Lee committed
1792
1793
1794
        }
        fflush(stdout);
      }
1795
1796
1797
1798
1799
1800
1801
1802

      if (isBidirectional)
      {
        printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "<->");
        if (ev.outputToCsv) printf(",");
        for (int dst = 0; dst < numDevices; dst++)
        {
          double const sumBw = avgBandwidth[0][dst] + avgBandwidth[1][dst];
1803
          if (dst == numCpus && dst != 0) printf("   ");
1804
1805
1806
1807
1808
1809
          if (sumBw == 0.0)
            printf("%10s", "N/A");
          else
            printf("%10.2f", sumBw);
          if (ev.outputToCsv) printf(",");
        }
1810
1811
        printf("\n");
        if (src < numDevices - 1) printf("\n");
1812
      }
Gilbert Lee's avatar
Gilbert Lee committed
1813
    }
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833

    if (!ev.outputToCsv)
    {
      printf("                         ");
      for (int srcType : {MEM_CPU, MEM_GPU})
        for (int dstType : {MEM_CPU, MEM_GPU})
          printf("  %cPU->%cPU", srcType == MEM_CPU ? 'C' : 'G', dstType == MEM_CPU ? 'C' : 'G');
      printf("\n");

      printf("Averages (During %s):",  isBidirectional ? " BiDir" : "UniDir");
      for (int srcType : {MEM_CPU, MEM_GPU})
        for (int dstType : {MEM_CPU, MEM_GPU})
        {
          if (avgCount[srcType][dstType])
            printf("%10.2f", avgBwSum[srcType][dstType] / avgCount[srcType][dstType]);
          else
            printf("%10s", "N/A");
        }
      printf("\n\n");
    }
Gilbert Lee's avatar
Gilbert Lee committed
1834
1835
1836
  }
}

1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int const maxSubExecs)
{
  ev.DisplayEnvVars();

  // Collect the number of available CPUs/GPUs on this machine
  int const numCpus    = ev.numCpuDevices;
  int const numGpus    = ev.numGpuDevices;
  int const numDevices = numCpus + numGpus;

  // Enable peer to peer for each GPU
  for (int i = 0; i < numGpus; i++)
    for (int j = 0; j < numGpus; j++)
      if (i != j) EnablePeerAccess(i, j);

  char separator = (ev.outputToCsv ? ',' : ' ');

  std::vector<Transfer> transfers(1);
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
  Transfer& t = transfers[0];
  t.numBytes = N * sizeof(float);
  t.numSrcs  = 1;
  t.numDsts  = 1;
  t.exeType  = EXE_GPU_GFX;
  t.exeIndex = exeIndex;
  t.exeSubIndex = -1;
  t.srcType.resize(1, MEM_GPU);
  t.dstType.resize(1, MEM_GPU);
  t.srcIndex.resize(1);
  t.dstIndex.resize(1);
1865
1866
1867

  printf("GPU-GFX Scaling benchmark:\n");
  printf("==========================\n");
1868
  printf("- Copying %lu bytes from GPU %d to other devices\n", t.numBytes, exeIndex);
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
  printf("- All numbers reported as GB/sec\n\n");

  printf("NumCUs");
  for (int i = 0; i < numDevices; i++)
    printf("%c  %s%02d     ", separator, i < numCpus ? "CPU" : "GPU", i < numCpus ? i : i - numCpus);
  printf("\n");

  std::vector<std::pair<double, int>> bestResult(numDevices);
  for (int numSubExec = 1; numSubExec <= maxSubExecs; numSubExec++)
  {
1879
    t.numSubExecs = numSubExec;
1880
1881
1882
1883
    printf("%4d  ", numSubExec);

    for (int i = 0; i < numDevices; i++)
    {
1884
1885
      t.dstType[0]  = i < numCpus ? MEM_CPU : MEM_GPU;
      t.dstIndex[0] = i < numCpus ? i : i - numCpus;
1886
1887

      ExecuteTransfers(ev, 0, N, transfers, false);
1888
      printf("%c%7.2f     ", separator, t.transferBandwidth);
1889

1890
      if (t.transferBandwidth > bestResult[i].first)
1891
      {
1892
        bestResult[i].first  = t.transferBandwidth;
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
        bestResult[i].second = numSubExec;
      }
    }
    printf("\n");
  }

  printf(" Best ");
  for (int i = 0; i < numDevices; i++)
  {
    printf("%c%7.2f(%3d)", separator, bestResult[i].first, bestResult[i].second);
  }
  printf("\n");
}

gilbertlee-amd's avatar
gilbertlee-amd committed
1907
1908
void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const numSubExecs)
{
1909
  ev.DisplayA2AEnvVars();
gilbertlee-amd's avatar
gilbertlee-amd committed
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926

  // Collect the number of GPU devices to use
  int const numGpus = ev.numGpuDevices;

  // Enable peer to peer for each GPU
  for (int i = 0; i < numGpus; i++)
    for (int j = 0; j < numGpus; j++)
      if (i != j) EnablePeerAccess(i, j);

  char separator = (ev.outputToCsv ? ',' : ' ');

  Transfer transfer;
  transfer.numBytes    = numBytesPerTransfer;
  transfer.numSubExecs = numSubExecs;
  transfer.numSrcs     = 1;
  transfer.numDsts     = 1;
  transfer.exeType     = EXE_GPU_GFX;
1927
  transfer.exeSubIndex = -1;
1928
1929
  transfer.srcType.resize(1, ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
  transfer.dstType.resize(1, ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
gilbertlee-amd's avatar
gilbertlee-amd committed
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
  transfer.srcIndex.resize(1);
  transfer.dstIndex.resize(1);

  std::vector<Transfer> transfers;
  for (int i = 0; i < numGpus; i++)
  {
    transfer.srcIndex[0] = i;
    for (int j = 0; j < numGpus; j++)
    {
      transfer.dstIndex[0] = j;
1940
1941
      transfer.exeIndex    = (ev.useRemoteRead ? j : i);

1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
      if (ev.a2aDirect)
      {
#if !defined(__NVCC__)
        if (i == j) continue;

        uint32_t linkType, hopCount;
        HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(i, false),
                                              RemappedIndex(j, false),
                                              &linkType, &hopCount));
        if (hopCount != 1) continue;
#endif
      }
gilbertlee-amd's avatar
gilbertlee-amd committed
1954
1955
1956
1957
1958
1959
      transfers.push_back(transfer);
    }
  }

  printf("GPU-GFX All-To-All benchmark:\n");
  printf("==========================\n");
1960
1961
1962
  printf("- Copying %lu bytes between %s pairs of GPUs using %d CUs (%lu Transfers)\n",
         numBytesPerTransfer, ev.a2aDirect ? "directly connected" : "all", numSubExecs, transfers.size());
  if (transfers.size() == 0) return;
gilbertlee-amd's avatar
gilbertlee-amd committed
1963
1964

  double totalBandwidthCpu = 0;
1965
  ExecuteTransfers(ev, 0, numBytesPerTransfer / sizeof(float), transfers, !ev.hideEnv, &totalBandwidthCpu);
gilbertlee-amd's avatar
gilbertlee-amd committed
1966
1967
1968

  printf("\nSummary:\n");
  printf("==========================================================\n");
1969
  printf("SRC\\DST ");
gilbertlee-amd's avatar
gilbertlee-amd committed
1970
  for (int dst = 0; dst < numGpus; dst++)
1971
1972
    printf("%cGPU %02d    ", separator, dst);
  printf("   %cSTotal     %cActual\n", separator, separator);
1973
1974
1975
1976
1977
1978
1979

  std::map<std::pair<int, int>, int> reIndex;
  for (int i = 0; i < transfers.size(); i++)
  {
    Transfer const& t = transfers[i];
    reIndex[std::make_pair(t.srcIndex[0], t.dstIndex[0])] = i;
  }
gilbertlee-amd's avatar
gilbertlee-amd committed
1980

1981
  double totalBandwidthGpu = 0.0;
1982
1983
  double minExecutorBandwidth = std::numeric_limits<double>::max();
  double maxExecutorBandwidth = 0.0;
1984
  std::vector<double> colTotalBandwidth(numGpus+1, 0.0);
gilbertlee-amd's avatar
gilbertlee-amd committed
1985
1986
  for (int src = 0; src < numGpus; src++)
  {
1987
    double rowTotalBandwidth = 0;
1988
    double executorBandwidth = 0;
gilbertlee-amd's avatar
gilbertlee-amd committed
1989
1990
1991
    printf("GPU %02d", src);
    for (int dst = 0; dst < numGpus; dst++)
    {
1992
1993
1994
      if (reIndex.count(std::make_pair(src, dst)))
      {
        Transfer const& transfer = transfers[reIndex[std::make_pair(src,dst)]];
1995
1996
1997
1998
1999
        colTotalBandwidth[dst]  += transfer.transferBandwidth;
        rowTotalBandwidth       += transfer.transferBandwidth;
        totalBandwidthGpu       += transfer.transferBandwidth;
        executorBandwidth        = std::max(executorBandwidth, transfer.executorBandwidth);
        printf("%c%8.3f  ", separator, transfer.transferBandwidth);
2000
2001
2002
      }
      else
      {
2003
        printf("%c%8s  ", separator, "N/A");
2004
      }
gilbertlee-amd's avatar
gilbertlee-amd committed
2005
    }
2006
2007
2008
    printf("   %c%8.3f   %c%8.3f\n", separator, rowTotalBandwidth, separator, executorBandwidth);
    minExecutorBandwidth = std::min(minExecutorBandwidth, executorBandwidth);
    maxExecutorBandwidth = std::max(maxExecutorBandwidth, executorBandwidth);
2009
    colTotalBandwidth[numGpus] += rowTotalBandwidth;
gilbertlee-amd's avatar
gilbertlee-amd committed
2010
  }
2011
2012
2013
  printf("\nRTotal");
  for (int dst = 0; dst < numGpus; dst++)
  {
2014
    printf("%c%8.3f  ", separator, colTotalBandwidth[dst]);
2015
  }
2016
2017
  printf("   %c%8.3f   %c%8.3f   %c%8.3f\n", separator, colTotalBandwidth[numGpus],
         separator, minExecutorBandwidth, separator, maxExecutorBandwidth);
2018
2019
  printf("\n");

2020
2021
2022
  printf("Average   bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu / transfers.size());
  printf("Aggregate bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu);
  printf("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", totalBandwidthCpu);
gilbertlee-amd's avatar
gilbertlee-amd committed
2023
2024
}

gilbertlee-amd's avatar
gilbertlee-amd committed
2025
void Transfer::PrepareSubExecParams(EnvVars const& ev)
Gilbert Lee's avatar
Gilbert Lee committed
2026
{
gilbertlee-amd's avatar
gilbertlee-amd committed
2027
2028
2029
2030
2031
2032
2033
  // Each subExecutor needs to know src/dst pointers and how many elements to transfer
  // Figure out the sub-array each subExecutor works on for this Transfer
  // - Partition N as evenly as possible, but try to keep subarray sizes as multiples of BLOCK_BYTES bytes,
  //   except the very last one, for alignment reasons
  size_t const N              = this->numBytesActual / sizeof(float);
  int    const initOffset     = ev.byteOffset / sizeof(float);
  int    const targetMultiple = ev.blockBytes / sizeof(float);
Gilbert Lee's avatar
Gilbert Lee committed
2034

gilbertlee-amd's avatar
gilbertlee-amd committed
2035
  // In some cases, there may not be enough data for all subExectors
2036
  int const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple, (size_t)this->numSubExecs);
gilbertlee-amd's avatar
gilbertlee-amd committed
2037
2038
  this->subExecParam.clear();
  this->subExecParam.resize(this->numSubExecs);
Gilbert Lee's avatar
Gilbert Lee committed
2039
2040

  size_t assigned = 0;
gilbertlee-amd's avatar
gilbertlee-amd committed
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
  for (int i = 0; i < this->numSubExecs; ++i)
  {
    int    const subExecLeft = std::max(0, maxSubExecToUse - i);
    size_t const leftover    = N - assigned;
    size_t const roundedN    = (leftover + targetMultiple - 1) / targetMultiple;

    SubExecParam& p = this->subExecParam[i];
    p.N             = subExecLeft ? std::min(leftover, ((roundedN / subExecLeft) * targetMultiple)) : 0;
    p.numSrcs       = this->numSrcs;
    p.numDsts       = this->numDsts;
    for (int iSrc = 0; iSrc < this->numSrcs; ++iSrc)
      p.src[iSrc] = this->srcMem[iSrc] + assigned + initOffset;
    for (int iDst = 0; iDst < this->numDsts; ++iDst)
      p.dst[iDst] = this->dstMem[iDst] + assigned + initOffset;

2056
    p.preferredXccId = -1;
2057
2058

    if (ev.useXccFilter && this->exeType == EXE_GPU_GFX)
2059
    {
2060
2061
2062
2063
2064
2065
2066
2067
      std::uniform_int_distribution<int> distribution(0, ev.xccIdsPerDevice[this->exeIndex].size() - 1);

      // Use this tranfer's executor subIndex if set
      if (this->exeSubIndex != -1)
      {
        p.preferredXccId = this->exeSubIndex;
      }
      else if (this->numDsts >= 1 && IsGpuType(this->dstType[0]))
2068
2069
2070
      {
        p.preferredXccId = ev.prefXccTable[this->exeIndex][this->dstIndex[0]];
      }
2071
2072
2073
2074
2075

      if (p.preferredXccId == -1)
      {
        p.preferredXccId = distribution(*ev.generator);
      }
2076
2077
    }

gilbertlee-amd's avatar
gilbertlee-amd committed
2078
2079
2080
2081
2082
    if (ev.enableDebug)
    {
      printf("Transfer %02d SE:%02d: %10lu floats: %10lu to %10lu\n",
             this->transferIndex, i, p.N, assigned, assigned + p.N);
    }
Gilbert Lee's avatar
Gilbert Lee committed
2083

gilbertlee-amd's avatar
gilbertlee-amd committed
2084
2085
2086
    p.startCycle = 0;
    p.stopCycle  = 0;
    assigned += p.N;
Gilbert Lee's avatar
Gilbert Lee committed
2087
2088
  }

Gilbert Lee's avatar
Gilbert Lee committed
2089
  this->transferTime = 0.0;
2090
  this->perIterationTime.clear();
Gilbert Lee's avatar
Gilbert Lee committed
2091
2092
}

gilbertlee-amd's avatar
gilbertlee-amd committed
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
void Transfer::PrepareReference(EnvVars const& ev, std::vector<float>& buffer, int bufferIdx)
{
  size_t N = buffer.size();
  if (bufferIdx >= 0)
  {
    size_t patternLen = ev.fillPattern.size();
    if (patternLen > 0)
    {
      for (size_t i = 0; i < N; ++i)
        buffer[i] = ev.fillPattern[i % patternLen];
    }
    else
    {
      for (size_t i = 0; i < N; ++i)
2107
        buffer[i] = PrepSrcValue(bufferIdx, i);
gilbertlee-amd's avatar
gilbertlee-amd committed
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
    }
  }
  else // Destination buffer
  {
    if (this->numSrcs == 0)
    {
      // Note: 0x75757575 = 13323083.0
      memset(buffer.data(), MEMSET_CHAR, N * sizeof(float));
    }
    else
    {
      PrepareReference(ev, buffer, 0);

      if (this->numSrcs > 1)
      {
        std::vector<float> temp(N);
        for (int srcIdx = 1; srcIdx < this->numSrcs; ++srcIdx)
        {
          PrepareReference(ev, temp, srcIdx);
          for (int i = 0; i < N; ++i)
          {
            buffer[i] += temp[i];
          }
        }
      }
    }
  }
}

2137
bool Transfer::PrepareSrc(EnvVars const& ev)
gilbertlee-amd's avatar
gilbertlee-amd committed
2138
{
2139
  if (this->numSrcs == 0) return true;
gilbertlee-amd's avatar
gilbertlee-amd committed
2140
2141
2142
2143
2144
2145
  size_t const N = this->numBytesActual / sizeof(float);
  int const initOffset = ev.byteOffset / sizeof(float);

  std::vector<float> reference(N);
  for (int srcIdx = 0; srcIdx < this->numSrcs; ++srcIdx)
  {
gilbertlee-amd's avatar
gilbertlee-amd committed
2146
    float* srcPtr = this->srcMem[srcIdx] + initOffset;
2147
    PrepareReference(ev, reference, srcIdx);
gilbertlee-amd's avatar
gilbertlee-amd committed
2148
2149
2150

    // Initialize source memory array with reference pattern
    if (IsGpuType(this->srcType[srcIdx]))
2151
    {
gilbertlee-amd's avatar
gilbertlee-amd committed
2152
2153
2154
      int const deviceIdx = RemappedIndex(this->srcIndex[srcIdx], false);
      HIP_CALL(hipSetDevice(deviceIdx));
      if (ev.usePrepSrcKernel)
2155
        PrepSrcDataKernel<<<32, ev.blockSize>>>(srcPtr, N, srcIdx);
gilbertlee-amd's avatar
gilbertlee-amd committed
2156
2157
      else
        HIP_CALL(hipMemcpy(srcPtr, reference.data(), this->numBytesActual, hipMemcpyDefault));
2158
2159
      HIP_CALL(hipDeviceSynchronize());
    }
gilbertlee-amd's avatar
gilbertlee-amd committed
2160
    else if (IsCpuType(this->srcType[srcIdx]))
2161
    {
gilbertlee-amd's avatar
gilbertlee-amd committed
2162
      memcpy(srcPtr, reference.data(), this->numBytesActual);
2163
    }
2164
2165

    // Perform check just to make sure that data has been copied properly
gilbertlee-amd's avatar
gilbertlee-amd committed
2166
    float* srcCheckPtr = srcPtr;
2167
    std::vector<float> srcCopy(N);
gilbertlee-amd's avatar
gilbertlee-amd committed
2168
2169
2170
2171
2172
2173
2174
2175
2176
    if (IsGpuType(this->srcType[srcIdx]))
    {
      if (!ev.validateDirect)
      {
        HIP_CALL(hipMemcpy(srcCopy.data(), srcPtr, this->numBytesActual, hipMemcpyDefault));
        HIP_CALL(hipDeviceSynchronize());
        srcCheckPtr = srcCopy.data();
      }
    }
2177
2178
2179

    for (size_t i = 0; i < N; ++i)
    {
gilbertlee-amd's avatar
gilbertlee-amd committed
2180
      if (reference[i] != srcCheckPtr[i])
2181
2182
      {
        printf("\n[ERROR] Unexpected mismatch at index %lu of source array %d:\n", i, srcIdx);
2183
2184
2185
#if !defined(__NVCC__)
        float const val = this->srcMem[srcIdx][initOffset + i];
        printf("[ERROR] SRC %02d   value: %10.5f [%08X] Direct: %10.5f [%08X]\n",
gilbertlee-amd's avatar
gilbertlee-amd committed
2186
               srcIdx, srcCheckPtr[i], *(unsigned int*)&srcCheckPtr[i], val, *(unsigned int*)&val);
2187
#else
gilbertlee-amd's avatar
gilbertlee-amd committed
2188
        printf("[ERROR] SRC %02d   value: %10.5f [%08X]\n", srcIdx, srcCheckPtr[i], *(unsigned int*)&srcCheckPtr[i]);
2189
#endif
2190
2191
2192
2193
2194
2195
2196
2197
2198
        printf("[ERROR] EXPECTED value: %10.5f [%08X]\n", reference[i], *(unsigned int*)&reference[i]);
        printf("[ERROR] Failed Transfer details: #%d: %s -> [%c%d:%d] -> %s\n",
               this->transferIndex,
               this->SrcToStr().c_str(),
               ExeTypeStr[this->exeType], this->exeIndex,
               this->numSubExecs,
               this->DstToStr().c_str());
        if (!ev.continueOnError)
          exit(1);
2199
        return false;
2200
2201
      }
    }
gilbertlee-amd's avatar
gilbertlee-amd committed
2202
  }
2203
  return true;
gilbertlee-amd's avatar
gilbertlee-amd committed
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
}

void Transfer::ValidateDst(EnvVars const& ev)
{
  if (this->numDsts == 0) return;
  size_t const N = this->numBytesActual / sizeof(float);
  int const initOffset = ev.byteOffset / sizeof(float);

  std::vector<float> reference(N);
  PrepareReference(ev, reference, -1);

  std::vector<float> hostBuffer(N);
  for (int dstIdx = 0; dstIdx < this->numDsts; ++dstIdx)
  {
    float* output;
2219
    if (IsCpuType(this->dstType[dstIdx]) || ev.validateDirect)
gilbertlee-amd's avatar
gilbertlee-amd committed
2220
2221
2222
2223
2224
    {
      output = this->dstMem[dstIdx] + initOffset;
    }
    else
    {
gilbertlee-amd's avatar
gilbertlee-amd committed
2225
2226
      int const deviceIdx = RemappedIndex(this->dstIndex[dstIdx], false);
      HIP_CALL(hipSetDevice(deviceIdx));
gilbertlee-amd's avatar
gilbertlee-amd committed
2227
      HIP_CALL(hipMemcpy(hostBuffer.data(), this->dstMem[dstIdx] + initOffset, this->numBytesActual, hipMemcpyDefault));
gilbertlee-amd's avatar
gilbertlee-amd committed
2228
      HIP_CALL(hipDeviceSynchronize());
gilbertlee-amd's avatar
gilbertlee-amd committed
2229
2230
2231
2232
2233
2234
2235
      output = hostBuffer.data();
    }

    for (size_t i = 0; i < N; ++i)
    {
      if (reference[i] != output[i])
      {
2236
2237
2238
2239
2240
        printf("\n[ERROR] Unexpected mismatch at index %lu of destination array %d:\n", i, dstIdx);
        for (int srcIdx = 0; srcIdx < this->numSrcs; ++srcIdx)
        {
          float srcVal;
          HIP_CALL(hipMemcpy(&srcVal, this->srcMem[srcIdx] + initOffset + i, sizeof(float), hipMemcpyDefault));
2241
2242
2243
2244
2245
#if !defined(__NVCC__)
          float val = this->srcMem[srcIdx][initOffset + i];
          printf("[ERROR] SRC %02dD  value: %10.5f [%08X] Direct: %10.5f [%08X]\n",
                 srcIdx, srcVal, *(unsigned int*)&srcVal, val, *(unsigned int*)&val);
#else
2246
          printf("[ERROR] SRC %02d   value: %10.5f [%08X]\n", srcIdx, srcVal, *(unsigned int*)&srcVal);
2247
#endif
2248
        }
2249
        printf("[ERROR] EXPECTED value: %10.5f [%08X]\n", reference[i], *(unsigned int*)&reference[i]);
2250
2251
2252
2253
2254
#if !defined(__NVCC__)
        float dstVal = this->dstMem[dstIdx][initOffset + i];
        printf("[ERROR] DST %02d   value: %10.5f [%08X] Direct: %10.5f [%08X]\n",
               dstIdx, output[i], *(unsigned int*)&output[i], dstVal, *(unsigned int*)&dstVal);
#else
2255
        printf("[ERROR] DST %02d   value: %10.5f [%08X]\n", dstIdx, output[i], *(unsigned int*)&output[i]);
2256
#endif
gilbertlee-amd's avatar
gilbertlee-amd committed
2257
2258
2259
2260
2261
2262
        printf("[ERROR] Failed Transfer details: #%d: %s -> [%c%d:%d] -> %s\n",
               this->transferIndex,
               this->SrcToStr().c_str(),
               ExeTypeStr[this->exeType], this->exeIndex,
               this->numSubExecs,
               this->DstToStr().c_str());
2263
2264
        if (!ev.continueOnError)
          exit(1);
2265
2266
        else
          break;
gilbertlee-amd's avatar
gilbertlee-amd committed
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
      }
    }
  }
}

std::string Transfer::SrcToStr() const
{
  if (numSrcs == 0) return "N";
  std::stringstream ss;
  for (int i = 0; i < numSrcs; ++i)
    ss << MemTypeStr[srcType[i]] << srcIndex[i];
  return ss.str();
}

std::string Transfer::DstToStr() const
{
  if (numDsts == 0) return "N";
  std::stringstream ss;
  for (int i = 0; i < numDsts; ++i)
    ss << MemTypeStr[dstType[i]] << dstIndex[i];
  return ss.str();
}

2290
2291
void RunSchmooBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs)
{
2292
  char memType = ev.useFineGrain ? 'F' : 'G';
2293
  printf("Bytes to transfer: %lu Local GPU: %d Remote GPU: %d\n", numBytesPerTransfer, localIdx, remoteIdx);
2294
2295
2296
2297
2298
2299
2300
2301
2302
  printf("       | Local Read  | Local Write | Local Copy  | Remote Read | Remote Write| Remote Copy |\n");
  printf("  #CUs |%c%02d->G%02d->N00|N00->G%02d->%c%02d|%c%02d->G%02d->%c%02d|%c%02d->G%02d->N00|N00->G%02d->%c%02d|%c%02d->G%02d->%c%02d|\n",
	 memType, localIdx, localIdx,
	 localIdx, memType, localIdx,
	 memType, localIdx, localIdx, memType, localIdx,
	 memType, remoteIdx, localIdx,
	 localIdx, memType, remoteIdx,
	 memType, localIdx, localIdx, memType, remoteIdx);
  printf("|------|-------------|-------------|-------------|-------------|-------------|-------------|\n");
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390

  std::vector<Transfer> transfers(1);
  Transfer& t   = transfers[0];
  t.exeType     = EXE_GPU_GFX;
  t.exeIndex    = localIdx;
  t.exeSubIndex = -1;
  t.numBytes    = numBytesPerTransfer;

  for (int numCUs = 1; numCUs <= maxSubExecs; numCUs++)
  {
    t.numSubExecs = numCUs;

    // Local Read
    t.numSrcs = 1;
    t.numDsts = 0;
    t.srcType.resize(t.numSrcs);
    t.dstType.resize(t.numDsts);
    t.srcIndex.resize(t.numSrcs);
    t.dstIndex.resize(t.numDsts);
    t.srcType[0]  = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
    t.srcIndex[0] = localIdx;
    ExecuteTransfers(ev, 0, 0, transfers, false);
    double const localRead = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;

    // Local Write
    t.numSrcs = 0;
    t.numDsts = 1;
    t.srcType.resize(t.numSrcs);
    t.dstType.resize(t.numDsts);
    t.srcIndex.resize(t.numSrcs);
    t.dstIndex.resize(t.numDsts);
    t.dstType[0]  = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
    t.dstIndex[0] = localIdx;
    ExecuteTransfers(ev, 0, 0, transfers, false);
    double const localWrite = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;

    // Local Copy
    t.numSrcs = 1;
    t.numDsts = 1;
    t.srcType.resize(t.numSrcs);
    t.dstType.resize(t.numDsts);
    t.srcIndex.resize(t.numSrcs);
    t.dstIndex.resize(t.numDsts);
    t.srcType[0]  = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
    t.srcIndex[0] = localIdx;
    t.dstType[0]  = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
    t.dstIndex[0] = localIdx;
    ExecuteTransfers(ev, 0, 0, transfers, false);
    double const localCopy = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;

    // Remote Read
    t.numSrcs = 1;
    t.numDsts = 0;
    t.srcType.resize(t.numSrcs);
    t.dstType.resize(t.numDsts);
    t.srcIndex.resize(t.numSrcs);
    t.dstIndex.resize(t.numDsts);
    t.srcType[0]  = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
    t.srcIndex[0] = remoteIdx;
    ExecuteTransfers(ev, 0, 0, transfers, false);
    double const remoteRead = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;

    // Remote Write
    t.numSrcs = 0;
    t.numDsts = 1;
    t.srcType.resize(t.numSrcs);
    t.dstType.resize(t.numDsts);
    t.srcIndex.resize(t.numSrcs);
    t.dstIndex.resize(t.numDsts);
    t.dstType[0]  = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
    t.dstIndex[0] = remoteIdx;
    ExecuteTransfers(ev, 0, 0, transfers, false);
    double const remoteWrite = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;

    // Remote Copy
    t.numSrcs = 1;
    t.numDsts = 1;
    t.srcType.resize(t.numSrcs);
    t.dstType.resize(t.numDsts);
    t.srcIndex.resize(t.numSrcs);
    t.dstIndex.resize(t.numDsts);
    t.srcType[0]  = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
    t.srcIndex[0] = localIdx;
    t.dstType[0]  = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
    t.dstIndex[0] = remoteIdx;
    ExecuteTransfers(ev, 0, 0, transfers, false);
    double const remoteCopy = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;

2391
    printf("   %3d   %11.3f   %11.3f   %11.3f   %11.3f   %11.3f   %11.3f  \n",
2392
2393
2394
2395
2396
           numCUs, localRead, localWrite, localCopy, remoteRead, remoteWrite, remoteCopy);
  }
}


gilbertlee-amd's avatar
gilbertlee-amd committed
2397
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExecs, int const numCpuSubExecs, bool const isRandom)
Gilbert Lee's avatar
Gilbert Lee committed
2398
2399
2400
2401
{
  ev.DisplaySweepEnvVars();

  // Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets)
gilbertlee-amd's avatar
gilbertlee-amd committed
2402
  std::vector<std::pair<ExeType, int>> exeList;
Gilbert Lee's avatar
Gilbert Lee committed
2403
2404
  for (auto exe : ev.sweepExe)
  {
gilbertlee-amd's avatar
gilbertlee-amd committed
2405
2406
    ExeType const exeType = CharToExeType(exe);
    if (IsGpuType(exeType))
Gilbert Lee's avatar
Gilbert Lee committed
2407
    {
2408
      for (int exeIndex = 0; exeIndex < ev.numGpuDevices; ++exeIndex)
gilbertlee-amd's avatar
gilbertlee-amd committed
2409
        exeList.push_back(std::make_pair(exeType, exeIndex));
Gilbert Lee's avatar
Gilbert Lee committed
2410
    }
gilbertlee-amd's avatar
gilbertlee-amd committed
2411
    else if (IsCpuType(exeType))
Gilbert Lee's avatar
Gilbert Lee committed
2412
    {
2413
2414
2415
2416
      for (int exeIndex = 0; exeIndex < ev.numCpuDevices; ++exeIndex)
      {
        // Skip NUMA nodes that have no CPUs (e.g. CXL)
        if (ev.numCpusPerNuma[exeIndex] == 0) continue;
gilbertlee-amd's avatar
gilbertlee-amd committed
2417
        exeList.push_back(std::make_pair(exeType, exeIndex));
2418
      }
Gilbert Lee's avatar
Gilbert Lee committed
2419
2420
    }
  }
2421
  int numExes = exeList.size();
Gilbert Lee's avatar
Gilbert Lee committed
2422
2423
2424
2425

  std::vector<std::pair<MemType, int>> srcList;
  for (auto src : ev.sweepSrc)
  {
gilbertlee-amd's avatar
gilbertlee-amd committed
2426
2427
    MemType const srcType = CharToMemType(src);
    int const numDevices = IsGpuType(srcType) ? ev.numGpuDevices : ev.numCpuDevices;
2428

Gilbert Lee's avatar
Gilbert Lee committed
2429
    for (int srcIndex = 0; srcIndex < numDevices; ++srcIndex)
gilbertlee-amd's avatar
gilbertlee-amd committed
2430
      srcList.push_back(std::make_pair(srcType, srcIndex));
Gilbert Lee's avatar
Gilbert Lee committed
2431
2432
2433
2434
2435
2436
2437
  }
  int numSrcs = srcList.size();


  std::vector<std::pair<MemType, int>> dstList;
  for (auto dst : ev.sweepDst)
  {
gilbertlee-amd's avatar
gilbertlee-amd committed
2438
2439
    MemType const dstType = CharToMemType(dst);
    int const numDevices = IsGpuType(dstType) ? ev.numGpuDevices : ev.numCpuDevices;
Gilbert Lee's avatar
Gilbert Lee committed
2440
2441

    for (int dstIndex = 0; dstIndex < numDevices; ++dstIndex)
gilbertlee-amd's avatar
gilbertlee-amd committed
2442
      dstList.push_back(std::make_pair(dstType, dstIndex));
Gilbert Lee's avatar
Gilbert Lee committed
2443
2444
2445
  }
  int numDsts = dstList.size();

2446
2447
  // Build array of possibilities, respecting any additional restrictions (e.g. XGMI hop count)
  struct TransferInfo
Gilbert Lee's avatar
Gilbert Lee committed
2448
  {
gilbertlee-amd's avatar
gilbertlee-amd committed
2449
2450
2451
    MemType srcType; int srcIndex;
    ExeType exeType; int exeIndex;
    MemType dstType; int dstIndex;
2452
2453
2454
2455
2456
2457
2458
2459
  };

  // If either XGMI minimum is non-zero, or XGMI maximum is specified and non-zero then both links must be XGMI
  bool const useXgmiOnly = (ev.sweepXgmiMin > 0 || ev.sweepXgmiMax > 0);

  std::vector<TransferInfo> possibleTransfers;
  TransferInfo tinfo;
  for (int i = 0; i < numExes; ++i)
Gilbert Lee's avatar
Gilbert Lee committed
2460
  {
2461
2462
    // Skip CPU executors if XGMI link must be used
    if (useXgmiOnly && !IsGpuType(exeList[i].first)) continue;
gilbertlee-amd's avatar
gilbertlee-amd committed
2463
2464
    tinfo.exeType  = exeList[i].first;
    tinfo.exeIndex = exeList[i].second;
2465

gilbertlee-amd's avatar
gilbertlee-amd committed
2466
    bool isXgmiSrc  = false;
2467
2468
2469
2470
2471
2472
2473
    int  numHopsSrc = 0;
    for (int j = 0; j < numSrcs; ++j)
    {
      if (IsGpuType(exeList[i].first) && IsGpuType(srcList[j].first))
      {
        if (exeList[i].second != srcList[j].second)
        {
2474
2475
2476
#if defined(__NVCC__)
          isXgmiSrc = false;
#else
2477
          uint32_t exeToSrcLinkType, exeToSrcHopCount;
gilbertlee-amd's avatar
gilbertlee-amd committed
2478
2479
          HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false),
                                                RemappedIndex(srcList[j].second, false),
2480
2481
2482
2483
                                                &exeToSrcLinkType,
                                                &exeToSrcHopCount));
          isXgmiSrc = (exeToSrcLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
          if (isXgmiSrc) numHopsSrc = exeToSrcHopCount;
2484
#endif
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
        }
        else
        {
          isXgmiSrc = true;
          numHopsSrc = 0;
        }

        // Skip this SRC if it is not XGMI but only XGMI links may be used
        if (useXgmiOnly && !isXgmiSrc) continue;

        // Skip this SRC if XGMI distance is already past limit
        if (ev.sweepXgmiMax >= 0 && isXgmiSrc && numHopsSrc > ev.sweepXgmiMax) continue;
      }
      else if (useXgmiOnly) continue;

gilbertlee-amd's avatar
gilbertlee-amd committed
2500
2501
      tinfo.srcType  = srcList[j].first;
      tinfo.srcIndex = srcList[j].second;
2502
2503
2504
2505
2506
2507
2508
2509
2510

      bool isXgmiDst = false;
      int  numHopsDst = 0;
      for (int k = 0; k < numDsts; ++k)
      {
        if (IsGpuType(exeList[i].first) && IsGpuType(dstList[k].first))
        {
          if (exeList[i].second != dstList[k].second)
          {
2511
2512
2513
#if defined(__NVCC__)
            isXgmiSrc = false;
#else
2514
            uint32_t exeToDstLinkType, exeToDstHopCount;
gilbertlee-amd's avatar
gilbertlee-amd committed
2515
2516
            HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false),
                                                  RemappedIndex(dstList[k].second, false),
2517
2518
2519
2520
                                                  &exeToDstLinkType,
                                                  &exeToDstHopCount));
            isXgmiDst = (exeToDstLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
            if (isXgmiDst) numHopsDst = exeToDstHopCount;
2521
#endif
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
          }
          else
          {
            isXgmiDst = true;
            numHopsDst = 0;
          }
        }

        // Skip this DST if it is not XGMI but only XGMI links may be used
        if (useXgmiOnly && !isXgmiDst) continue;

        // Skip this DST if total XGMI distance (SRC + DST) is less than min limit
        if (ev.sweepXgmiMin > 0 && (numHopsSrc + numHopsDst < ev.sweepXgmiMin)) continue;

        // Skip this DST if total XGMI distance (SRC + DST) is greater than max limit
        if (ev.sweepXgmiMax >= 0 && (numHopsSrc + numHopsDst) > ev.sweepXgmiMax) continue;

2539
2540
2541
2542
2543
2544
#if defined(__NVCC__)
        // Skip CPU executors on GPU memory on NVIDIA platform
        if (IsCpuType(exeList[i].first) && (IsGpuType(dstList[j].first) || IsGpuType(dstList[k].first)))
          continue;
#endif

gilbertlee-amd's avatar
gilbertlee-amd committed
2545
2546
        tinfo.dstType  = dstList[k].first;
        tinfo.dstIndex = dstList[k].second;
2547
2548
2549
2550

        possibleTransfers.push_back(tinfo);
      }
    }
Gilbert Lee's avatar
Gilbert Lee committed
2551
2552
  }

2553
2554
2555
  int const numPossible = (int)possibleTransfers.size();
  int maxParallelTransfers = (ev.sweepMax == 0 ? numPossible : ev.sweepMax);

Gilbert Lee's avatar
Gilbert Lee committed
2556
2557
2558
2559
2560
2561
  if (ev.sweepMin > numPossible)
  {
    printf("No valid test configurations exist\n");
    return;
  }

2562
2563
2564
2565
2566
2567
  if (ev.outputToCsv)
  {
    printf("\nTest#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),"
           "ExeToSrcLinkType,ExeToDstLinkType,SrcAddr,DstAddr\n");
  }

Gilbert Lee's avatar
Gilbert Lee committed
2568
2569
  int numTestsRun = 0;
  int M = ev.sweepMin;
gilbertlee-amd's avatar
gilbertlee-amd committed
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
  std::uniform_int_distribution<int> randSize(1, numBytesPerTransfer / sizeof(float));
  std::uniform_int_distribution<int> distribution(ev.sweepMin, maxParallelTransfers);

  // Log sweep to configuration file
  FILE *fp = fopen("lastSweep.cfg", "w");
  if (!fp)
  {
    printf("[ERROR] Unable to open lastSweep.cfg.  Check permissions\n");
    exit(1);
  }

Gilbert Lee's avatar
Gilbert Lee committed
2581
2582
2583
2584
2585
2586
2587
2588
2589
  // Create bitmask of numPossible triplets, of which M will be chosen
  std::string bitmask(M, 1);  bitmask.resize(numPossible, 0);
  auto cpuStart = std::chrono::high_resolution_clock::now();
  while (1)
  {
    if (isRandom)
    {
      // Pick random number of simultaneous transfers to execute
      // NOTE: This currently skews distribution due to some #s having more possibilities than others
gilbertlee-amd's avatar
gilbertlee-amd committed
2590
      M = distribution(*ev.generator);
Gilbert Lee's avatar
Gilbert Lee committed
2591
2592
2593
2594

      // Generate a random bitmask
      for (int i = 0; i < numPossible; i++)
        bitmask[i] = (i < M) ? 1 : 0;
2595
      std::shuffle(bitmask.begin(), bitmask.end(), *ev.generator);
Gilbert Lee's avatar
Gilbert Lee committed
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
    }

    // Convert bitmask to list of Transfers
    std::vector<Transfer> transfers;
    for (int value = 0; value < numPossible; ++value)
    {
      if (bitmask[value])
      {
        // Convert integer value to (SRC->EXE->DST) triplet
        Transfer transfer;
gilbertlee-amd's avatar
gilbertlee-amd committed
2606
2607
2608
2609
2610
        transfer.numSrcs        = 1;
        transfer.numDsts        = 1;
        transfer.srcType        = {possibleTransfers[value].srcType};
        transfer.srcIndex       = {possibleTransfers[value].srcIndex};
        transfer.exeType        = possibleTransfers[value].exeType;
2611
        transfer.exeIndex       = possibleTransfers[value].exeIndex;
2612
        transfer.exeSubIndex    = -1;
gilbertlee-amd's avatar
gilbertlee-amd committed
2613
2614
2615
        transfer.dstType        = {possibleTransfers[value].dstType};
        transfer.dstIndex       = {possibleTransfers[value].dstIndex};
        transfer.numSubExecs    = IsGpuType(transfer.exeType) ? numGpuSubExecs : numCpuSubExecs;
gilbertlee-amd's avatar
gilbertlee-amd committed
2616
        transfer.numBytes       = ev.sweepRandBytes ? randSize(*ev.generator) * sizeof(float) : 0;
Gilbert Lee's avatar
Gilbert Lee committed
2617
2618
2619
2620
        transfers.push_back(transfer);
      }
    }

gilbertlee-amd's avatar
gilbertlee-amd committed
2621
2622
    LogTransfers(fp, ++numTestsRun, transfers);
    ExecuteTransfers(ev, numTestsRun, numBytesPerTransfer / sizeof(float), transfers);
Gilbert Lee's avatar
Gilbert Lee committed
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653

    // Check for test limit
    if (numTestsRun == ev.sweepTestLimit)
    {
      printf("Test limit reached\n");
      break;
    }

    // Check for time limit
    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
    double totalCpuTime = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
    if (ev.sweepTimeLimit && totalCpuTime > ev.sweepTimeLimit)
    {
      printf("Time limit exceeded\n");
      break;
    }

    // Increment bitmask if not random sweep
    if (!isRandom && !std::prev_permutation(bitmask.begin(), bitmask.end()))
    {
      M++;
      // Check for completion
      if (M > maxParallelTransfers)
      {
        printf("Sweep complete\n");
        break;
      }
      for (int i = 0; i < numPossible; i++)
        bitmask[i] = (i < M) ? 1 : 0;
    }
  }
gilbertlee-amd's avatar
gilbertlee-amd committed
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
  fclose(fp);
}

void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers)
{
  fprintf(fp, "# Test %d\n", testNum);
  fprintf(fp, "%d", -1 * (int)transfers.size());
  for (auto const& transfer : transfers)
  {
    fprintf(fp, " (%c%d->%c%d->%c%d %d %lu)",
gilbertlee-amd's avatar
gilbertlee-amd committed
2664
2665
2666
2667
            MemTypeStr[transfer.srcType[0]], transfer.srcIndex[0],
            ExeTypeStr[transfer.exeType],    transfer.exeIndex,
            MemTypeStr[transfer.dstType[0]], transfer.dstIndex[0],
            transfer.numSubExecs,
gilbertlee-amd's avatar
gilbertlee-amd committed
2668
2669
2670
2671
            transfer.numBytes);
  }
  fprintf(fp, "\n");
  fflush(fp);
Gilbert Lee's avatar
Gilbert Lee committed
2672
}
gilbertlee-amd's avatar
gilbertlee-amd committed
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683

std::string PtrVectorToStr(std::vector<float*> const& strVector, int const initOffset)
{
  std::stringstream ss;
  for (int i = 0; i < strVector.size(); ++i)
  {
    if (i) ss << " ";
    ss << (strVector[i] + initOffset);
  }
  return ss.str();
}