/*
Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once

#include <unistd.h>

#include <cctype>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include "Compatibility.hpp"
#include "EnvVars.hpp"

Gilbert Lee's avatar
Gilbert Lee committed
38
// Simple configuration parameters

// Amount of data transferred per Transfer (1 << 26 bytes = 64 MiB).
// The shift is performed in size_t so the expression never depends on the width of int.
constexpr size_t DEFAULT_BYTES_PER_TRANSFER = size_t{1} << 26;

// Upper bound on a line length, in characters.
// NOTE(review): presumably sizes the buffer used when reading Transfer configuration lines - confirm at the use site.
// Kept as a macro so that any preprocessor / array-size uses keep working.
#define MAX_LINE_LEN 32768

Gilbert Lee's avatar
Gilbert Lee committed
43
44
45
// Different src/dst memory types supported
// NOTE: CharToMemType() converts a character's offset within MemTypeStr directly into one of
//       these values, so the numeric values and the order of MemTypeStr must stay in sync.
enum MemType
{
  MEM_CPU          = 0, // Coarse-grained pinned CPU memory
  MEM_GPU          = 1, // Coarse-grained global GPU memory
  MEM_CPU_FINE     = 2, // Fine-grained pinned CPU memory
  MEM_GPU_FINE     = 3, // Fine-grained global GPU memory
  MEM_CPU_UNPINNED = 4, // Unpinned CPU memory
  MEM_NULL         = 5, // NULL memory - used for empty
  MEM_MANAGED      = 6  // Managed memory (classified as a GPU memory type by IsGpuType)
};

gilbertlee-amd's avatar
gilbertlee-amd committed
55
// Different executor types supported
// NOTE: CharToExeType() converts a character's offset within ExeTypeStr directly into one of
//       these values, so the numeric values and the order of ExeTypeStr must stay in sync.
enum ExeType
{
  EXE_CPU          = 0, // CPU executor              (subExecutor = CPU thread)
  EXE_GPU_GFX      = 1, // GPU kernel-based executor (subExecutor = threadblock/CU)
  EXE_GPU_DMA      = 2  // GPU SDMA-based executor   (subExecutor = streams)
};
Gilbert Lee's avatar
Gilbert Lee committed
61

62
bool IsGpuType(MemType m) { return (m == MEM_GPU || m == MEM_GPU_FINE || m == MEM_MANAGED); }
gilbertlee-amd's avatar
gilbertlee-amd committed
63
64
65
66
bool IsCpuType(MemType m) { return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED); };
bool IsGpuType(ExeType e) { return (e == EXE_GPU_GFX || e == EXE_GPU_DMA); };
bool IsCpuType(ExeType e) { return (e == EXE_CPU); };

67
// Single-character codes for each memory type; the character at index i corresponds to the
// MemType whose numeric value is i (CharToMemType relies on this ordering).
char const MemTypeStr[8] = "CGBFUNM";

// Single-character codes for each executor type; the character at index i corresponds to the
// ExeType whose numeric value is i (CharToExeType relies on this ordering).
char const ExeTypeStr[4] = "CGD";

// Three-letter display names, in the same order as ExeTypeStr
char const ExeTypeName[3][4] = {"CPU", "GPU", "DMA"};
Gilbert Lee's avatar
Gilbert Lee committed
70

Gilbert Lee's avatar
Gilbert Lee committed
71
72
// Converts a single-character memory type code (case-insensitive) into a MemType.
// The result is the offset of the character within MemTypeStr.
// Prints an error message and terminates the process with exit code 1 if the
// character is not a recognized memory type code.
MemType inline CharToMemType(char const c)
{
  // strchr() considers the terminating '\0' to be part of the searched string, so a NUL
  // input would otherwise "match" one past the last code and yield an out-of-range MemType.
  if (c != '\0') {
    // toupper() is only defined for values representable as unsigned char (or EOF),
    // so convert first to avoid undefined behavior on negative char values.
    char const* val = strchr(MemTypeStr, toupper(static_cast<unsigned char>(c)));
    if (val) return static_cast<MemType>(val - MemTypeStr);
  }

  printf("[ERROR] Unexpected memory type (%c)\n", c);
  exit(1);
}

gilbertlee-amd's avatar
gilbertlee-amd committed
79
// Converts a single-character executor type code (case-insensitive) into an ExeType.
// The result is the offset of the character within ExeTypeStr.
// Prints an error message and terminates the process with exit code 1 if the
// character is not a recognized executor type code.
ExeType inline CharToExeType(char const c)
{
  // strchr() considers the terminating '\0' to be part of the searched string, so a NUL
  // input would otherwise "match" one past the last code and yield an out-of-range ExeType.
  if (c != '\0') {
    // toupper() is only defined for values representable as unsigned char (or EOF),
    // so convert first to avoid undefined behavior on negative char values.
    char const* val = strchr(ExeTypeStr, toupper(static_cast<unsigned char>(c)));
    if (val) return static_cast<ExeType>(val - ExeTypeStr);
  }

  printf("[ERROR] Unexpected executor type (%c)\n", c);
  exit(1);
}
Gilbert Lee's avatar
Gilbert Lee committed
86

gilbertlee-amd's avatar
gilbertlee-amd committed
87
88
// Each Transfer performs reads from source memory location(s), sums them (if multiple sources are specified)
// then writes the summation to each of the specified destination memory location(s)
Gilbert Lee's avatar
Gilbert Lee committed
89
struct Transfer
Gilbert Lee's avatar
Gilbert Lee committed
90
{
91
  // Inputs
92
93
  ExeType                    exeType;            // Transfer executor type
  int                        exeIndex;           // Executor index (NUMA node for CPU / device ID for GPU)
94
  int                        exeSubIndex;        // Executor subindex
95
96
97
98
99
100
101
102
103
  int                        numSubExecs;        // Number of subExecutors to use for this Transfer
  size_t                     numBytes;           // # of bytes requested to Transfer (may be 0 to fallback to default)
  int                        numSrcs;            // Number of sources
  std::vector<MemType>       srcType;            // Source memory types
  std::vector<int>           srcIndex;           // Source device indice
  int                        numDsts;            // Number of destinations
  std::vector<MemType>       dstType;            // Destination memory type
  std::vector<int>           dstIndex;           // Destination device index

104
105
  // Outputs
  size_t                     numBytesActual;     // Actual number of bytes to copy
106
107
108
  double                     transferTime;       // Time taken in milliseconds for this transfer
  double                     transferBandwidth;  // Transfer bandwidth (GB/s)
  double                     executorBandwidth;  // Executor bandwidth (GB/s)
109
110
111
112
113
114
115
  std::vector<double>        perIterationTime;   // Per-iteration timing
  std::vector<std::set<std::pair<int,int>>> perIterationCUs; // Per-iteration CU usage

  // Internal
  int                        transferIndex;      // Transfer identifier (within a Test)
  std::vector<float*>        srcMem;             // Source memory
  std::vector<float*>        dstMem;             // Destination memory
116
117
  std::vector<SubExecParam>  subExecParam;       // Defines subarrays assigned to each threadblock
  SubExecParam*              subExecParamGpuPtr; // Pointer to GPU copy of subExecParam
118
  std::vector<int>           subExecIdx;         // Indicies into subExecParamGpu
119

120
121
122
123
124
125
126
127
#if !defined(__NVCC__)
  // For targeted-SDMA
  hsa_agent_t                dstAgent;           // DMA destination memory agent
  hsa_agent_t                srcAgent;           // DMA source memory agent
  hsa_signal_t               signal;             // HSA signal for completion
  hsa_amd_sdma_engine_id_t   sdmaEngineId;       // DMA engine ID
#endif

gilbertlee-amd's avatar
gilbertlee-amd committed
128
129
130
131
  // Prepares src/dst subarray pointers for each SubExecutor
  void PrepareSubExecParams(EnvVars const& ev);

  // Prepare source arrays with input data
132
  bool PrepareSrc(EnvVars const& ev);
gilbertlee-amd's avatar
gilbertlee-amd committed
133
134
135
136
137
138
139
140
141
142

  // Validate that destination data contains expected results
  void ValidateDst(EnvVars const& ev);

  // Prepare reference buffers
  void PrepareReference(EnvVars const& ev, std::vector<float>& buffer, int bufferIdx);

  // String representation functions
  std::string SrcToStr() const;
  std::string DstToStr() const;
Gilbert Lee's avatar
Gilbert Lee committed
143
144
145
146
};

struct ExecutorInfo
{
gilbertlee-amd's avatar
gilbertlee-amd committed
147
148
149
  std::vector<Transfer*>   transfers;        // Transfers to execute
  size_t                   totalBytes;       // Total bytes this executor transfers
  int                      totalSubExecs;    // Total number of subExecutors to use
Gilbert Lee's avatar
Gilbert Lee committed
150
151

  // For GPU-Executors
gilbertlee-amd's avatar
gilbertlee-amd committed
152
  SubExecParam*            subExecParamGpu;  // GPU copy of subExecutor parameters
Gilbert Lee's avatar
Gilbert Lee committed
153
154
155
156
157
158
159
160
  std::vector<hipStream_t> streams;
  std::vector<hipEvent_t>  startEvents;
  std::vector<hipEvent_t>  stopEvents;

  // Results
  double totalTime;
};

161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
// Aggregated results for a single executor.
// Scalar members carry default initializers so that a default-constructed ExeResult
// never exposes indeterminate values; aggregate initialization remains available (C++14 and later).
struct ExeResult
{
  double bandwidthGbs      = 0.0;   // Executor bandwidth (name suggests GB/s)
  double durationMsec      = 0.0;   // Executor duration (name suggests milliseconds)
  double sumBandwidthGbs   = 0.0;   // Summed bandwidth (presumably over this executor's Transfers - confirm)
  size_t totalBytes        = 0;     // Total number of bytes
  std::vector<int> transferIdx;     // Transfer indices associated with this result
};

// Results for one Test (a set of Transfers executed together).
// Scalar members carry default initializers so that a default-constructed TestResults
// never exposes indeterminate values.
struct TestResults
{
  size_t numTimedIterations    = 0;    // Number of timed iterations
  size_t totalBytesTransferred = 0;    // Total number of bytes transferred
  double totalBandwidthCpu     = 0.0;  // Total bandwidth (presumably CPU-timed - confirm at the definition)
  double totalDurationMsec     = 0.0;  // Total duration (name suggests milliseconds)
  double overheadMsec          = 0.0;  // Overhead (name suggests milliseconds)
  std::map<std::pair<ExeType, int>, ExeResult> exeResults; // Per-executor results, keyed by (executor type, index)
};

gilbertlee-amd's avatar
gilbertlee-amd committed
180
typedef std::pair<ExeType, int> Executor;
Gilbert Lee's avatar
Gilbert Lee committed
181
typedef std::map<Executor, ExecutorInfo> TransferMap;
Gilbert Lee's avatar
Gilbert Lee committed
182
183
184
185
186
187
188
189

// Display usage instructions
void DisplayUsage(char const* cmdName);

// Display detected GPU topology / CPU numa nodes
void DisplayTopology(bool const outputToCsv);

// Build array of test sizes based on sampling factor
// (valuesofN is the only non-const reference parameter, so it is presumably the output array)
void PopulateTestSizes(size_t const numBytesPerTransfer, int const samplingFactor,
                       std::vector<size_t>& valuesofN);

193
194
void ParseMemType(EnvVars const& ev, std::string const& token, std::vector<MemType>& memType, std::vector<int>& memIndex);
void ParseExeType(EnvVars const& ev, std::string const& token, ExeType& exeType, int& exeIndex, int& exeSubIndex);
Gilbert Lee's avatar
Gilbert Lee committed
195

196
void ParseTransfers(EnvVars const& ev, char* line, std::vector<Transfer>& transfers);
Gilbert Lee's avatar
Gilbert Lee committed
197

gilbertlee-amd's avatar
gilbertlee-amd committed
198
void ExecuteTransfers(EnvVars const& ev, int const testNum, size_t const N,
gilbertlee-amd's avatar
gilbertlee-amd committed
199
200
                      std::vector<Transfer>& transfers, bool verbose = true,
                      double* totalBandwidthCpu = nullptr);
201
202
TestResults ExecuteTransfersImpl(EnvVars const& ev, std::vector<Transfer>& transfers);
void ReportResults(EnvVars const& ev, std::vector<Transfer> const& transfers, TestResults const results);
Gilbert Lee's avatar
Gilbert Lee committed
203
204
void EnablePeerAccess(int const deviceId, int const peerDeviceId);
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr);
gilbertlee-amd's avatar
gilbertlee-amd committed
205
void DeallocateMemory(MemType memType, void* memPtr, size_t const size = 0);
Gilbert Lee's avatar
Gilbert Lee committed
206
void CheckPages(char* byteArray, size_t numBytes, int targetId);
207
void RunTransfer(EnvVars const& ev, int const iteration, ExecutorInfo& exeInfo, int const transferIdx);
gilbertlee-amd's avatar
gilbertlee-amd committed
208
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N);
209
void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int const maxSubExecs);
gilbertlee-amd's avatar
gilbertlee-amd committed
210
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExec, int const numCpuSubExec, bool const isRandom);
gilbertlee-amd's avatar
gilbertlee-amd committed
211
void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const numSubExecs);
212
void RunSchmooBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs);
213
void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus);
214
void RunParallelCopyBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus);
215
void RunHealthCheck(EnvVars ev);
Gilbert Lee's avatar
Gilbert Lee committed
216
217

std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
gilbertlee-amd's avatar
gilbertlee-amd committed
218
219

int RemappedIndex(int const origIdx, bool const isCpuType);
gilbertlee-amd's avatar
gilbertlee-amd committed
220
void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers);
gilbertlee-amd's avatar
gilbertlee-amd committed
221
std::string PtrVectorToStr(std::vector<float*> const& strVector, int const initOffset);