Unverified Commit 9658305f authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

Header-only TransferBench library refactor (#134)

parent b56d4817
...@@ -6,3 +6,4 @@ _static/ ...@@ -6,3 +6,4 @@ _static/
_templates/ _templates/
_toc.yml _toc.yml
docBin/ docBin/
TransferBench
...@@ -3,6 +3,30 @@ ...@@ -3,6 +3,30 @@
Documentation for TransferBench is available at Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench). [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.54
### Modified
- Refactored TransferBench into a header-only library combined with a thin client to facilitate the
use of TransferBench as the backend for other applications
- Optimized how data validation is handled - this should speed up Tests with many parallel transfers as data is only
generated once
- Preset benchmarks now no longer take in any extra command line arguments. Preset settings are only accessed via
environment variables. Details for each preset are printed
- The a2a preset benchmark now defaults to using fine-grained memory and GFX unroll of 2
- Refactored how Transfers are launched in parallel which has reduced some CPU-side overheads
- CPU and DMA executor timing now use CPU wall clock timing instead of slowest Transfer time
### Added
- New one2all preset which sweeps over all subsets of parallel transfers from one GPU to others
- Adding new warnings for DMA execution relating to how HIP will default to using agents from the source memory
### Removed
- CU scaling preset has been removed. Similar functionality already exists in the schmoo preset benchmark
- Preparation of source data via GFX kernel has been removed (USE_PREP_KERNEL)
- Removed GFX block-reordering (BLOCK_ORDER)
- Removed NUM_CPU_DEVICES and NUM_GPU_DEVICES from the common env vars; they are now only available in the presets they apply to.
- Removed SHARED_MEM_BYTES option for GFX executor
- Removed USE_PCIE_INDEX, and SHARED_MEM_BYTES
### Fixed
- Fixed a potential timing reporting issue when DMA executed Transfers end up getting serialized.
## v1.53 ## v1.53
### Added ### Added
- Added ability to specify NULL for sweep preset as source or destination memory type - Added ability to specify NULL for sweep preset as source or destination memory type
......
# Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. # Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
if (DEFINED ENV{ROCM_PATH}) if (DEFINED ENV{ROCM_PATH})
set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE STRING "ROCm install directory") set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE STRING "ROCm install directory")
else() else()
...@@ -6,13 +7,13 @@ else() ...@@ -6,13 +7,13 @@ else()
endif() endif()
cmake_minimum_required(VERSION 3.5) cmake_minimum_required(VERSION 3.5)
project(TransferBench VERSION 1.51.0 LANGUAGES CXX) project(TransferBench VERSION 1.54.0 LANGUAGES CXX)
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib") set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 --std=c++20 -L${ROCM_PATH}/lib")
include_directories(${ROCM_PATH}/include) include_directories(${ROCM_PATH}/include)
link_libraries(numa hsa-runtime64 pthread) link_libraries(numa hsa-runtime64 pthread)
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ..) set (CMAKE_RUNTIME_OUTPUT_DIRECTORY .)
add_executable(TransferBench src/TransferBench.cpp) add_executable(TransferBench src/client/Client.cpp)
target_include_directories(TransferBench PRIVATE src/include) target_include_directories(TransferBench PRIVATE src/header src/client src/client/Presets)
find_package(ROCM 0.8 REQUIRED PATHS ${ROCM_PATH}) find_package(ROCM 0.8 REQUIRED PATHS ${ROCM_PATH})
include(ROCMInstallTargets) include(ROCMInstallTargets)
......
# #
# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. # Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
# #
all: # Configuration options
cd src ; make ROCM_PATH ?= /opt/rocm
# Root of the CUDA toolkit; `?=` only assigns when the variable is not already set
CUDA_PATH ?= /usr/local/cuda
# Compiler drivers, located relative to the configured ROCm / CUDA roots
HIPCC=$(ROCM_PATH)/bin/hipcc
NVCC=$(CUDA_PATH)/bin/nvcc
# Compile TransferBenchCuda if nvcc detected
# (the shell test echoes "found" only when the nvcc binary exists at $(NVCC))
ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
EXE=TransferBenchCuda
else
EXE=TransferBench
endif
# Flags passed only to the hipcc build: ROCm headers/libraries plus libnuma and the HSA runtime
CXXFLAGS = -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
# Flags passed only to the nvcc build: `-x cu` compiles the .cpp client as CUDA source
NVFLAGS = -x cu -lnuma -arch=native
# Flags shared by both builds: optimization level, language standard, and the client/header include paths
COMMON_FLAGS = -O3 --std=c++20 -I./src/header -I./src/client -I./src/client/Presets
LDFLAGS += -lpthread
# Default target builds whichever executable was selected above
all: $(EXE)
# Both executables are built from the single client translation unit; the `find` prerequisite
# lists header files so that a header change triggers a rebuild.
# NOTE(review): the `\h` in the regex ".*\.\hpp" looks unintended (".*\.hpp" was probably meant) - confirm
TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
$(HIPCC) $(CXXFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)
TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
$(NVCC) $(NVFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)
clean: clean:
cd src ; make clean rm -f *.o ./TransferBench ./TransferBenchCuda
...@@ -8,8 +8,8 @@ import re ...@@ -8,8 +8,8 @@ import re
from rocm_docs import ROCmDocs from rocm_docs import ROCmDocs
with open('../src/include/EnvVars.hpp', encoding='utf-8') as f: with open('../src/header/TransferBench.hpp', encoding='utf-8') as f:
match = re.search(r'#define TB_VERSION "([0-9.]+)[^0-9.]+', f.read()) match = re.search(r'constexpr char VERSION\[\] = "([0-9.]+)[^0-9.]+', f.read())
if not match: if not match:
raise ValueError("VERSION not found!") raise ValueError("VERSION not found!")
version_number = match[1] version_number = match[1]
...@@ -18,7 +18,7 @@ left_nav_title = f"TransferBench {version_number} Documentation" ...@@ -18,7 +18,7 @@ left_nav_title = f"TransferBench {version_number} Documentation"
# for PDF output on Read the Docs # for PDF output on Read the Docs
project = "TransferBench Documentation" project = "TransferBench Documentation"
author = "Advanced Micro Devices, Inc." author = "Advanced Micro Devices, Inc."
copyright = "Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved." copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved."
version = version_number version = version_number
release = version_number release = version_number
......
...@@ -47,7 +47,7 @@ To build documentation locally, use: ...@@ -47,7 +47,7 @@ To build documentation locally, use:
.. code-block:: bash .. code-block:: bash
cd docs cd docs
pip3 install -r .sphinx/requirements.txt pip3 install -r ./sphinx/requirements.txt
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
NVIDIA platform support NVIDIA platform support
......
# Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
# Install roots for ROCm and CUDA; `?=` only assigns when the variable is not already set
ROCM_PATH ?= /opt/rocm
CUDA_PATH ?= /usr/local/cuda
# Compiler drivers, located relative to the configured install roots
HIPCC=$(ROCM_PATH)/bin/hipcc
NVCC=$(CUDA_PATH)/bin/nvcc
# Compile TransferBenchCuda if nvcc detected
# (the shell test echoes "found" only when the nvcc binary exists at $(NVCC))
ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
EXE=../TransferBenchCuda
else
EXE=../TransferBench
endif
# hipcc flags: local include directory, ROCm headers/libraries, libnuma and the HSA runtime
CXXFLAGS = -O3 -Iinclude -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
# nvcc flags: `-x cu` compiles the .cpp source as CUDA source
NVFLAGS = -O3 -Iinclude -x cu -lnuma -arch=native
LDFLAGS += -lpthread
# Default target builds whichever executable was selected above (written to the parent directory)
all: $(EXE)
# Each executable is built from TransferBench.cpp; the `find` prerequisite lists header files
# so that a header change triggers a rebuild.
# NOTE(review): the `\h` in the regex ".*\.\hpp" looks unintended (".*\.hpp" was probably meant) - confirm
../TransferBench: TransferBench.cpp $(shell find -regex ".*\.\hpp")
$(HIPCC) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
../TransferBenchCuda: TransferBench.cpp $(shell find -regex ".*\.\hpp")
$(NVCC) $(NVFLAGS) $< -o $@ $(LDFLAGS)
# Remove object files and both possible executables
clean:
rm -f *.o ../TransferBench ../TransferBenchCuda
This diff is collapsed.
/*
Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "Client.hpp"
#include "Presets.hpp"
#include "Topology.hpp"
#include <fstream>
int main(int argc, char **argv) {
// Collect environment variables
EnvVars ev;
// Display usage instructions and detected topology
if (argc <= 1) {
if (!ev.outputToCsv) {
DisplayUsage(argv[0]);
DisplayPresets();
}
DisplayTopology(ev.outputToCsv);
exit(0);
}
// Determine number of bytes to run per Transfer
size_t numBytesPerTransfer = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_TRANSFER;
if (argc > 2) {
// Adjust bytes if unit specified
char units = argv[2][strlen(argv[2])-1];
switch (units) {
case 'G': case 'g': numBytesPerTransfer *= 1024;
case 'M': case 'm': numBytesPerTransfer *= 1024;
case 'K': case 'k': numBytesPerTransfer *= 1024;
}
}
if (numBytesPerTransfer % 4) {
printf("[ERROR] numBytesPerTransfer (%lu) must be a multiple of 4\n", numBytesPerTransfer);
exit(1);
}
// Run preset benchmark if requested
if (RunPreset(ev, numBytesPerTransfer, argc, argv)) exit(0);
// Read input from command line or configuration file
std::vector<std::string> lines;
{
std::string line;
if (!strcmp(argv[1], "cmdline")) {
for (int i = 3; i < argc; i++)
line += std::string(argv[i]) + " ";
lines.push_back(line);
} else {
std::ifstream cfgFile(argv[1]);
if (!cfgFile.is_open()) {
printf("[ERROR] Unable to open transfer configuration file: [%s]\n", argv[1]);
exit(1);
}
while (std::getline(cfgFile, line))
lines.push_back(line);
cfgFile.close();
}
}
// Print environment variables and CSV header
ev.DisplayEnvVars();
if (ev.outputToCsv)
printf("Test#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),SrcAddr,DstAddr\n");
TransferBench::ConfigOptions cfgOptions = ev.ToConfigOptions();
TransferBench::TestResults results;
std::vector<ErrResult> errors;
// Process each line as a Test
int testNum = 0;
for (std::string const &line : lines) {
// Check if line is a comment to be echoed to output (starts with ##)
if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') printf("%s\n", line.c_str());
// Parse set of parallel Transfers to execute
std::vector<Transfer> transfers;
CheckForError(TransferBench::ParseTransfers(line, transfers));
if (transfers.empty()) continue;
// Check for variable sub-executors Transfers
int numVariableTransfers = 0;
int maxVarCount = 0;
{
std::map<ExeDevice, int> varTransferCount;
for (auto const& t : transfers) {
if (t.numSubExecs == 0) {
if (t.exeDevice.exeType != EXE_GPU_GFX) {
printf("[ERROR] Variable number of subexecutors is only supported on GFX executors\n");
exit(1);
}
numVariableTransfers++;
varTransferCount[t.exeDevice]++;
maxVarCount = max(maxVarCount, varTransferCount[t.exeDevice]);
}
}
if (numVariableTransfers > 0 && numVariableTransfers != transfers.size()) {
printf("[ERROR] All or none of the Transfers in the Test must use variable number of Subexecutors\n");
exit(1);
}
}
// Run the specified numbers of bytes otherwise generate a range of values
for (size_t bytes = (1<<10); bytes <= (1<<29); bytes *= 2) {
size_t deltaBytes = std::max(1UL, bytes / ev.samplingFactor);
size_t currBytes = (numBytesPerTransfer == 0) ? bytes : numBytesPerTransfer;
do {
for (auto& t : transfers)
t.numBytes = currBytes;
if (maxVarCount == 0) {
if (TransferBench::RunTransfers(cfgOptions, transfers, results)) {
PrintResults(ev, ++testNum, transfers, results);
}
PrintErrors(results.errResults);
} else {
// Variable subexecutors - Determine how many subexecutors to sweep up to
int maxNumVarSubExec = ev.maxNumVarSubExec;
if (maxNumVarSubExec == 0) {
maxNumVarSubExec = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0}) / maxVarCount;
}
TransferBench::TestResults bestResults;
std::vector<Transfer> bestTransfers;
for (int numSubExecs = ev.minNumVarSubExec; numSubExecs <= maxNumVarSubExec; numSubExecs++) {
std::vector<Transfer> tempTransfers = transfers;
for (auto& t : tempTransfers) {
if (t.numSubExecs == 0) t.numSubExecs = numSubExecs;
}
TransferBench::TestResults tempResults;
if (!TransferBench::RunTransfers(cfgOptions, tempTransfers, tempResults)) {
PrintErrors(tempResults.errResults);
} else {
if (tempResults.avgTotalBandwidthGbPerSec > bestResults.avgTotalBandwidthGbPerSec) {
bestResults = tempResults;
bestTransfers = tempTransfers;
}
}
}
PrintResults(ev, ++testNum, bestTransfers, bestResults);
PrintErrors(bestResults.errResults);
}
if (numBytesPerTransfer != 0) break;
currBytes += deltaBytes;
} while (currBytes < bytes * 2);
if (numBytesPerTransfer != 0) break;
}
}
}
void DisplayUsage(char const* cmdName)
{
printf("TransferBench Client v%s (Backend v%s)\n", CLIENT_VERSION, TransferBench::VERSION);
printf("========================================\n");
if (numa_available() == -1)
{
printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
exit(1);
}
printf("Usage: %s config <N>\n", cmdName);
printf(" config: Either:\n");
printf(" - Filename of configFile containing Transfers to execute (see example.cfg for format)\n");
printf(" - Name of preset config:\n");
printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
DEFAULT_BYTES_PER_TRANSFER);
printf(" If 0 is specified, a range of Ns will be benchmarked\n");
printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n");
printf("\n");
EnvVars::DisplayUsage();
}
std::string MemDevicesToStr(std::vector<MemDevice> const& memDevices) {
if (memDevices.empty()) return "N";
std::stringstream ss;
for (auto const& m : memDevices)
ss << TransferBench::MemTypeStr[m.memType] << m.memIndex;
return ss.str();
}
// Prints the results of one Test: one line per executor, one line per Transfer run by
// that executor, optional per-iteration details (when ev.showIterations is set), and a
// final aggregate line.
//   ev        - client settings; outputToCsv selects ',' instead of '|' as column separator
//   testNum   - test number shown in the "Test N:" heading (heading is skipped for CSV output)
//   transfers - the Transfers that were executed; indexed by the values in exeResult.transferIdx
//   results   - results for those Transfers
// Exits with status 1 if per-iteration output is requested but the number of recorded
// iterations does not match results.numTimedIterations.
void PrintResults(EnvVars const& ev, int const testNum,
                  std::vector<Transfer> const& transfers,
                  TransferBench::TestResults const& results)
{
  char   const sep                = ev.outputToCsv ? ',' : '|';
  size_t const numTimedIterations = results.numTimedIterations;

  if (!ev.outputToCsv) printf("Test %d:\n", testNum);

  // Loop over each executor (bound by reference to avoid copying each map entry)
  for (auto const& exeInfoPair : results.exeResults) {
    ExeDevice const& exeDevice = exeInfoPair.first;
    ExeResult const& exeResult = exeInfoPair.second;
    ExeType   const  exeType   = exeDevice.exeType;
    int32_t   const  exeIndex  = exeDevice.exeIndex;

    printf(" Executor: %3s %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %-7.3f GB/s (sum)\n",
           ExeTypeName[exeType], exeIndex, sep, exeResult.avgBandwidthGbPerSec, sep,
           exeResult.avgDurationMsec, sep, exeResult.numBytes, sep, exeResult.sumBandwidthGbPerSec);

    // Loop over each Transfer executed by this executor
    for (int idx : exeResult.transferIdx) {
      Transfer       const& t = transfers[idx];
      TransferResult const& r = results.tfrResults[idx];

      // Optional ".<subIndex>" suffix; left empty when no sub-index was specified (-1)
      char exeSubIndexStr[32] = "";
      if (t.exeSubIndex != -1)
        snprintf(exeSubIndexStr, sizeof(exeSubIndexStr), ".%d", t.exeSubIndex);

      printf(" Transfer %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %s -> %s%02d%s:%03d -> %s\n",
             idx, sep,
             r.avgBandwidthGbPerSec, sep,
             r.avgDurationMsec, sep,
             r.numBytes, sep,
             MemDevicesToStr(t.srcs).c_str(), ExeTypeName[exeType], exeIndex,
             exeSubIndexStr, t.numSubExecs, MemDevicesToStr(t.dsts).c_str());

      // Show per-iteration timing information
      if (ev.showIterations) {
        // Check that per-iteration information exists
        if (r.perIterMsec.size() != numTimedIterations) {
          printf("[ERROR] Per iteration timing data unavailable: Expected %lu data points, but have %lu\n",
                 numTimedIterations, r.perIterMsec.size());
          exit(1);
        }

        // Compute standard deviation and track iterations by speed
        // (the set orders entries by duration, then by 1-based iteration number)
        std::set<std::pair<double, int>> times;
        double stdDevTime = 0;
        double stdDevBw   = 0;
        for (int i = 0; i < static_cast<int>(numTimedIterations); i++) {
          times.insert(std::make_pair(r.perIterMsec[i], i+1));
          double const varTime = fabs(r.avgDurationMsec - r.perIterMsec[i]);
          stdDevTime += varTime * varTime;

          double const iterBandwidthGbs = (t.numBytes / 1.0E9) / r.perIterMsec[i] * 1000.0f;
          double const varBw = fabs(iterBandwidthGbs - r.avgBandwidthGbPerSec);
          stdDevBw += varBw * varBw;
        }
        stdDevTime = sqrt(stdDevTime / numTimedIterations);
        stdDevBw   = sqrt(stdDevBw / numTimedIterations);

        // Loop over iterations (fastest to slowest)
        for (auto const& time : times) {
          double const iterDurationMsec = time.first;
          double const iterBandwidthGbs = (t.numBytes / 1.0E9) / iterDurationMsec * 1000.0f;
          printf(" Iter %03d %c %7.3f GB/s %c %8.3f ms %c", time.second, sep, iterBandwidthGbs, sep, iterDurationMsec, sep);

          // Per-iteration CU information is only printed when it was recorded for this iteration
          std::set<int> usedXccs;
          if (static_cast<size_t>(time.second - 1) < r.perIterCUs.size()) {
            printf(" CUs:");
            for (auto const& x : r.perIterCUs[time.second - 1]) {
              printf(" %02d:%02d", x.first, x.second);
              usedXccs.insert(x.first);
            }
          }
          printf(" XCCs:");
          for (auto x : usedXccs)
            printf(" %02d", x);
          printf("\n");
        }
        printf(" StandardDev %c %7.3f GB/s %c %8.3f ms %c\n", sep, stdDevBw, sep, stdDevTime, sep);
      }
    }
  }
  printf(" Aggregate (CPU) %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c Overhead: %.3f ms\n",
         sep, results.avgTotalBandwidthGbPerSec,
         sep, results.avgTotalDurationMsec,
         sep, results.totalBytesTransferred,
         sep, results.overheadMsec);
}
// Reports a single backend result:
//   ERR_NONE  - nothing is printed
//   ERR_WARN  - prints the message with a [WARN] prefix and returns
//   ERR_FATAL - prints the message with an [ERROR] prefix and exits with status 1
// Any other error type is ignored.
void CheckForError(ErrResult const& error)
{
  if (error.errType == ERR_NONE) return;

  if (error.errType == ERR_WARN) {
    printf("[WARN] %s\n", error.errMsg.c_str());
    return;
  }

  if (error.errType == ERR_FATAL) {
    printf("[ERROR] %s\n", error.errMsg.c_str());
    exit(1);
  }
}
void PrintErrors(std::vector<ErrResult> const& errors)
{
bool isFatal = false;
for (auto const& err : errors) {
printf("[%s] %s\n", err.errType == ERR_FATAL ? "ERROR" : "WARN", err.errMsg.c_str());
isFatal |= (err.errType == ERR_FATAL);
}
if (isFatal) exit(1);
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
// TransferBench client version
#define CLIENT_VERSION "1.54.00"
#include "TransferBench.hpp"
#include "EnvVars.hpp"
// Number of bytes copied per Transfer when no size is given on the command line
// (1<<26 = 67108864 bytes = 64 MiB)
size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<26);
// Short display name for each executor type; indexed by an ExeType value when printing results.
// Each entry is 3 characters plus the terminating NUL, hence the inner dimension of 4
char const ExeTypeName[4][4] = {"CPU", "GPU", "DMA", "IBV"};
// Display detected hardware
//   outputToCsv - whether CSV output was requested
void DisplayTopology(bool outputToCsv);
// Display usage instructions
//   cmdName - executable name to show in the usage line
void DisplayUsage(char const* cmdName);
// Print TransferBench test results
//   ev        - client environment settings (controls CSV vs. text output and per-iteration details)
//   testNum   - test number shown in the output
//   transfers - the Transfers that were executed
//   results   - results reported by the backend for those Transfers
void PrintResults(EnvVars const& ev, int const testNum,
std::vector<Transfer> const& transfers,
TransferBench::TestResults const& results);
// Helper function that converts MemDevices to a string
// Returns "N" for an empty list
std::string MemDevicesToStr(std::vector<MemDevice> const& memDevices);
// Helper function to print warning / exit on fatal error
void CheckForError(ErrResult const& error);
// Helper function to print list of errors
// Exits with status 1 after printing if any entry is fatal
void PrintErrors(std::vector<ErrResult> const& errors);
This diff is collapsed.
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "EnvVars.hpp"
// All-to-all preset: runs one Transfer for every selected (source GPU, destination GPU)
// pair in parallel and prints a bandwidth matrix plus summary statistics.
//
//   ev                  - client settings; this preset forces single-stream mode and sets the
//                         GFX unroll factor to 2 unless GFX_UNROLL is set
//   numBytesPerTransfer - number of bytes for each Transfer
//   presetName          - name of the preset (not used by this function)
//
// Preset-specific environment variables (default in parentheses):
//   A2A_DIRECT (1), A2A_LOCAL (0), A2A_MODE (0), NUM_GPU_DEVICES (detected GPU count),
//   NUM_SUB_EXEC (8), USE_DMA_EXEC (0), USE_FINE_GRAIN (1), USE_REMOTE_READ (0)
//
// Exits with status 1 on invalid settings or when the Transfers fail to run.
void AllToAllPreset(EnvVars&           ev,
                    size_t      const  numBytesPerTransfer,
                    std::string const  presetName)
{
  enum
  {
    A2A_COPY       = 0,
    A2A_READ_ONLY  = 1,
    A2A_WRITE_ONLY = 2
  };
  char a2aModeStr[3][20] = {"Copy", "Read-Only", "Write-Only"};

  // Force single-stream mode for all-to-all benchmark
  ev.useSingleStream = 1;

  // Force to gfx unroll 2 unless explicitly set
  ev.gfxUnroll = EnvVars::GetEnvVar("GFX_UNROLL", 2);

  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);

  // Collect env vars for this preset
  int a2aDirect     = EnvVars::GetEnvVar("A2A_DIRECT"     , 1);
  int a2aLocal      = EnvVars::GetEnvVar("A2A_LOCAL"      , 0);
  int a2aMode       = EnvVars::GetEnvVar("A2A_MODE"       , 0);
  int numGpus       = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
  int numSubExecs   = EnvVars::GetEnvVar("NUM_SUB_EXEC"   , 8);
  int useDmaExec    = EnvVars::GetEnvVar("USE_DMA_EXEC"   , 0);
  int useFineGrain  = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);
  int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);

  // Validate env vars.
  // This must happen before the values are printed: a2aMode is used below as an index
  // into a2aModeStr, so an out-of-range A2A_MODE would otherwise read past the array
  if (a2aMode < 0 || a2aMode > 2) {
    printf("[ERROR] a2aMode must be between 0 and 2\n");
    exit(1);
  }
  if (numGpus < 0 || numGpus > numDetectedGpus) {
    printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
    exit(1);
  }

  // Print off environment variables
  ev.DisplayEnvVars();
  if (!ev.hideEnv) {
    if (!ev.outputToCsv) printf("[AllToAll Related]\n");
    ev.Print("A2A_DIRECT"     , a2aDirect    , a2aDirect ? "Only using direct links" : "Full all-to-all");
    ev.Print("A2A_LOCAL"      , a2aLocal     , "%s local transfers", a2aLocal ? "Include" : "Exclude");
    ev.Print("A2A_MODE"       , a2aMode      , a2aModeStr[a2aMode]);
    ev.Print("NUM_GPU_DEVICES", numGpus      , "Using %d GPUs", numGpus);
    ev.Print("NUM_SUB_EXEC"   , numSubExecs  , "Using %d subexecutors/CUs per Transfer", numSubExecs);
    ev.Print("USE_DMA_EXEC"   , useDmaExec   , "Using %s executor", useDmaExec ? "DMA" : "GFX");
    ev.Print("USE_FINE_GRAIN" , useFineGrain , "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
    ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC");
    printf("\n");
  }

  // Read-only Transfers have no destination; write-only Transfers have no source
  int const numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1);
  int const numDsts = (a2aMode == A2A_READ_ONLY  ? 0 : 1);

  MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
  ExeType exeType = useDmaExec   ? EXE_GPU_DMA  : EXE_GPU_GFX;

  // Maps a (src GPU, dst GPU) pair to the index of its Transfer in 'transfers'
  std::map<std::pair<int, int>, int> reIndex;
  std::vector<Transfer> transfers;
  for (int i = 0; i < numGpus; i++) {
    for (int j = 0; j < numGpus; j++) {

      // Check whether or not to execute this pair
      if (i == j) {
        if (!a2aLocal) continue;
      } else if (a2aDirect) {
#if !defined(__NVCC__)
        // Only keep pairs that are a single hop apart (check is compiled out for NVCC builds)
        uint32_t linkType, hopCount;
        HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
        if (hopCount != 1) continue;
#endif
      }

      // Build Transfer and add it to list
      TransferBench::Transfer transfer;
      transfer.numBytes = numBytesPerTransfer;
      if (numSrcs) transfer.srcs.push_back({memType, i});
      if (numDsts) transfer.dsts.push_back({memType, j});
      transfer.exeDevice   = {exeType, (useRemoteRead ? j : i)};
      transfer.exeSubIndex = -1;
      transfer.numSubExecs = numSubExecs;

      reIndex[std::make_pair(i,j)] = static_cast<int>(transfers.size());
      transfers.push_back(transfer);
    }
  }

  printf("GPU-GFX All-To-All benchmark:\n");
  printf("==========================\n");
  printf("- Copying %zu bytes between %s pairs of GPUs using %d CUs (%zu Transfers)\n",
         numBytesPerTransfer, a2aDirect ? "directly connected" : "all", numSubExecs, transfers.size());
  if (transfers.size() == 0) return;

  // Execute Transfers
  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
  TransferBench::TestResults results;
  if (!TransferBench::RunTransfers(cfg, transfers, results)) {
    for (auto const& err : results.errResults)
      printf("%s\n", err.errMsg.c_str());
    // A failed run is reported with a non-zero status, matching the other error paths
    exit(1);
  } else {
    PrintResults(ev, 1, transfers, results);
  }

  // Print results
  char separator = (ev.outputToCsv ? ',' : ' ');
  printf("\nSummary: [%zu bytes per Transfer]\n", numBytesPerTransfer);
  printf("==========================================================\n");
  printf("SRC\\DST ");
  for (int dst = 0; dst < numGpus; dst++)
    printf("%cGPU %02d ", separator, dst);
  printf(" %cSTotal %cActual\n", separator, separator);

  double totalBandwidthGpu    = 0.0;
  double minExecutorBandwidth = std::numeric_limits<double>::max();
  double maxExecutorBandwidth = 0.0;
  // One running total per destination column, plus a final slot for the grand total
  std::vector<double> colTotalBandwidth(numGpus+1, 0.0);
  for (int src = 0; src < numGpus; src++) {
    double rowTotalBandwidth = 0;
    double executorBandwidth = 0;
    printf("GPU %02d", src);
    for (int dst = 0; dst < numGpus; dst++) {
      auto const entry = reIndex.find(std::make_pair(src, dst));
      if (entry != reIndex.end()) {
        int const transferIdx = entry->second;
        TransferBench::TransferResult const& r = results.tfrResults[transferIdx];
        colTotalBandwidth[dst] += r.avgBandwidthGbPerSec;
        rowTotalBandwidth      += r.avgBandwidthGbPerSec;
        totalBandwidthGpu      += r.avgBandwidthGbPerSec;
        executorBandwidth       = std::max(executorBandwidth,
                                           results.exeResults[transfers[transferIdx].exeDevice].avgBandwidthGbPerSec);
        printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
      } else {
        // Pair was not selected for execution
        printf("%c%8s ", separator, "N/A");
      }
    }
    printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator, executorBandwidth);
    minExecutorBandwidth = std::min(minExecutorBandwidth, executorBandwidth);
    maxExecutorBandwidth = std::max(maxExecutorBandwidth, executorBandwidth);
    colTotalBandwidth[numGpus] += rowTotalBandwidth;
  }
  printf("\nRTotal");
  for (int dst = 0; dst < numGpus; dst++) {
    printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
  }
  printf(" %c%8.3f %c%8.3f %c%8.3f\n", separator, colTotalBandwidth[numGpus],
         separator, minExecutorBandwidth, separator, maxExecutorBandwidth);
  printf("\n");

  printf("Average bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu / transfers.size());
  printf("Aggregate bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu);
  printf("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);

  PrintErrors(results.errResults);
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
void HealthCheckPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
// Check for supported platforms
#if defined(__NVCC__)
printf("[WARN] healthcheck preset not supported on NVIDIA hardware\n");
return;
#endif
bool hasFail = false;
// Force use of single stream
ev.useSingleStream = 1;
TransferBench::TestResults results;
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
if (numGpuDevices != 8) {
printf("[WARN] healthcheck preset is currently only supported on 8-GPU MI300X hardware\n");
exit(1);
}
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, gpuId));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
if (!(archName == "gfx940" || archName == "gfx941" || archName == "gfx942"))
{
printf("[WARN] healthcheck preset is currently only supported on 8-GPU MI300X hardware\n");
exit(1);
}
}
// Pass limits
double udirLimit = getenv("LIMIT_UDIR") ? atof(getenv("LIMIT_UDIR")) : (int)(48 * 0.95);
double bdirLimit = getenv("LIMIT_BDIR") ? atof(getenv("LIMIT_BDIR")) : (int)(96 * 0.95);
double a2aLimit = getenv("LIMIT_A2A") ? atof(getenv("LIMIT_A2A")) : (int)(45 * 0.95);
// Run CPU to GPU
// Run unidirectional read from CPU to GPU
printf("Testing unidirectional reads from CPU ");
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
printf("[ERROR] Unable to detect closest CPU NUMA node to GPU %d\n", gpuId);
exit(1);
}
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeDevice = {EXE_GPU_GFX, gpuId};
t.numBytes = 64*1024*1024;
t.srcs = {{MEM_CPU, memIndex}};
t.dsts = {};
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t.numSubExecs = cu;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
bestResult = std::max(bestResult, results.tfrResults[0].avgBandwidthGbPerSec);
} else {
PrintErrors(results.errResults);
}
if (results.tfrResults[0].avgBandwidthGbPerSec >= udirLimit) {
passed = true;
break;
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
}
}
}
// Run unidirectional write from GPU to CPU
printf("Testing unidirectional writes to CPU ");
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
printf("[ERROR] Unable to detect closest CPU NUMA node to GPU %d\n", gpuId);
exit(1);
}
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeDevice = {EXE_GPU_GFX, gpuId};
t.numBytes = 64*1024*1024;
t.srcs = {};
t.dsts = {{MEM_CPU, memIndex}};
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t.numSubExecs = cu;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
bestResult = std::max(bestResult, results.tfrResults[0].avgBandwidthGbPerSec);
} else {
PrintErrors(results.errResults);
}
if (results.tfrResults[0].avgBandwidthGbPerSec >= udirLimit) {
passed = true;
break;
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
}
}
}
// Run bidirectional tests
printf("Testing bidirectional reads + writes ");
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
printf("[ERROR] Unable to detect closest CPU NUMA node to GPU %d\n", gpuId);
exit(1);
}
std::vector<Transfer> transfers(2);
Transfer& t0 = transfers[0];
Transfer& t1 = transfers[1];
t0.exeDevice = {EXE_GPU_GFX, gpuId};
t0.numBytes = 64*1024*1024;
t0.srcs = {{MEM_CPU, memIndex}};
t0.dsts = {};
t1.exeDevice = {EXE_GPU_GFX, gpuId};
t1.numBytes = 64*1024*1024;
t1.srcs = {};
t1.dsts = {{MEM_CPU, memIndex}};
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t0.numSubExecs = cu;
t1.numSubExecs = cu;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
double sum = (results.tfrResults[0].avgBandwidthGbPerSec +
results.tfrResults[1].avgBandwidthGbPerSec);
bestResult = std::max(bestResult, sum);
if (sum >= bdirLimit) {
passed = true;
break;
}
} else {
PrintErrors(results.errResults);
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, bdirLimit);
}
}
}
// Run XGMI tests:
printf("Testing all-to-all XGMI copies "); fflush(stdout);
{
// Force GFX unroll to 2 for MI300
ev.gfxUnroll = 2;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<Transfer> transfers;
for (int i = 0; i < numGpuDevices; i++) {
for (int j = 0; j < numGpuDevices; j++) {
if (i == j) continue;
Transfer t;
t.numBytes = 64*1024*1024;
t.numSubExecs = 8;
t.exeDevice = {EXE_GPU_GFX, i};
t.srcs = {{MEM_GPU_FINE, i}};
t.dsts = {{MEM_GPU_FINE, j}};
transfers.push_back(t);
}
}
std::vector<std::pair<std::pair<int,int>, double>> fails;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
int transferIdx = 0;
for (int i = 0; i < numGpuDevices; i++) {
printf("."); fflush(stdout);
for (int j = 0; j < numGpuDevices; j++) {
if (i == j) continue;
double bw = results.tfrResults[transferIdx].avgBandwidthGbPerSec;
if (bw < a2aLimit) {
fails.push_back(std::make_pair(std::make_pair(i,j), bw));
}
transferIdx++;
}
}
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d to GPU %02d: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first.first, p.first.second, p.second, a2aLimit);
}
}
}
exit(hasFail ? 1 : 0);
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/**
 * One-to-all preset benchmark.
 *
 * A single executor (EXE_INDEX) runs parallel Transfers against every subset of
 * the other devices in [0, NUM_GPU_DEVICES), for subset sizes SWEEP_MIN..SWEEP_MAX.
 * One result row is printed per subset, with one bandwidth column per peer device.
 *
 * Environment variables read by this preset (defaults in parentheses):
 *   NUM_GPU_DEVICES (number of detected GFX executors) - number of devices to sweep over
 *   NUM_GPU_SE      (4)   - subExecutors per Transfer
 *   EXE_INDEX       (0)   - index of the executing device
 *   SWEEP_DIR       (0)   - 0: SRC on the executor, DST on each peer; otherwise the reverse
 *   SWEEP_SRC       ("G") - SRC memory type characters to sweep ('N' = no SRC)
 *   SWEEP_EXE       ("G") - executor type characters to sweep (only 'G' and 'D' accepted)
 *   SWEEP_DST       ("G") - DST memory type characters to sweep ('N' = no DST)
 *   SWEEP_MIN       (1)   - smallest subset size
 *   SWEEP_MAX       (NUM_GPU_DEVICES) - largest subset size
 *
 * @param ev                  Common environment settings; converted to the ConfigOptions used for every run
 * @param numBytesPerTransfer Number of bytes assigned to each Transfer
 * @param presetName          Name of the preset (not used by this function)
 *
 * Terminates the process via exit(1) on invalid settings or if any run fails.
 */
void OneToAllPreset(EnvVars& ev,
                    size_t const numBytesPerTransfer,
                    std::string const presetName)
{
  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
  if (numDetectedGpus < 2) {
    printf("[ERROR] One-to-all benchmark requires machine with at least 2 GPUs\n");
    exit(1);
  }

  // Collect env vars for this preset
  int numGpuDevices    = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
  int numSubExecs      = EnvVars::GetEnvVar("NUM_GPU_SE", 4);
  int exeIndex         = EnvVars::GetEnvVar("EXE_INDEX", 0);
  int sweepDir         = EnvVars::GetEnvVar("SWEEP_DIR", 0);
  std::string sweepDst = EnvVars::GetEnvVar("SWEEP_DST", "G");
  std::string sweepExe = EnvVars::GetEnvVar("SWEEP_EXE", "G");
  std::string sweepSrc = EnvVars::GetEnvVar("SWEEP_SRC", "G");
  int sweepMin         = EnvVars::GetEnvVar("SWEEP_MIN", 1);
  int sweepMax         = EnvVars::GetEnvVar("SWEEP_MAX", numGpuDevices);

  // Display environment variables
  ev.DisplayEnvVars();
  if (!ev.hideEnv) {
    if (!ev.outputToCsv) printf("[One-To-All Related]\n");
    ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices);
    ev.Print("NUM_GPU_SE", numSubExecs, "Using %d subExecutors/CUs per Transfer", numSubExecs);
    ev.Print("EXE_INDEX", exeIndex, "Executing on GPU %d", exeIndex);
    ev.Print("SWEEP_DIR", sweepDir, "Direction of transfer");
    ev.Print("SWEEP_DST", sweepDst.c_str(), "DST memory types to sweep");
    ev.Print("SWEEP_EXE", sweepExe.c_str(), "Executor type to use");
    ev.Print("SWEEP_MAX", sweepMax, "Maximum number of peers");
    ev.Print("SWEEP_MIN", sweepMin, "Minimum number of peers");
    ev.Print("SWEEP_SRC", sweepSrc.c_str(), "SRC memory types to sweep");
    printf("\n");
  }

  // Perform validation
  for (auto ch : sweepExe) {
    if (ch != 'G' && ch != 'D') {
      printf("[ERROR] Unrecognized executor type '%c' specified\n", ch);
      exit(1);
    }
  }

  // Peer subsets are enumerated as the bits of an int, so the device count must be a
  // valid shift amount that keeps (1 << numGpuDevices) representable
  int const maxMaskBits = 30;
  if (numGpuDevices < 1 || numGpuDevices > maxMaskBits) {
    printf("[ERROR] NUM_GPU_DEVICES must be between 1 and %d (%d specified)\n", maxMaskBits, numGpuDevices);
    exit(1);
  }

  // The executor must be one of the detected GPUs
  if (exeIndex < 0 || exeIndex >= numDetectedGpus) {
    printf("[ERROR] EXE_INDEX must be between 0 and %d (%d specified)\n", numDetectedGpus - 1, exeIndex);
    exit(1);
  }

  // Bit of the executor within a subset mask.  An executor outside of the swept range
  // can never appear in a mask, so it contributes no bit (and avoids an oversized shift)
  int const exeBit = (exeIndex < numGpuDevices) ? (1 << exeIndex) : 0;

  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
  TransferBench::TestResults results;

  char const sep = (ev.outputToCsv ? ',' : ' ');
  std::string const exeIndexStr = std::to_string(exeIndex);
  std::string const wildcard    = "*";

  for (char src : sweepSrc) for (char exe : sweepExe) for (char dst : sweepDst) {

    // Skip invalid configurations ('D' executor needs both SRC and DST, and at least one of SRC/DST must exist)
    if ((exe == 'D' && (src == 'N' || dst == 'N')) || (src == 'N' && dst == 'N')) continue;

    // The fixed side of the sweep is labelled with the executor index, the swept side with '*'
    std::string const srcLabel = (src == 'N') ? std::string() : (sweepDir == 0 ? exeIndexStr : wildcard);
    std::string const dstLabel = (dst == 'N') ? std::string() : (sweepDir == 0 ? wildcard : exeIndexStr);
    printf("Executing (%c%s -> %c%d -> %c%s)\n",
           src, srcLabel.c_str(),
           exe, exeIndex,
           dst, dstLabel.c_str());

    // Column headers (one per peer device)
    for (int i = 0; i < numGpuDevices; i++) {
      if (i == exeIndex) continue;
      printf(" GPU %-3d %c", i, sep);
    }
    printf("\n");
    if (!ev.outputToCsv) {
      for (int i = 0; i < numGpuDevices-1; i++)
        printf("-------------");
      printf("\n");
    }

    // Sweep over subset sizes, then over every subset of that size that excludes the executor
    for (int p = sweepMin; p <= sweepMax; p++) {
      for (int bitmask = 0; bitmask < (1<<numGpuDevices); bitmask++) {
        if ((bitmask & exeBit) || __builtin_popcount(bitmask) != p) continue;

        // Build one Transfer per peer present in the subset
        std::vector<Transfer> transfers;
        for (int i = 0; i < numGpuDevices; i++) {
          if (!(bitmask & (1<<i))) continue;

          Transfer t;
          CheckForError(TransferBench::CharToExeType(exe, t.exeDevice.exeType));
          t.exeDevice.exeIndex = exeIndex;
          t.exeSubIndex        = -1;
          t.numSubExecs        = numSubExecs;
          t.numBytes           = numBytesPerTransfer;

          if (src == 'N') {
            t.srcs.clear();
          } else {
            t.srcs.resize(1);
            CheckForError(TransferBench::CharToMemType(src, t.srcs[0].memType));
            t.srcs[0].memIndex = sweepDir == 0 ? exeIndex : i;
          }

          if (dst == 'N') {
            t.dsts.clear();
          } else {
            t.dsts.resize(1);
            CheckForError(TransferBench::CharToMemType(dst, t.dsts[0].memType));
            t.dsts[0].memIndex = sweepDir == 0 ? i : exeIndex;
          }
          transfers.push_back(t);
        }

        if (!TransferBench::RunTransfers(cfg, transfers, results)) {
          PrintErrors(results.errResults);
          exit(1);
        }

        // Results are stored in the order Transfers were added, i.e. ascending peer index
        int counter = 0;
        for (int i = 0; i < numGpuDevices; i++) {
          if (bitmask & (1<<i))
            printf(" %8.3f %c", results.tfrResults[counter++].avgBandwidthGbPerSec, sep);
          else if (i != exeIndex)
            printf(" %c", sep);
        }

        // Trailing columns: subset size, subExecutors, and a description of each Transfer
        printf(" %d %d", p, numSubExecs);
        for (auto const& t : transfers) {
          printf(" (%s %c%d %s)",
                 MemDevicesToStr(t.srcs).c_str(),
                 ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
                 MemDevicesToStr(t.dsts).c_str());
        }
        printf("\n");
      }
    }
  }
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment