Commit d8ca0a9e authored by jerrrrry's avatar jerrrrry
Browse files

Initial commit

parents
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "hip/hip_runtime.h"
#include "common.h"
/* Per-rank element accounting for Gather: every rank contributes count/nranks
 * elements and the root collects that contribution from each rank. */
void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  const size_t perRank = count / nranks;
  *sendcount = perRank;              // what each rank ships
  *recvcount = perRank * nranks;     // root receives one chunk per rank
  *sendInplaceOffset = perRank;      // in place, rank r sends from its own slot
  *recvInplaceOffset = 0;
  *paramcount = perRank;             // count handed to the collective call
}
// Initializes send data and the expected result for the Gather test.
// Rank r's payload is InitData at element offset r*sendcount; the root's
// expected buffer is therefore the rank-ordered concatenation of all payloads.
// Non-root ranks expect their recv buffer untouched (zeroed, captured via the
// hipMemcpy below).
testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
  size_t sendcount = args->sendBytes / wordSize(type);
  size_t recvcount = args->expectedBytes / wordSize(type);
  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
  int k=0;
  for (int i=0; i<args->nGpus; i++) {
    HIPCHECK(hipSetDevice(args->gpus[i]));
    for (int l=0; l<args->nRanks; l++) {
      // Global rank of local slot (proc, thread, gpu i, local rank l).
      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
      // In place, rank r sends from its own slot inside the recv buffer.
      void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
      TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
      // Snapshot the (zeroed / in-place-seeded) recv buffer as the baseline.
      HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
      if (rank == root) {
        // Rank j's chunk sits at element offset j*sendcount, so the whole
        // expected buffer is one contiguous InitData over nranks*sendcount
        // elements starting at offset 0. The previous code looped j=0..nranks-1
        // re-running this identical full-buffer call every iteration.
        TESTCHECK(InitData(((char*)args->expected[k]), nranks*sendcount, 0, type, ncclSum, rep, 1, 0));
      }
      k++;
    }
    HIPCHECK(hipDeviceSynchronize());
  }
  return testSuccess;
}
// Bandwidth model for Gather. algBw counts all bytes landing at the root;
// busBw discounts the root's own chunk with the usual (n-1)/n factor.
void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  const double base = (double)(count * nranks * typesize) / 1.0E9 / sec;
  *algBw = base;
  const double scale = ((double)(nranks-1))/((double)(nranks));
  *busBw = base * scale;
}
// Launches one Gather. RCCL has no native gather primitive, so it is built
// from point-to-point calls inside one group: every rank sends its chunk to
// root; root posts a matching receive from each rank into contiguous slots.
testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  int nRanks;
  NCCLCHECK(ncclCommCount(comm, &nRanks));
  int rank;
  NCCLCHECK(ncclCommUserRank(comm, &rank));
  // Byte stride of one rank's chunk in recvbuff.
  size_t rankOffset = count * wordSize(type);
  // Nothing to exchange for a zero-element chunk.
  if (count == 0) return testSuccess;
  NCCLCHECK(ncclGroupStart());
  NCCLCHECK(ncclSend(sendbuff, count, type, root, comm, stream));
  if (rank == root) {
    for (int r=0; r<nRanks; r++) {
      // Rank r's contribution lands at byte offset r*rankOffset.
      NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, count, type, r, comm, stream));
    }
  }
  NCCLCHECK(ncclGroupEnd());
  return testSuccess;
}
// vtable of Gather hooks consumed by the common test driver.
struct testColl gatherTest = {
  "Gather",
  GatherGetCollByteCount,
  GatherInitData,
  GatherGetBw,
  GatherRunColl
};

// Driver hook: only the send/recv counts are needed; scratch outputs discarded.
void GatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
  GatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}

// Runs the Gather test for the requested (or all) data types and roots.
// op/opName are unused: gather performs no reduction.
testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &gatherTest;
  ncclDataType_t *run_types;
  const char **run_typenames;
  int type_count;
  int begin_root, end_root;
  // type == -1 means "sweep every registered type".
  if ((int)type != -1) {
    type_count = 1;
    run_types = &type;
    run_typenames = &typeName;
  } else {
    type_count = test_typenum;
    run_types = test_types;
    run_typenames = test_typenames;
  }
  if (root != -1) {
    begin_root = end_root = root;
  } else {
    // NOTE(review): sweep bound omits args->nRanks, unlike the global-rank
    // formula in GatherInitData — confirm intended for multi-rank-per-GPU runs.
    begin_root = 0;
    end_root = args->nProcs*args->nThreads*args->nGpus-1;
  }
  for (int i=0; i<type_count; i++) {
    for (int j=begin_root; j<=end_root; j++) {
      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
    }
  }
  return testSuccess;
}

// Exported engine symbol the test harness links against.
struct testEngine ncclTestEngine = {
  GatherGetBuffSize,
  GatherRunTest
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "hip/hip_runtime.h"
#include "common.h"
#define ALIGN 4
/* AllGather-style accounting for the hypercube test: per-rank chunk is the
 * largest ALIGN-multiple fitting count/nranks; the result holds all chunks. */
void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  const size_t chunk = (count/(ALIGN*nranks))*ALIGN;  // aligned per-rank count
  *sendcount = chunk;
  *recvcount = chunk*nranks;          // everyone ends up with all chunks
  *sendInplaceOffset = chunk;         // in place, rank r sends from slot r
  *recvInplaceOffset = 0;
  *paramcount = chunk;
}
// Initializes send data and the expected AllGather result for the hypercube
// test. Rank r's payload is seeded with 33*rep + r, so the expected buffer on
// every rank is each rank's seeded chunk laid out in rank order.
testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
  size_t sendcount = args->sendBytes / wordSize(type);
  size_t recvcount = args->expectedBytes / wordSize(type);  // computed but unused below
  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
  int k=0;
  for (int i=0; i<args->nGpus; i++) {
    HIPCHECK(hipSetDevice(args->gpus[i]));
    for (int l=0; l<args->nRanks; l++) {
      // Global rank of local slot (proc, thread, gpu i, local rank l).
      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
      // In place, rank r sends from its own slot inside the recv buffer.
      void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
      TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
      // Expected: chunk j carries rank j's seed, at byte offset j*sendBytes.
      for (int j=0; j<nranks; j++) {
        TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
      }
      k++;
    }
    HIPCHECK(hipDeviceSynchronize());
  }
  return testSuccess;
}
// Bandwidth model for the hypercube AllGather: each rank receives (nranks-1)
// foreign chunks; bus bandwidth equals algorithm bandwidth (factor 1).
void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  const double bw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec;
  *algBw = bw;
  *busBw = bw;
}
// AllGather over a hypercube exchange in log2(nRanks) steps. At step `mask`,
// each rank owns the contiguous block of `mask` chunks starting at
// rank & ~(mask-1); it swaps that block with partner rank^mask, which owns the
// adjacent block starting at s^mask. Requires nRanks to be a power of two
// (enforced by HyperCubeRunTest).
testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  char* sbuff = (char*)sendbuff;
  char* rbuff = (char*)recvbuff;
  int nRanks;
  NCCLCHECK(ncclCommCount(comm, &nRanks));
  int rank;
  NCCLCHECK(ncclCommUserRank(comm, &rank));
  size_t rankSize = count * wordSize(type);
  // Seed own slot in recvbuff (skip when already in place).
  if (rbuff+rank*rankSize != sbuff) HIPCHECK(hipMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, hipMemcpyDeviceToDevice, stream));
  // Hypercube AllGather
  for (int mask=1; mask<nRanks; mask<<=1) {
    NCCLCHECK(ncclGroupStart());
    int s = rank & ~(mask-1);  // start of the block this rank currently owns
    int r = s ^ mask;          // start of the block the partner owns
    NCCLCHECK(ncclSend(rbuff+s*rankSize, count*mask, type, rank^mask, comm, stream));
    NCCLCHECK(ncclRecv(rbuff+r*rankSize, count*mask, type, rank^mask, comm, stream));
    NCCLCHECK(ncclGroupEnd());
  }
  return testSuccess;
}
// vtable of HyperCube hooks consumed by the common test driver.
struct testColl hyperCubeTest = {
  "HyperCube",
  HyperCubeGetCollByteCount,
  HyperCubeInitData,
  HyperCubeGetBw,
  HyperCubeRunColl
};

// Driver hook: only the send/recv counts are needed; scratch outputs discarded.
void HyperCubeGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
  HyperCubeGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}

// Runs the hypercube test over the requested (or all) data types. The
// exchange pattern only works when the rank count is a power of two;
// otherwise the test is skipped with a message.
testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &hyperCubeTest;
  ncclDataType_t *run_types;
  const char **run_typenames;
  int type_count;
  // type == -1 means "sweep every registered type".
  if ((int)type != -1) {
    type_count = 1;
    run_types = &type;
    run_typenames = &typeName;
  } else {
    type_count = test_typenum;
    run_types = test_types;
    run_typenames = test_typenames;
  }
  // Check if this is a power of 2
  // NOTE(review): rank count omits args->nRanks, unlike HyperCubeInitData —
  // confirm intended for multi-rank-per-GPU runs.
  int nRanks = args->nProcs*args->nThreads*args->nGpus;
  if (nRanks && !(nRanks & (nRanks - 1))) {
    for (int i=0; i<type_count; i++) {
      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
    }
  } else {
    printf("nRanks %d is not a power of 2, skipping\n", nRanks);
  }
  return testSuccess;
}

// Exported engine symbol the test harness links against.
struct testEngine ncclTestEngine = {
  HyperCubeGetBuffSize,
  HyperCubeRunTest
};
/*************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
/* Compatibility shims for building against NCCL 1.x (detected by the absence
 * of NCCL_MAJOR). Each wrapper takes the modern size_t count and forwards to
 * the v1 int-count entry point; the inner call resolves to the library's
 * overload by parameter type, not recursively to the wrapper. */
#ifndef NCCL1_COMPAT_H
#define NCCL1_COMPAT_H

#ifndef NCCL_MAJOR // NCCL 1.x
#define NCCL_MAJOR 1
#define NCCL_MINOR 0

// v1 names for the op/type counts.
#define ncclNumOps nccl_NUM_OPS
#define ncclNumTypes nccl_NUM_TYPES

// Grouped-call API did not exist in NCCL 1.x; make the calls no-ops.
static ncclResult_t ncclGroupStart() { return ncclSuccess; }
static ncclResult_t ncclGroupEnd() { return ncclSuccess; }

// v1 takes int counts; reject values that do not fit.
// NOTE(review): the unbraced `if` makes this macro a dangling-else hazard at
// call sites — confirm no caller uses it inside an if/else.
#define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument;

static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
    ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  CHECKCOUNT(count);
  return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream);
}
static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream) {
  CHECKCOUNT(count);
  return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream);
}
static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, hipStream_t stream) {
  CHECKCOUNT(count);
  return ncclBcast(buff, (int)count, datatype, root, comm, stream);
}
static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
    hipStream_t stream) {
  CHECKCOUNT(recvcount);
  return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream);
}
// Note the v1 AllGather argument order: recvbuff comes after datatype.
static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
  CHECKCOUNT(sendcount);
  return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream);
}
#endif

#endif
/**
* MIT License
*
* Copyright 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/*!\file
* \brief rccl_bfloat16.h provides struct for rccl_bfloat16 typedef
*/
#ifndef _RCCL_BFLOAT16_H_
#define _RCCL_BFLOAT16_H_
#if __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
// If this is a C compiler, C++ compiler below C++11, or a host-only compiler, we only
// include a minimal definition of rccl_bfloat16
#include <stdint.h>
/*! \brief Struct to represent a 16 bit brain floating point number. */
typedef struct
{
    uint16_t data; // raw bfloat16 bits: 1 sign, 8 exponent, 7 mantissa
} rccl_bfloat16;
#else // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <hip/hip_runtime.h>
#include <ostream>
#include <type_traits>
// bfloat16 value type: the upper 16 bits of an IEEE-754 binary32
// (1 sign, 8 exponent, 7 mantissa bits). All arithmetic goes through float.
struct rccl_bfloat16
{
    uint16_t data;

    // Tag type selecting the truncating (non-rounding) constructor below.
    enum truncate_t
    {
        truncate
    };

    __host__ __device__ rccl_bfloat16() = default;

    // round upper 16 bits of IEEE float to convert to bfloat16
    explicit __host__ __device__ rccl_bfloat16(float f)
        : data(float_to_bfloat16(f))
    {
    }

    explicit __host__ __device__ rccl_bfloat16(float f, truncate_t)
        : data(truncate_float_to_bfloat16(f))
    {
    }

    // zero extend lower 16 bits of bfloat16 to convert to IEEE float
    __host__ __device__ operator float() const
    {
        union
        {
            uint32_t int32;
            float fp32;
        } u = {uint32_t(data) << 16};
        return u.fp32;
    }

private:
    static __host__ __device__ uint16_t float_to_bfloat16(float f)
    {
        union
        {
            float fp32;
            uint32_t int32;
        } u = {f};
        if(~u.int32 & 0x7f800000)
        {
            // When the exponent bits are not all 1s, then the value is zero, normal,
            // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
            // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
            // This causes the bfloat16's mantissa to be incremented by 1 if the 16
            // least significant bits of the float mantissa are greater than 0x8000,
            // or if they are equal to 0x8000 and the least significant bit of the
            // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
            // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
            // has the value 0x7f, then incrementing it causes it to become 0x00 and
            // the exponent is incremented by one, which is the next higher FP value
            // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
            // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
            // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
            // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
            // incrementing it causes it to become an exponent of 0xFF and a mantissa
            // of 0x00, which is Inf, the next higher value to the unrounded value.
            u.int32 += 0x7fff + ((u.int32 >> 16) & 1); // Round to nearest, round to even
        }
        else if(u.int32 & 0xffff)
        {
            // When all of the exponent bits are 1, the value is Inf or NaN.
            // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
            // mantissa bit. Quiet NaN is indicated by the most significant mantissa
            // bit being 1. Signaling NaN is indicated by the most significant
            // mantissa bit being 0 but some other bit(s) being 1. If any of the
            // lower 16 bits of the mantissa are 1, we set the least significant bit
            // of the bfloat16 mantissa, in order to preserve signaling NaN in case
            // the bloat16's mantissa bits are all 0.
            u.int32 |= 0x10000; // Preserve signaling NaN
        }
        return uint16_t(u.int32 >> 16);
    }

    // Truncate instead of rounding, preserving SNaN
    static __host__ __device__ uint16_t truncate_float_to_bfloat16(float f)
    {
        union
        {
            float fp32;
            uint32_t int32;
        } u = {f};
        // Keep the top 16 bits; OR in 1 when all exponent bits are set and the
        // dropped mantissa bits were nonzero, so a NaN cannot truncate to Inf.
        return uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
    }
};
// C-visible mirror of rccl_bfloat16; the static_asserts below pin the two
// layouts together so the types remain interchangeable across the C boundary.
typedef struct
{
    uint16_t data;
} rccl_bfloat16_public;

static_assert(std::is_standard_layout<rccl_bfloat16>{},
              "rccl_bfloat16 is not a standard layout type, and thus is "
              "incompatible with C.");

static_assert(std::is_trivial<rccl_bfloat16>{},
              "rccl_bfloat16 is not a trivial type, and thus is "
              "incompatible with C.");

static_assert(sizeof(rccl_bfloat16) == sizeof(rccl_bfloat16_public)
                  && offsetof(rccl_bfloat16, data) == offsetof(rccl_bfloat16_public, data),
              "internal rccl_bfloat16 does not match public rccl_bfloat16");

// Stream output prints the value converted to float.
inline std::ostream& operator<<(std::ostream& os, const rccl_bfloat16& bf16)
{
    return os << float(bf16);
}
// ---- Unary operators --------------------------------------------------------
inline __host__ __device__ rccl_bfloat16 operator+(rccl_bfloat16 a)
{
    return a;
}
// Negation just flips the sign bit.
inline __host__ __device__ rccl_bfloat16 operator-(rccl_bfloat16 a)
{
    a.data ^= 0x8000;
    return a;
}

// ---- Binary arithmetic: computed in float, rounded back to bfloat16 ---------
inline __host__ __device__ rccl_bfloat16 operator+(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return rccl_bfloat16(float(a) + float(b));
}
inline __host__ __device__ rccl_bfloat16 operator-(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return rccl_bfloat16(float(a) - float(b));
}
inline __host__ __device__ rccl_bfloat16 operator*(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return rccl_bfloat16(float(a) * float(b));
}
inline __host__ __device__ rccl_bfloat16 operator/(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return rccl_bfloat16(float(a) / float(b));
}

// ---- Comparisons: delegate to float; derived forms reuse < and == ----------
inline __host__ __device__ bool operator<(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return float(a) < float(b);
}
inline __host__ __device__ bool operator==(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return float(a) == float(b);
}
inline __host__ __device__ bool operator>(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return b < a;
}
inline __host__ __device__ bool operator<=(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return !(a > b);
}
inline __host__ __device__ bool operator!=(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return !(a == b);
}
inline __host__ __device__ bool operator>=(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return !(a < b);
}

// ---- Compound assignment ----------------------------------------------------
inline __host__ __device__ rccl_bfloat16& operator+=(rccl_bfloat16& a, rccl_bfloat16 b)
{
    return a = a + b;
}
inline __host__ __device__ rccl_bfloat16& operator-=(rccl_bfloat16& a, rccl_bfloat16 b)
{
    return a = a - b;
}
inline __host__ __device__ rccl_bfloat16& operator*=(rccl_bfloat16& a, rccl_bfloat16 b)
{
    return a = a * b;
}
inline __host__ __device__ rccl_bfloat16& operator/=(rccl_bfloat16& a, rccl_bfloat16 b)
{
    return a = a / b;
}

// ---- Increment / decrement (by 1.0) -----------------------------------------
inline __host__ __device__ rccl_bfloat16& operator++(rccl_bfloat16& a)
{
    return a += rccl_bfloat16(1.0f);
}
inline __host__ __device__ rccl_bfloat16& operator--(rccl_bfloat16& a)
{
    return a -= rccl_bfloat16(1.0f);
}
inline __host__ __device__ rccl_bfloat16 operator++(rccl_bfloat16& a, int)
{
    rccl_bfloat16 orig = a;
    ++a;
    return orig;
}
inline __host__ __device__ rccl_bfloat16 operator--(rccl_bfloat16& a, int)
{
    rccl_bfloat16 orig = a;
    --a;
    return orig;
}
// Classification and math overloads for rccl_bfloat16.
// NOTE(review): adding new overloads to namespace std is not sanctioned by the
// C++ standard — confirm this is acceptable on all supported toolchains.
namespace std
{
    // Bit layout: exponent = bits 14..7 (0x7f80), mantissa = bits 6..0 (0x7f).
    constexpr __host__ __device__ bool isinf(rccl_bfloat16 a)
    {
        // All-ones exponent with zero mantissa.
        return !(~a.data & 0x7f80) && !(a.data & 0x7f);
    }
    constexpr __host__ __device__ bool isnan(rccl_bfloat16 a)
    {
        // All-ones exponent with nonzero mantissa.
        return !(~a.data & 0x7f80) && +(a.data & 0x7f);
    }
    constexpr __host__ __device__ bool iszero(rccl_bfloat16 a)
    {
        // +0 or -0: everything except the sign bit clear.
        return !(a.data & 0x7fff);
    }
    // Trig helpers: computed in float, rounded back to bfloat16.
    inline rccl_bfloat16 sin(rccl_bfloat16 a)
    {
        return rccl_bfloat16(sinf(float(a)));
    }
    inline rccl_bfloat16 cos(rccl_bfloat16 a)
    {
        return rccl_bfloat16(cosf(float(a)));
    }
}

#endif // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
#endif // _RCCL_BFLOAT16_H_
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
/* Element accounting for Reduce: every rank sends the full count and the root
 * receives the same count; in-place operation uses identical buffers. */
void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  *sendcount = count;
  *recvcount = count;
  *paramcount = count;          // same as the send count
  *sendInplaceOffset = 0;       // in place: send buffer == recv buffer
  *recvInplaceOffset = 0;
}
// Initializes per-rank send data and the expected Reduce result. Each rank's
// contribution is seeded by (rep, rank); only the root's expected buffer holds
// the reduced value — all other ranks expect an untouched (zeroed) recv buffer.
testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
  size_t sendcount = args->sendBytes / wordSize(type);
  size_t recvcount = args->expectedBytes / wordSize(type);
  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
  int k=0;
  for (int i=0; i<args->nGpus; i++) {
    HIPCHECK(hipSetDevice(args->gpus[i]));
    for (int l=0; l<args->nRanks; l++) {
      // Global rank of local slot (proc, thread, gpu i, local rank l).
      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
      void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
      TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
      // Snapshot the recv buffer as the baseline expectation for non-roots.
      HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
      // Root expects the reduction of all nranks contributions.
      if (rank == root) TESTCHECK(InitDataReduce(args->expected[k], recvcount, 0, type, op, rep, nranks));
      k++;
    }
    HIPCHECK(hipDeviceSynchronize());
  }
  return testSuccess;
}
// Bandwidth model for Reduce: bus bandwidth equals algorithm bandwidth.
void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  const double bw = (double)(count * typesize) / 1.0E9 / sec;
  *algBw = bw;
  *busBw = bw;
}
// Launches a single ncclReduce: elementwise op across all ranks, result on root.
testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream));
  return testSuccess;
}
// vtable of Reduce hooks consumed by the common test driver.
struct testColl reduceTest = {
  "Reduce",
  ReduceGetCollByteCount,
  ReduceInitData,
  ReduceGetBw,
  ReduceRunColl
};

// Driver hook: only the send/recv counts are needed; scratch outputs discarded.
void ReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
  ReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}

// Runs the Reduce test over the requested (or all) types, ops and roots.
// A value of -1 for type/op/root means "sweep all".
testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &reduceTest;
  ncclDataType_t *run_types;
  ncclRedOp_t *run_ops;
  const char **run_typenames, **run_opnames;
  int type_count, op_count;
  int begin_root, end_root;
  if ((int)type != -1) {
    type_count = 1;
    run_types = &type;
    run_typenames = &typeName;
  } else {
    type_count = test_typenum;
    run_types = test_types;
    run_typenames = test_typenames;
  }
  if ((int)op != -1) {
    op_count = 1;
    run_ops = &op;
    run_opnames = &opName;
  } else {
    op_count = test_opnum;
    run_ops = test_ops;
    run_opnames = test_opnames;
  }
  if (root != -1) {
    begin_root = end_root = root;
  } else {
    // NOTE(review): sweep bound omits args->nRanks, unlike the global-rank
    // formula in ReduceInitData — confirm intended for multi-rank-per-GPU runs.
    begin_root = 0;
    end_root = args->nProcs*args->nThreads*args->nGpus-1;
  }
  for (int i=0; i<type_count; i++) {
    for (int j=0; j<op_count; j++) {
      for (int k=begin_root; k<=end_root; k++) {
        TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], k));
      }
    }
  }
  return testSuccess;
}

// Exported engine symbol the test harness links against.
struct testEngine ncclTestEngine = {
  ReduceGetBuffSize,
  ReduceRunTest
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
#define ALIGN 4
/* Element accounting for ReduceScatter: per-rank output is the largest
 * ALIGN-multiple fitting count/nranks; each rank sends nranks such chunks. */
void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  const size_t chunk = (count/(ALIGN*nranks))*ALIGN;  // aligned per-rank count
  *sendcount = chunk*nranks;     // full input on every rank
  *recvcount = chunk;            // each rank keeps one reduced chunk
  *sendInplaceOffset = 0;
  *recvInplaceOffset = chunk;    // in place, rank r's result is at slot r
  *paramcount = chunk;
}
// Initializes send data and the expected ReduceScatter result. Each rank's
// input is seeded by (rep, rank); rank r expects the reduction of all inputs
// restricted to chunk r (element offset rank*recvcount).
testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
  size_t sendcount = args->sendBytes / wordSize(type);
  size_t recvcount = args->expectedBytes / wordSize(type);
  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
  int k=0;
  for (int i=0; i<args->nGpus; i++) {
    HIPCHECK(hipSetDevice(args->gpus[i]));
    for (int l=0; l<args->nRanks; l++) {
      // Global rank of local slot (proc, thread, gpu i, local rank l).
      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
      void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
      TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
      HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
      // Expected output: reduced values for this rank's chunk of the input.
      TESTCHECK(InitDataReduce(args->expected[k], recvcount, rank*recvcount, type, op, rep, nranks));
      k++;
    }
    HIPCHECK(hipDeviceSynchronize());
  }
  return testSuccess;
}
// Bandwidth model for ReduceScatter. algBw counts the full input on each
// rank; busBw applies the standard (n-1)/n link-traffic correction.
void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  const double base = (double)(count * typesize * nranks) / 1.0E9 / sec;
  *algBw = base;
  const double scale = ((double)(nranks - 1))/((double)nranks);
  *busBw = base * scale;
}
// Launches a single ncclReduceScatter; count is the per-rank receive count.
testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream));
  return testSuccess;
}
// vtable of ReduceScatter hooks consumed by the common test driver.
struct testColl reduceScatterTest = {
  "ReduceScatter",
  ReduceScatterGetCollByteCount,
  ReduceScatterInitData,
  ReduceScatterGetBw,
  ReduceScatterRunColl
};

// Driver hook: only the send/recv counts are needed; scratch outputs discarded.
void ReduceScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
  ReduceScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}

// Runs the ReduceScatter test over the requested (or all) types and ops.
// ReduceScatter has no root, so no root sweep is needed.
testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &reduceScatterTest;
  ncclDataType_t *run_types;
  ncclRedOp_t *run_ops;
  const char **run_typenames, **run_opnames;
  int type_count, op_count;
  // -1 means "sweep all"; otherwise run exactly the requested value.
  if ((int)type != -1) {
    type_count = 1;
    run_types = &type;
    run_typenames = &typeName;
  } else {
    type_count = test_typenum;
    run_types = test_types;
    run_typenames = test_typenames;
  }
  if ((int)op != -1) {
    run_ops = &op;
    run_opnames = &opName;
    op_count = 1;
  } else {
    op_count = test_opnum;
    run_ops = test_ops;
    run_opnames = test_opnames;
  }
  for (int i=0; i<type_count; i++) {
    for (int j=0; j<op_count; j++) {
      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
    }
  }
  return testSuccess;
}

// Exported engine symbol the test harness links against.
struct testEngine ncclTestEngine = {
  ReduceScatterGetBuffSize,
  ReduceScatterRunTest
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
/* Element accounting for Scatter: the root sends one count/nranks chunk to
 * each rank; every rank receives exactly one chunk. */
void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  const size_t perRank = count / nranks;
  *sendcount = perRank * nranks;     // root holds one chunk per rank
  *recvcount = perRank;
  *sendInplaceOffset = 0;
  *recvInplaceOffset = perRank;      // in place, rank r receives into slot r
  *paramcount = perRank;             // count handed to the collective call
}
// Initializes send data and the expected Scatter result. Only the root seeds
// a send buffer (the full rank-ordered payload); rank r expects the chunk at
// element offset r*recvcount of that payload.
testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
  size_t sendcount = args->sendBytes / wordSize(type);
  size_t recvcount = args->expectedBytes / wordSize(type);
  int k=0;
  for (int i=0; i<args->nGpus; i++) {
    HIPCHECK(hipSetDevice(args->gpus[i]));
    for (int l=0; l<args->nRanks; l++) {
      // Global rank of local slot (proc, thread, gpu i, local rank l).
      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
      void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
      // Only the root's send data matters for this collective.
      if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
      TESTCHECK(InitData(args->expected[k], recvcount, rank*recvcount, type, ncclSum, rep, 1, 0));
      k++;
    }
    HIPCHECK(hipDeviceSynchronize());
  }
  return testSuccess;
}
// Bandwidth model for Scatter. algBw counts all bytes leaving the root;
// busBw discounts the root's own chunk with the usual (n-1)/n factor.
void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  const double base = (double)(count * nranks * typesize) / 1.0E9 / sec;
  *algBw = base;
  const double scale = ((double)(nranks-1))/((double)(nranks));
  *busBw = base * scale;
}
// Launches one Scatter built from point-to-point calls in a single group:
// the root sends chunk r (at byte offset r*rankOffset) to rank r, and every
// rank posts one receive from the root.
testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  int nRanks;
  NCCLCHECK(ncclCommCount(comm, &nRanks));
  int rank;
  NCCLCHECK(ncclCommUserRank(comm, &rank));
  // Byte stride of one rank's chunk in sendbuff.
  size_t rankOffset = count * wordSize(type);
  // Nothing to exchange for a zero-element chunk.
  if (count == 0) return testSuccess;
  NCCLCHECK(ncclGroupStart());
  if (rank == root) {
    for (int r=0; r<nRanks; r++) {
      NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, type, r, comm, stream));
    }
  }
  NCCLCHECK(ncclRecv(recvbuff, count, type, root, comm, stream));
  NCCLCHECK(ncclGroupEnd());
  return testSuccess;
}
// vtable of Scatter hooks consumed by the common test driver.
struct testColl scatterTest = {
  "Scatter",
  ScatterGetCollByteCount,
  ScatterInitData,
  ScatterGetBw,
  ScatterRunColl
};

// Driver hook: only the send/recv counts are needed; scratch outputs discarded.
void ScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
  ScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}

// Runs the Scatter test for the requested (or all) data types and roots.
// op/opName are unused: scatter performs no reduction.
testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &scatterTest;
  ncclDataType_t *run_types;
  const char **run_typenames;
  int type_count;
  int begin_root, end_root;
  // type == -1 means "sweep every registered type".
  if ((int)type != -1) {
    type_count = 1;
    run_types = &type;
    run_typenames = &typeName;
  } else {
    type_count = test_typenum;
    run_types = test_types;
    run_typenames = test_typenames;
  }
  if (root != -1) {
    begin_root = end_root = root;
  } else {
    // NOTE(review): sweep bound omits args->nRanks, unlike the global-rank
    // formula in ScatterInitData — confirm intended for multi-rank-per-GPU runs.
    begin_root = 0;
    end_root = args->nProcs*args->nThreads*args->nGpus-1;
  }
  for (int i=0; i<type_count; i++) {
    for (int j=begin_root; j<=end_root; j++) {
      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
    }
  }
  return testSuccess;
}

// Exported engine symbol the test harness links against.
struct testEngine ncclTestEngine = {
  ScatterGetBuffSize,
  ScatterRunTest
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
/* Element accounting for SendRecv: each rank both sends and receives the full
 * count; no in-place offsets apply. */
void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  *sendcount = count;
  *recvcount = count;
  *paramcount = count;      // same as the send count
  *sendInplaceOffset = 0;
  *recvInplaceOffset = 0;
}
// Initializes send data and the expected SendRecv result for the ring
// exchange: rank r sends data seeded at offset r and expects the payload of
// its predecessor (r-1+nranks)%nranks, matching SendRecvRunColl's peers.
testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
  size_t sendcount = args->sendBytes / wordSize(type);
  size_t recvcount = args->expectedBytes / wordSize(type);
  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
  int k=0;
  for (int i=0; i<args->nGpus; i++) {
    HIPCHECK(hipSetDevice(args->gpus[i]));
    for (int l=0; l<args->nRanks; l++) {
      // Global rank of local slot (proc, thread, gpu i, local rank l).
      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
      void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
      TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
      // Expect the ring predecessor's payload.
      int peer = (rank-1+nranks)%nranks;
      TESTCHECK(InitData(args->expected[k], recvcount, peer*recvcount, type, ncclSum, rep, 1, 0));
      k++;
    }
    HIPCHECK(hipDeviceSynchronize());
  }
  // We don't support in-place sendrecv
  args->reportErrors = in_place ? 0 : 1;
  return testSuccess;
}
// Bandwidth model for SendRecv: each rank moves count*typesize bytes;
// bus bandwidth equals algorithm bandwidth (factor 1).
void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  const double bw = (double)(count * typesize) / 1.0E9 / sec;
  *algBw = bw;
  *busBw = bw;
}
// Ring exchange: each rank sends count elements to (rank+1)%n and receives
// from (rank-1+n)%n, with both operations posted inside one group.
testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  int nRanks;
  NCCLCHECK(ncclCommCount(comm, &nRanks));
  int rank;
  NCCLCHECK(ncclCommUserRank(comm, &rank));
  int recvPeer = (rank-1+nRanks) % nRanks;  // ring predecessor
  int sendPeer = (rank+1) % nRanks;         // ring successor
  NCCLCHECK(ncclGroupStart());
  NCCLCHECK(ncclSend(sendbuff, count, type, sendPeer, comm, stream));
  NCCLCHECK(ncclRecv(recvbuff, count, type, recvPeer, comm, stream));
  NCCLCHECK(ncclGroupEnd());
  return testSuccess;
}
struct testColl sendRecvTest = {
"SendRecv",
SendRecvGetCollByteCount,
SendRecvInitData,
SendRecvGetBw,
SendRecvRunColl
};
// Report the send/recv element counts needed for a run of `count` elements
// across `nranks` ranks. The remaining outputs of SendRecvGetCollByteCount
// are computed but discarded here.
void SendRecvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t unusedParamCount, unusedSendOffset, unusedRecvOffset;
  SendRecvGetCollByteCount(sendcount, recvcount, &unusedParamCount, &unusedSendOffset, &unusedRecvOffset, count, nranks);
}
// Drive the SendRecv test across the requested datatype/op combinations.
// A type or op of -1 means "sweep everything the harness knows about";
// otherwise only the single requested value is run. `root` is unused here
// (TimeTest is always invoked with root = -1 for sendrecv).
testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &sendRecvTest;

  ncclDataType_t* typeList;
  const char** typeNameList;
  int numTypes;
  if ((int)type == -1) {
    numTypes = test_typenum;
    typeList = test_types;
    typeNameList = test_typenames;
  } else {
    numTypes = 1;
    typeList = &type;
    typeNameList = &typeName;
  }

  ncclRedOp_t* opList;
  const char** opNameList;
  int numOps;
  if ((int)op == -1) {
    numOps = test_opnum;
    opList = test_ops;
    opNameList = test_opnames;
  } else {
    numOps = 1;
    opList = &op;
    opNameList = &opName;
  }

  for (int t = 0; t < numTypes; t++) {
    for (int o = 0; o < numOps; o++) {
      TESTCHECK(TimeTest(args, typeList[t], typeNameList[t], opList[o], opNameList[o], -1));
    }
  }
  return testSuccess;
}
// Entry points the generic perf driver uses for this (SendRecv) test binary.
struct testEngine ncclTestEngine = {
  SendRecvGetBuffSize,
  SendRecvRunTest
};
#include "timer.h"
// Make sure to compile this translation unit with the host compiler and not
// nvcc, lest you hit an internal compiler error (ICE) with GCC 10.3.0
#include <chrono>
namespace {
// Monotonic timestamp in nanoseconds. steady_clock never goes backwards,
// so differences between two calls are always meaningful durations.
std::uint64_t now() {
  using clock = std::chrono::steady_clock;
  return std::chrono::duration_cast<std::chrono::nanoseconds>(clock::now().time_since_epoch()).count();
}
}
// Start the stopwatch: record the construction time as the reference point.
timer::timer() {
  t0 = now();
}
// Seconds elapsed since construction or the last reset().
double timer::elapsed() const {
  std::uint64_t t1 = now();
  return 1.e-9*(t1 - t0);  // ns -> s
}
double timer::reset() {
std::uint64_t t1 = now();
double ans = 1.e-9*(t1 - t0);
t0 = t1;
return ans;
}
#ifndef _408319ecdd5b47b28bf8f511c4fdf816
#define _408319ecdd5b47b28bf8f511c4fdf816
#include <cstdint>
// Can't include <chrono> because of bug with gcc 10.3.0
// Simple monotonic stopwatch. Starts timing at construction; the clock
// source lives in the .cpp so this header can stay <chrono>-free.
class timer {
  std::uint64_t t0;  // start mark, nanoseconds since an arbitrary epoch
public:
  timer();
  // Seconds since construction or the last reset().
  double elapsed() const;
  // Returns seconds since the last mark and restarts the stopwatch.
  double reset();
};
#endif
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
\ No newline at end of file
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
def pytest_addoption(parser):
    """Register the --hostfile option consumed by the MPI test variants."""
    parser.addoption("--hostfile", action="store", default="", help="specify MPI hostfile")
\ No newline at end of file
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
# Parameter grids swept by the tests below. Values are strings because they
# are passed straight onto the perf binary's command line.
nthreads = ["1"]
nprocs = ["2"]  # MPI ranks for the multi-process variant
ngpus_single = ["1","2","4"]  # GPUs per thread in single-process runs
ngpus_mpi = ["1","2"]  # GPUs per rank in MPI runs
byte_range = [("4", "128M")]  # (min, max) message size
op = ["sum", "prod", "min", "max"]
step_factor = ["2"]  # message-size multiplier between steps
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
memory_type = ["coarse","fine", "host"]
# The perf binary lives in the sibling build directory.
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/all_gather_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
                         itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_AllGatherSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
    """Run all_gather_perf in a single process and fail on a non-zero exit.

    Fix vs. original: subprocess.run() never raises CalledProcessError unless
    check=True, so the old try/except was dead code -- and its handler
    referenced `rccl_test`, which would be unbound had run() actually raised.
    Failure output is now printed from an explicit returncode check instead.
    """
    args = [executable,
            "-t", nthreads,
            "-g", ngpus_single,
            "-b", byte_range[0],
            "-e", byte_range[1],
            "-o", op,
            "-f", step_factor,
            "-d", datatype,
            "-y", memory_type]
    if memory_type == "fine":
        # Environment-variable prefix works because the command runs via shell=True.
        args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
    args_str = " ".join(args)
    rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
    if rccl_test.returncode != 0:
        print(rccl_test.stdout)
        pytest.fail("AllGather test error(s) detected.")
    assert rccl_test.returncode == 0
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
                         itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_AllGatherMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
    """Run all_gather_perf under mpirun (optionally with --hostfile); fail on non-zero exit.

    Fixes vs. original: the hostfile branch appended '-y', memory_type, but
    memory_type is not a parameter of this test -- it resolved to the
    module-level *list*, so " ".join(args) raised TypeError (and
    `memory_type == "fine"` compared a list to a string). The -y flag is now
    omitted in both branches, matching the non-hostfile path. The dead
    try/except (subprocess.run never raises CalledProcessError without
    check=True, and its handler used a possibly-unbound rccl_test) was
    replaced with an explicit returncode check.
    """
    mpi_hostfile = request.config.getoption('--hostfile')
    args = ["mpirun -np", nprocs]
    if mpi_hostfile:
        args += ["-host", mpi_hostfile]
    args += [executable,
             "-p 1",
             "-t", nthreads,
             "-g", ngpus_mpi,
             "-b", byte_range[0],
             "-e", byte_range[1],
             "-o", op,
             "-f", step_factor,
             "-d", datatype]
    args_str = " ".join(args)
    print(args_str)
    rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
    if rccl_test.returncode != 0:
        pytest.fail("AllGather test error(s) detected.")
    assert rccl_test.returncode == 0
\ No newline at end of file
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
# Parameter grids swept by the tests below. Values are strings because they
# are passed straight onto the perf binary's command line.
nthreads = ["1"]
nprocs = ["2"]  # MPI ranks for the multi-process variant
ngpus_single = ["1","2","4"]  # GPUs per thread in single-process runs
ngpus_mpi = ["1","2"]  # GPUs per rank in MPI runs
byte_range = [("4", "128M")]  # (min, max) message size
op = ["sum", "prod", "min", "max"]
step_factor = ["2"]  # message-size multiplier between steps
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
memory_type = ["coarse","fine", "host"]
# The perf binary lives in the sibling build directory.
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/all_reduce_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
                         itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_AllReduceSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
    """Run all_reduce_perf in a single process and fail on a non-zero exit.

    Fix vs. original: subprocess.run() never raises CalledProcessError unless
    check=True, so the old try/except was dead code -- and its handler
    referenced `rccl_test`, which would be unbound had run() actually raised.
    Failure output is now printed from an explicit returncode check instead.
    """
    args = [executable,
            "-t", nthreads,
            "-g", ngpus_single,
            "-b", byte_range[0],
            "-e", byte_range[1],
            "-o", op,
            "-f", step_factor,
            "-d", datatype,
            "-y", memory_type]
    if memory_type == "fine":
        # Environment-variable prefix works because the command runs via shell=True.
        args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
    args_str = " ".join(args)
    rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
    if rccl_test.returncode != 0:
        print(rccl_test.stdout)
        pytest.fail("AllReduce test error(s) detected.")
    assert rccl_test.returncode == 0
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
                         itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_AllReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
    """Run all_reduce_perf under mpirun (optionally with --hostfile); fail on non-zero exit.

    Fixes vs. original: the hostfile branch appended '-y', memory_type, but
    memory_type is not a parameter of this test -- it resolved to the
    module-level *list*, so " ".join(args) raised TypeError (and
    `memory_type == "fine"` compared a list to a string). The -y flag is now
    omitted in both branches, matching the non-hostfile path. The dead
    try/except (subprocess.run never raises CalledProcessError without
    check=True, and its handler used a possibly-unbound rccl_test) was
    replaced with an explicit returncode check.
    """
    mpi_hostfile = request.config.getoption('--hostfile')
    args = ["mpirun -np", nprocs]
    if mpi_hostfile:
        args += ["-host", mpi_hostfile]
    args += [executable,
             "-p 1",
             "-t", nthreads,
             "-g", ngpus_mpi,
             "-b", byte_range[0],
             "-e", byte_range[1],
             "-o", op,
             "-f", step_factor,
             "-d", datatype]
    args_str = " ".join(args)
    print(args_str)
    rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
    if rccl_test.returncode != 0:
        pytest.fail("AllReduce test error(s) detected.")
    assert rccl_test.returncode == 0
\ No newline at end of file
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
# Parameter grids swept by the tests below. Values are strings because they
# are passed straight onto the perf binary's command line.
nthreads = ["1"]
nprocs = ["2"]  # MPI ranks for the multi-process variant
ngpus_single = ["1","2","4"]  # GPUs per thread in single-process runs
ngpus_mpi = ["1","2"]  # GPUs per rank in MPI runs
byte_range = [("4", "128M")]  # (min, max) message size
op = ["sum", "prod", "min", "max"]
step_factor = ["2"]  # message-size multiplier between steps
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
memory_type = ["coarse","fine", "host"]
# The perf binary lives in the sibling build directory.
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/broadcast_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
                         itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_BroadcastSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
    """Run broadcast_perf in a single process and fail on a non-zero exit.

    Fix vs. original: subprocess.run() never raises CalledProcessError unless
    check=True, so the old try/except was dead code -- and its handler
    referenced `rccl_test`, which would be unbound had run() actually raised.
    Failure output is now printed from an explicit returncode check instead.
    """
    args = [executable,
            "-t", nthreads,
            "-g", ngpus_single,
            "-b", byte_range[0],
            "-e", byte_range[1],
            "-o", op,
            "-f", step_factor,
            "-d", datatype,
            "-y", memory_type]
    if memory_type == "fine":
        # Environment-variable prefix works because the command runs via shell=True.
        args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
    args_str = " ".join(args)
    rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
    if rccl_test.returncode != 0:
        print(rccl_test.stdout)
        pytest.fail("Broadcast test error(s) detected.")
    assert rccl_test.returncode == 0
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
                         itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_BroadcastMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
    """Run broadcast_perf under mpirun (optionally with --hostfile); fail on non-zero exit.

    Fixes vs. original: the hostfile branch appended '-y', memory_type, but
    memory_type is not a parameter of this test -- it resolved to the
    module-level *list*, so " ".join(args) raised TypeError (and
    `memory_type == "fine"` compared a list to a string). The -y flag is now
    omitted in both branches, matching the non-hostfile path. The dead
    try/except (subprocess.run never raises CalledProcessError without
    check=True, and its handler used a possibly-unbound rccl_test) was
    replaced with an explicit returncode check.
    """
    mpi_hostfile = request.config.getoption('--hostfile')
    args = ["mpirun -np", nprocs]
    if mpi_hostfile:
        args += ["-host", mpi_hostfile]
    args += [executable,
             "-p 1",
             "-t", nthreads,
             "-g", ngpus_mpi,
             "-b", byte_range[0],
             "-e", byte_range[1],
             "-o", op,
             "-f", step_factor,
             "-d", datatype]
    args_str = " ".join(args)
    print(args_str)
    rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
    if rccl_test.returncode != 0:
        pytest.fail("Broadcast test error(s) detected.")
    assert rccl_test.returncode == 0
\ No newline at end of file
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
# Parameter grids swept by the tests below. Values are strings because they
# are passed straight onto the perf binary's command line.
nthreads = ["1"]
nprocs = ["2"]  # MPI ranks for the multi-process variant
ngpus_single = ["1","2","4"]  # GPUs per thread in single-process runs
ngpus_mpi = ["1","2"]  # GPUs per rank in MPI runs
byte_range = [("4", "128M")]  # (min, max) message size
op = ["sum", "prod", "min", "max"]
step_factor = ["2"]  # message-size multiplier between steps
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
memory_type = ["coarse","fine", "host"]
# The perf binary lives in the sibling build directory.
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/reduce_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
                         itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_ReduceSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
    """Run reduce_perf in a single process and fail on a non-zero exit.

    Fix vs. original: subprocess.run() never raises CalledProcessError unless
    check=True, so the old try/except was dead code -- and its handler
    referenced `rccl_test`, which would be unbound had run() actually raised.
    Failure output is now printed from an explicit returncode check instead.
    """
    args = [executable,
            "-t", nthreads,
            "-g", ngpus_single,
            "-b", byte_range[0],
            "-e", byte_range[1],
            "-o", op,
            "-f", step_factor,
            "-d", datatype,
            "-y", memory_type]
    if memory_type == "fine":
        # Environment-variable prefix works because the command runs via shell=True.
        args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
    args_str = " ".join(args)
    rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
    if rccl_test.returncode != 0:
        print(rccl_test.stdout)
        pytest.fail("Reduce test error(s) detected.")
    assert rccl_test.returncode == 0
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
                         itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_ReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
    """Run reduce_perf under mpirun (optionally with --hostfile); fail on non-zero exit.

    Fixes vs. original: the hostfile branch appended '-y', memory_type, but
    memory_type is not a parameter of this test -- it resolved to the
    module-level *list*, so " ".join(args) raised TypeError (and
    `memory_type == "fine"` compared a list to a string). The -y flag is now
    omitted in both branches, matching the non-hostfile path. The dead
    try/except (subprocess.run never raises CalledProcessError without
    check=True, and its handler used a possibly-unbound rccl_test) was
    replaced with an explicit returncode check.
    """
    mpi_hostfile = request.config.getoption('--hostfile')
    args = ["mpirun -np", nprocs]
    if mpi_hostfile:
        args += ["-host", mpi_hostfile]
    args += [executable,
             "-p 1",
             "-t", nthreads,
             "-g", ngpus_mpi,
             "-b", byte_range[0],
             "-e", byte_range[1],
             "-o", op,
             "-f", step_factor,
             "-d", datatype]
    args_str = " ".join(args)
    print(args_str)
    rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
    if rccl_test.returncode != 0:
        pytest.fail("Reduce test error(s) detected.")
    assert rccl_test.returncode == 0
\ No newline at end of file
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
# Parameter grids swept by the tests below. Values are strings because they
# are passed straight onto the perf binary's command line.
nthreads = ["1"]
nprocs = ["2"]  # MPI ranks for the multi-process variant
ngpus_single = ["1","2","4"]  # GPUs per thread in single-process runs
ngpus_mpi = ["1","2"]  # GPUs per rank in MPI runs
byte_range = [("4", "128M")]  # (min, max) message size
op = ["sum", "prod", "min", "max"]
step_factor = ["2"]  # message-size multiplier between steps
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
memory_type = ["coarse","fine", "host"]
# The perf binary lives in the sibling build directory.
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/reduce_scatter_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
                         itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_ReduceScatterSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
    """Run reduce_scatter_perf in a single process and fail on a non-zero exit.

    Fix vs. original: subprocess.run() never raises CalledProcessError unless
    check=True, so the old try/except was dead code -- and its handler
    referenced `rccl_test`, which would be unbound had run() actually raised.
    Failure output is now printed from an explicit returncode check instead.
    """
    args = [executable,
            "-t", nthreads,
            "-g", ngpus_single,
            "-b", byte_range[0],
            "-e", byte_range[1],
            "-o", op,
            "-f", step_factor,
            "-d", datatype,
            "-y", memory_type]
    if memory_type == "fine":
        # Environment-variable prefix works because the command runs via shell=True.
        args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
    args_str = " ".join(args)
    rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
    if rccl_test.returncode != 0:
        print(rccl_test.stdout)
        pytest.fail("ReduceScatter test error(s) detected.")
    assert rccl_test.returncode == 0
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
                         itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_ReduceScatterMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
    """Run reduce_scatter_perf under mpirun (optionally with --hostfile); fail on non-zero exit.

    Fixes vs. original: the hostfile branch appended '-y', memory_type, but
    memory_type is not a parameter of this test -- it resolved to the
    module-level *list*, so " ".join(args) raised TypeError (and
    `memory_type == "fine"` compared a list to a string). The -y flag is now
    omitted in both branches, matching the non-hostfile path. The dead
    try/except (subprocess.run never raises CalledProcessError without
    check=True, and its handler used a possibly-unbound rccl_test) was
    replaced with an explicit returncode check.
    """
    mpi_hostfile = request.config.getoption('--hostfile')
    args = ["mpirun -np", nprocs]
    if mpi_hostfile:
        args += ["-host", mpi_hostfile]
    args += [executable,
             "-p 1",
             "-t", nthreads,
             "-g", ngpus_mpi,
             "-b", byte_range[0],
             "-e", byte_range[1],
             "-o", op,
             "-f", step_factor,
             "-d", datatype]
    args_str = " ".join(args)
    print(args_str)
    rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
    if rccl_test.returncode != 0:
        pytest.fail("ReduceScatter test error(s) detected.")
    assert rccl_test.returncode == 0
\ No newline at end of file
#
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE.txt for license information
#
# NOTE(review): the leading '#' makes this line a Make comment, so common.mk
# is NOT actually included here -- confirm whether that is intentional.
#include ../../makefiles/common.mk
.PHONY: all clean
# Build layout: outputs land under <repo>/build/test/verifiable.
BUILDDIR := $(abspath ../../build)
DST_DIR := $(BUILDDIR)/test/verifiable
# Toolchain / dependency locations (overridable from the environment).
ROCM_PATH ?= /opt/rocm
MPI_HOME ?= /usr/lib/openmpi
PREFIX ?= /usr/local
VERBOSE ?= 0
DEBUG ?= 0
NCCL_HOME ?= ""
HIPCC = $(ROCM_PATH)/bin/hipcc
CXX = $(HIPCC)
HIPCUFLAGS := -std=c++14
LDFLAGS :=
HIPLDFLAGS :=
# Point compiler and linker at an external NCCL/RCCL install when given.
ifneq ($(NCCL_HOME), "")
HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME)
endif
HIPCUFLAGS += -I$(ROCM_PATH)/include
HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
LDFLAGS += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
# Optimized by default; DEBUG=1 switches to an unoptimized debug build.
ifeq ($(DEBUG), 0)
HIPCUFLAGS += -O3
else
HIPCUFLAGS += -O0 -g -ggdb3
endif
# Suppress command echo unless VERBOSE=1.
ifeq ($(VERBOSE), 0)
.SILENT:
endif
# MPI=1 selects OpenMPI, MPICH=1 selects MPICH; both define MPI_SUPPORT.
ifeq ($(MPI), 1)
HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi
else ifeq ($(MPICH), 1)
HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich -I/usr/include/x86_64-linux-gnu/mpich
HIPLDFLAGS += -L/usr/lib -lmpich
endif
LIBRARIES += rccl
HIPLDFLAGS += $(LIBRARIES:%=-l%)
all: $(DST_DIR)/verifiable.o $(DST_DIR)/self_test
clean:
	rm -rf $(DST_DIR)
# verifiable.mk supplies the $(DST_DIR)/verifiable.o rule.
TEST_VERIFIABLE_SRCDIR := .
TEST_VERIFIABLE_BUILDDIR := $(DST_DIR)
include verifiable.mk
# Convenience alias for the standalone self-test binary.
self_test: $(DST_DIR)/self_test
$(DST_DIR)/self_test: verifiable.cu verifiable.h
	@printf "Linking %s\n" $@
	@mkdir -p $(DST_DIR)
	$(HIPCC) -o $@ $(HIPCUFLAGS) -DSELF_TEST=1 verifiable.cu $(HIPLDFLAGS)
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
/* Generate parameters for our error bound model of floating point average
* (sum of scaled values) by sampling sums of random sequences for each
* floating point type.
*
* The model has parameters "coef" and "power", where for two floats a & b,
* they are close enough if and only if:
* abs(intBits(a) - intBits(b)) <= 1 + coef*pow(rank_n, power);
*
* Where intBits(x) is the reinterpretation of the float bitpattern as an integer.
*
* Compile with:
* nvcc -gencode=arch=compute_80,code=sm_80
*/
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdint>
#include <hip/hip_bfloat16.h>
#include <hip/hip_fp16.h>
using std::uint64_t;
using std::uint32_t;
// hip_bfloat16 stands in for CUDA's bfloat16 type on ROCm.
using bfloat16 = hip_bfloat16;
// Per-type constants and device arithmetic helpers; specialized below for
// float, double, __half and bfloat16.
template<typename T>
struct float_traits;
// float: 23-bit mantissa, 8-bit exponent; bit pattern fits in uint32_t.
template<>
struct float_traits<float> {
  static constexpr int mantissa_bits = 23;
  static constexpr int exponent_bits = 8;
  using uint_t = uint32_t;
  __device__ static float make(double x) { return (float)x; }
  __device__ static float make(uint64_t x) { return (float)x; }
  __device__ static double todouble(float x) { return x; }
  __device__ static float add(float a, float b) { return a+b; }
  __device__ static float mul(float a, float b) { return a*b; }
};
// double: 52-bit mantissa, 11-bit exponent; also serves as the exact
// reference type in the kernel below.
template<>
struct float_traits<double> {
  static constexpr int mantissa_bits = 52;
  static constexpr int exponent_bits = 11;
  using uint_t = uint64_t;
  __device__ static double make(double x) { return x; }
  __device__ static double make(uint64_t x) { return (double)x; }
  __device__ static double todouble(double x) { return x; }
  __device__ static double add(double a, double b) { return a+b; }
  __device__ static double mul(double a, double b) { return a*b; }
};
// half: 10-bit mantissa, 5-bit exponent; arithmetic goes through the device
// half-precision intrinsics.
template<>
struct float_traits<__half> {
  static constexpr int mantissa_bits = 10;
  static constexpr int exponent_bits = 5;
  using uint_t = uint16_t;
  __device__ static __half make(double x) { return __float2half((float)x); }
  // NOTE(review): __int2half_rn takes an int, so x is narrowed from uint64_t
  // here -- confirm callers only pass small values.
  __device__ static __half make(uint64_t x) { return __int2half_rn(x); }
  __device__ static double todouble(__half x) { return __half2float(x); }
  __device__ static __half add(__half a, __half b) { return __hadd(a, b); }
  __device__ static __half mul(__half a, __half b) { return __hmul(a, b); }
};
// bfloat16: 7-bit mantissa, float-sized 8-bit exponent.
// NOTE(review): add/mul cast the operands to float and pass them to
// __hadd/__hmul (half intrinsics) before truncating back to bfloat16 --
// verify this rounding path is the intended one.
template<>
struct float_traits<bfloat16> {
  static constexpr int mantissa_bits = 7;
  static constexpr int exponent_bits = 8;
  using uint_t = uint16_t;
  __device__ static bfloat16 make(double x) { return bfloat16(x); }
  __device__ static bfloat16 make(uint64_t x) { return bfloat16(x); }
  __device__ static double todouble(bfloat16 x) { return double(x); }
  __device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return bfloat16(__hadd((float)a, (float)b)); }
  __device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return bfloat16(__hmul((float)a, (float)b)); }
};
// Distance between a and b measured in integer bit-pattern steps (ULP-style):
// each float is reinterpreted as an unsigned integer via union type punning
// and the absolute difference is returned. NOTE(review): union punning is
// technically UB in ISO C++ (device-code idiom here), and the result can
// overflow int for widely separated values -- assumed small in practice.
template<typename F>
__device__ int compare(F a, F b) {
  union { typename float_traits<F>::uint_t ua; F fa; };
  union { typename float_traits<F>::uint_t ub; F fb; };
  ua=0; ub=0;  // zero-fill so narrower F leaves high bits clear
  fa=a; fb=b;
  //std::printf("bits(%1.10f)=%x bits(%1.10f)=%x\n", fa, ua, fb, ub);
  return ua < ub ? ub-ua : ua-ub;
}
// xoshiro256** pseudo-random generator (Blackman & Vigna), one instance per
// thread, seeded from the thread index.
struct xoshiro256ss {
  uint64_t s[4];  // 256-bit generator state
  // Derive a distinct state from `seed` by perturbing fixed constants.
  __device__ xoshiro256ss(int seed) {
    constexpr uint64_t src[4] = {0xbb99e851d1f545cc, 0xbfc4022389ca40cb, 0xe84aff5cb1914af5, 0x845999858284de77};
    for(int i=0; i < 4; i++)
      s[i] = src[i] + (seed + i)*0xb45de8a52fdb65d3;
  }
  // Advance the state and return the next 64-bit random value.
  __device__ uint64_t operator()() {
    auto rol64 = [](uint64_t x, int k) {
      return (x << k) | (x >> (64 - k));
    };
    uint64_t const result = rol64(s[1] * 5, 7) * 9;  // the "**" scrambler
    uint64_t const t = s[1] << 17;
    s[2] ^= s[0];
    s[3] ^= s[1];
    s[1] ^= s[2];
    s[0] ^= s[3];
    s[2] ^= t;
    s[3] = rol64(s[3], 45);
    return result;
  }
};
// Warp-wide max reduction via shuffle-down, a stand-in for CUDA's
// __reduce_max_sync intrinsic.
// NOTE(review): the first iteration uses delta == warpSize; __shfl_down with
// an out-of-range source lane is target-defined -- confirm it returns the
// caller's own value on this platform, otherwise start at warpSize/2.
static __device__ int __reduce_max_sync(unsigned int mask, int value)
{
  //We ignore mask, since all bits are set when calling them in the
  //test code below.
  int width = warpSize;
  for (unsigned int i = warpSize; i; i >>= 1) {
    value = max(__shfl_down(value, i, width), value);
  }
  return value;
}
// Sample sums of random sequences in type F and fit the error model from the
// file header: err <= 1 + coef*pow(rank_n, power). Pass 0 estimates the
// exponent (averaged over rounds); pass 1 re-runs with that exponent fixed
// to derive `coef`, which thread 0 prints.
template<typename F>
__global__ void kernel() {
  using traits = float_traits<F>;
  constexpr int samps = 4<<10;   // number of independent sample accumulators
  __shared__ F accf[samps];      // running sums in the type under test
  __shared__ double accd[samps]; // exact reference sums in double
  xoshiro256ss rng(threadIdx.x);
  float expo_avg = 1;            // model exponent; refined after pass 0
  for(int pass=0; pass < 2; pass++) {
    // Per-thread scale factor, mimicking the "sum of scaled values" average.
    F scalar = traits::make(1.0/(3.14159 + .5*threadIdx.x));
    int err_max = 0;
    float coef = 0;
    double expo_sum = 0;
    int expo_n = 0;
    // Cap the simulated rank count so narrow types don't saturate.
    int max_ranks = std::is_same<F,float>::value ? 16<<10 : 1<<traits::mantissa_bits;
    for(int round=0; round < 1 + (16<<10)/max_ranks; round++) {
    //for(int round=0; round < 2; round++) {
      // Reset both accumulators before each simulated allreduce.
      for(int i=threadIdx.x; i < samps; i += blockDim.x) {
        accf[i] = (F)0;
        accd[i] = 0;
      }
      __syncthreads();
      // r plays the role of the contributing rank index.
      for(int r=0; r < max_ranks; r++) {
        int err = 0;
        for(int i=threadIdx.x; i < samps; i+=blockDim.x) {
          constexpr uint64_t m = (1ll<<traits::mantissa_bits)-1;
          // float gets random mantissa-sized integers; narrow types add 1.0.
          double d = std::is_same<F,float>::value ? double(rng() & m) : 1.0;
          F f = traits::make(d);
          accf[i] = traits::add(accf[i], traits::mul(scalar, f));
          accd[i] += traits::todouble(f);
          //if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d f=%f\n", r, traits::todouble(accf[i]));
          // ULP distance between the F-typed sum and the rounded exact sum.
          int e = compare(accf[i], traits::mul(scalar, traits::make(accd[i])));
          err = err > e ? err : e;
        }
        err = __reduce_max_sync(-1u, err);
        err_max = err_max > err ? err_max : err;
        if (r >= 2) {
          // err = 1 + coef*pow(r,expo)  =>  solve for coef, keep the max.
          float c = float(err-1)/powf(float(r), expo_avg);
          coef = coef > c ? coef : c;
        }
        if (r >= 2) {
          // Pass-0 exponent estimate from the running max error.
          double expo = log2f(1+err_max)/log2f(r);
          expo_sum += expo;
          expo_n++;
          //if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d err=%d errmax=%d expo=%f sum=%f n=%d\n", r, err, err_max, expo, expo_sum, expo_n);
        }
      }
    }
    if(pass==0)
      expo_avg = expo_sum/expo_n;
    else if(threadIdx.x == 0)
      printf(" coef=%1.10f expo=%1.10f\n", coef, expo_avg);
  }
}
// Launch one 32-thread block of the sampler for each supported type and let
// each kernel print its fitted (coef, expo) pair. double is exercised only
// as the in-kernel reference, so it gets no run of its own.
int main() {
  std::printf("type=float:\n");
  kernel<float><<<1,32>>>();
  hipDeviceSynchronize();
  std::printf("\ntype=half:\n");
  kernel<half><<<1,32>>>();
  hipDeviceSynchronize();
  std::printf("\ntype=bfloat16:\n");
  kernel<bfloat16><<<1,32>>>();
  hipDeviceSynchronize();
  return 0;
}
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment