Commit f3af1da6 authored by Andriy Roshchenko

Merge remote-tracking branch 'internal/andriy/lwpck-2788' into andriy/lwpck-2788

parents 2bef5501 60b885ae
@@ -541,7 +541,7 @@ endif()
message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
add_compile_options(-fcolor-diagnostics)
# add_compile_options(-fcolor-diagnostics)
endif()
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9)
add_compile_options(-fdiagnostics-color=always)
...
{
"version": 3,
"configurePresets": [
{
"name": "linux-debug",
"displayName": "Linux Debug",
"hidden": true,
"generator": "Unix Makefiles",
"binaryDir": "${sourceDir}/build/${presetName}",
"installDir": "${sourceDir}/build/install/${presetName}",
"environment": {
"MY_ENVIRONMENT_VARIABLE": "NONE",
"PATH": "/usr/local/.cargo/bin:$penv{PATH}",
"SCCACHE_IDLE_TIMEOUT": "11000"
},
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Debug",
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
"BUILD_DEV": "ON",
"CMAKE_CXX_COMPILER": "/opt/rocm/bin/hipcc",
"CMAKE_PREFIX_PATH": "/opt/rocm",
"CMAKE_CXX_COMPILER_LAUNCHER": "sccache",
"CMAKE_C_COMPILER_LAUNCHER": "sccache"
},
"condition": {
"type": "equals",
"lhs": "${hostSystemName}",
"rhs": "Linux"
}
},
{
"name": "MI355-debug",
"displayName": "MI355 Debug",
"inherits": "linux-debug",
"description": "Development Environment for MI355.",
"cacheVariables": {
"GPU_TARGETS": "gfx950",
"CMAKE_BUILD_TYPE": "Debug",
"CMAKE_CXX_FLAGS": "-O0 -ggdb"
}
},
{
"name": "MI355-release",
"displayName": "MI355 Release",
"inherits": "linux-debug",
"cacheVariables": {
"GPU_TARGETS": "gfx950",
"CMAKE_BUILD_TYPE": "Release",
"CMAKE_CXX_FLAGS": "-O3"
}
},
{
"name": "MI300X-release",
"displayName": "MI300X Release",
"inherits": "linux-debug",
"cacheVariables": {
"GPU_TARGETS": "gfx942",
"CMAKE_BUILD_TYPE": "Release",
"CMAKE_CXX_FLAGS": "-O3"
}
},
{
"name": "MI250-release",
"displayName": "MI250 Release",
"inherits": "linux-debug",
"cacheVariables": {
"GPU_TARGETS": "gfx90a",
"CMAKE_BUILD_TYPE": "Release",
"CMAKE_CXX_FLAGS": "-O3",
"CK_USE_FP8_ON_UNSUPPORTED_ARCH":"ON"
}
},
{
"name": "MI250-debug",
"displayName": "MI250 Debug",
"inherits": "linux-debug",
"cacheVariables": {
"GPU_TARGETS": "gfx90a",
"CMAKE_BUILD_TYPE": "Debug",
"CMAKE_CXX_FLAGS": "-O0 -ggdb",
"CK_USE_FP8_ON_UNSUPPORTED_ARCH":"ON"
}
},
{
"name": "RX7800-release",
"displayName": "RX7800 Release",
"inherits": "linux-debug",
"cacheVariables": {
"GPU_TARGETS": "gfx1101",
"DL_KERNELS": "ON",
"CMAKE_BUILD_TYPE": "Release",
"CMAKE_CXX_FLAGS": "-O3"
}
},
{
"name": "RX7800-debug",
"displayName": "RX7800 Debug",
"inherits": "linux-debug",
"cacheVariables": {
"GPU_TARGETS": "gfx1101",
"DL_KERNELS": "ON",
"CMAKE_BUILD_TYPE": "Debug",
"CMAKE_CXX_FLAGS": "-O0 -ggdb"
}
}
],
"buildPresets": [
{
"name": "Debug",
"hidden": true,
"configuration": "Debug"
},
{
"name": "Release",
"hidden": true,
"configuration": "Release"
},
{
"name": "MI355-debug",
"displayName": "MI355",
"configurePreset": "MI355-debug",
"description": "Build Environment for MI355 Debug.",
"inherits": [
"Debug"
],
"jobs": 128
},
{
"name": "MI355-release",
"displayName": "MI355",
"configurePreset": "MI355-release",
"description": "Build Environment for MI355 Release.",
"inherits": [
"Release"
],
"jobs": 128
},
{
"name": "MI300X-release",
"displayName": "MI300X",
"configurePreset": "MI300X-release",
"description": "Build Environment for MI300X Release.",
"inherits": [
"Release"
],
"jobs": 128
},
{
"name": "MI250-release",
"displayName": "MI250",
"configurePreset": "MI250-release",
"description": "Build Environment for MI250 Release.",
"inherits": [
"Release"
],
"jobs": 128
},
{
"name": "MI250-debug",
"displayName": "MI250",
"configurePreset": "MI250-debug",
"description": "Build Environment for MI250 Debug.",
"inherits": [
"Debug"
],
"jobs": 128
},
{
"name": "RX7800-release",
"displayName": "RX7800",
"configurePreset": "RX7800-release",
"description": "Build Environment for RX7800 Release.",
"inherits": [
"Release"
],
"jobs": 128
},
{
"name": "RX7800-debug",
"displayName": "RX7800",
"configurePreset": "RX7800-debug",
"description": "Build Environment for RX7800 Debug.",
"inherits": [
"Debug"
],
"jobs": 128
}
]
}
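For reference, these are standard CMake presets (schema version 3, which needs CMake 3.21 or newer), so configuring and building, say, the gfx90a debug flavor is expected to work as cmake --preset MI250-debug followed by cmake --build --preset MI250-debug. The presets also assume that sccache is available on PATH (both compiler launchers point at it) and that hipcc lives under /opt/rocm; adjust those paths for a different ROCm install.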
@@ -359,6 +359,21 @@ struct GeneratorTensor_Sequential
}
};
template <ck::index_t Dim>
struct GeneratorTensor_Sequential<ck::e8m0_bexp_t, Dim>
{
int offset = 0;
template <typename... Ts>
ck::e8m0_bexp_t operator()(Ts... Xs) const
{
std::array<ck::index_t, sizeof...(Ts)> dims = {{static_cast<ck::index_t>(Xs)...}};
int tmp = dims[Dim];
return ck::type_convert<ck::e8m0_bexp_t>(powf(2, tmp + offset));
}
};
template <typename T, size_t NumEffectiveDim = 2>
struct GeneratorTensor_Diagonal
{
...
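A note on the new GeneratorTensor_Sequential specialization for ck::e8m0_bexp_t above: e8m0 is the OCP MX shared-scale format that stores only a biased 8-bit exponent (value = 2^(stored - 127)), so the generator emits exact powers of two and type_convert merely recovers the biased exponent from powf(2, tmp + offset). A minimal host-side sketch of that round trip, using a hypothetical stand-in struct rather than the real ck type:

#include <cmath>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for ck::e8m0_bexp_t; assumes the OCP MX e8m0 definition
// value = 2^(stored - 127). The real ck type may differ in edge-case handling.
struct e8m0_sketch
{
    uint8_t stored;
    static e8m0_sketch from_float(float x)
    {
        return {static_cast<uint8_t>(std::ilogb(x) + 127)};
    }
    float to_float() const { return std::ldexp(1.0f, static_cast<int>(stored) - 127); }
};

int main()
{
    // Mirrors what the generator does for dims[Dim] + offset in {-2, ..., 2}.
    for(int k = -2; k <= 2; ++k)
    {
        const e8m0_sketch s = e8m0_sketch::from_float(std::pow(2.0f, static_cast<float>(k)));
        std::printf("2^%d -> stored 0x%02X -> %g\n", k, static_cast<unsigned>(s.stored), s.to_float());
    }
    return 0;
}

With offset = 0 this yields a scale of 2^dims[Dim] along the chosen dimension, which is presumably what the MXMFMA test initializers later in this diff rely on.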
@@ -780,7 +780,6 @@ struct mfma_type<MfmaInstr::mfma_f32_16x16x32bf8f8>
}
};
// TODO: fix mfma...f8f6f4 instructions
template <>
struct mfma_type<MfmaInstr::mfma_f32_32x32x64f8f6f4>
{
@@ -847,9 +846,14 @@ struct mfma_type<MfmaInstr::mfma_scale_f32_32x32x64f8f6f4>
// clang-format on
template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
__device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
__device__ void run(const FloatA& a,
const int32_t& scale_a,
const FloatB& b,
const int32_t& scale_b,
FloatC& reg_c) const
{
intrin_mfma_scale_f32_32x32x64f8f6f4<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
intrin_mfma_scale_f32_32x32x64f8f6f4<MPerXdlops, NPerXdlops>::Run(
a, scale_a, b, scale_b, reg_c);
}
};
@@ -871,9 +875,14 @@ struct mfma_type<MfmaInstr::mfma_scale_f32_16x16x128f8f6f4>
// clang-format on
template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
__device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
{
intrin_mfma_scale_f32_16x16x128f8f6f4<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
__device__ void run(const FloatA& a,
const int32_t& scale_a,
const FloatB& b,
const int32_t& scale_b,
FloatC& reg_c) const
{
intrin_mfma_scale_f32_16x16x128f8f6f4<MPerXdlops, NPerXdlops>::Run(
a, scale_a, b, scale_b, reg_c);
}
};
...
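The two hunks above thread a pair of 32-bit operands, scale_a and scale_b, from mfma_type::run() down into intrin_mfma_scale_f32_*f8f6f4::Run. Judging by the intrinsic call later in this diff, each operand carries packed e8m0 biased exponents, and the zero immediates placed before each scale appear to select which byte the hardware consumes. A hedged helper for assembling such an operand (illustrative only, not part of the ck API):

#include <cstdint>

// Packs four e8m0 biased exponents into one 32-bit scale operand.
// Assumption: byte 0 is the byte used when the builtin's byte-select
// immediates are 0, as they are everywhere in this commit.
inline int32_t pack_e8m0_scales(uint8_t s0, uint8_t s1, uint8_t s2, uint8_t s3)
{
    const uint32_t packed = static_cast<uint32_t>(s0) |
                            (static_cast<uint32_t>(s1) << 8) |
                            (static_cast<uint32_t>(s2) << 16) |
                            (static_cast<uint32_t>(s3) << 24);
    return static_cast<int32_t>(packed);
}

For example, pack_e8m0_scales(127, 127, 127, 127) encodes four unit scales (2^0 = 1.0).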
@@ -519,12 +519,36 @@ struct intrin_mfma_scale_f32_32x32x64f8f6f4<32, 32>
{
template <class FloatC>
__device__ static void Run(const f8x32_t& reg_a,
const int32_t scale_a,
const int32_t& scale_a,
const f8x32_t& reg_b,
const int32_t scale_b,
const int32_t& scale_b,
FloatC& reg_c)
{
#if defined(__gfx950__)
if(threadIdx.x == 0 || threadIdx.x == 32)
{
printf("thread: %u -- xA: %x\n", threadIdx.x, static_cast<uint32_t>(scale_a));
printf("thread: %u -- xB: %x\n", threadIdx.x, static_cast<uint32_t>(scale_b));
// printf("intrin_mfma_scale_f32_32x32x64f8f6f4<32, 32> thread: %u -- scale_a: %f\n",
// threadIdx.x,
// static_cast<float>(ck::e8m0_bexp_t(scale_a)));
// printf("intrin_mfma_scale_f32_32x32x64f8f6f4<32, 32> thread: %u -- scale_b: %f\n",
// threadIdx.x,
// static_cast<float>(ck::e8m0_bexp_t(scale_b)));
// for(size_t i = 0; i < 32; i++)
// {
// printf("thread: %u -- reg_a[%zu]: %f\n",
// threadIdx.x,
// i,
// type_convert<float>(f8_t{static_cast<f8x32_t::data_v>(reg_a)[i]}));
// // printf("thread: %u -- reg_a[%zu]: %f\n",
// // threadIdx.x,
// // i,
// // type_convert<float>(f8_t{static_cast<f8x32_t::data_v>(reg_b)[i]}));
// }
}
// https://github.com/ROCm/llvm-project/blob/656552edc693e2bb4abc9258399c39d190fce2b3/llvm/test/Verifier/AMDGPU/mfma-scale.ll#L10
reg_c.template AsType<float16_t>()(Number<0>{}) =
__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
...
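The hunk above is cut off before the builtin's argument list; the standalone kernel at the end of this commit shows the complete call on gfx950. A hedged restatement of that call shape (operand roles inferred from the kernel's comments and the LLVM test linked above; this only compiles when targeting gfx950):

#include <cstdint>
#include <hip/hip_runtime.h>

using f8x32_v  = uint8_t __attribute__((ext_vector_type(32)));
using f32x16_v = float __attribute__((ext_vector_type(16)));

__device__ f32x16_v scaled_mfma_32x32x64(f8x32_v a, f8x32_v b, f32x16_v c,
                                         int32_t scale_a, int32_t scale_b)
{
    return __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
        a, b, c,
        0,           // cbsz: source format selector for A (fp8 in this commit)
        0,           // blgp: source format selector for B (fp8 in this commit)
        0, scale_a,  // byte select + packed e8m0 scales for A
        0, scale_b); // byte select + packed e8m0 scales for B
}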
@@ -30,11 +30,11 @@ bool run_mfma_test(ck::index_t init)
constexpr auto BLOCK_N = mfma_instr.n_per_blk;
constexpr auto BLOCK_K = mfma_instr.num_input_blks * mfma_instr.k_per_blk;
const auto mx_mfma_kernel = ck::matmul<AType, BType, CType, AccType, BLOCK_M, BLOCK_N, BLOCK_K>;
const auto mfma_kernel = ck::matmul<AType, BType, CType, AccType, BLOCK_M, BLOCK_N, BLOCK_K>;
bool pass = true;
pass = ck::mfma_test::TestMFMA<decltype(mx_mfma_kernel),
pass = ck::mfma_test::TestMFMA<decltype(mfma_kernel),
AType,
BType,
CType,
@@ -45,7 +45,7 @@ CLayout,
CLayout,
BLOCK_M,
BLOCK_N,
BLOCK_K>{}(mx_mfma_kernel, init);
BLOCK_K>{}(mfma_kernel, init);
return pass;
}
@@ -63,3 +63,98 @@ TEST(MFMA, FP8MFMA32x32x64)
auto pass = run_mfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::F32_32x32x64>(AB_init);
EXPECT_TRUE(pass);
}
/**
* @brief Run the test for the given MX MFMA instruction
*
* @param init - selects initialization algorithm for A and B tensors
*/
template <typename AType, typename BType, typename CType, ck::MFMA_F8F6F4 mfma>
bool run_mxmfma_test(ck::index_t init)
{
static_assert(mfma == ck::MFMA_F8F6F4::SCALE_F32_16x16x128 ||
mfma == ck::MFMA_F8F6F4::SCALE_F32_32x32x64,
"Only SCALE_F32_16x16x128 and SCALE_F32_32x32x64 are supported");
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;
using AccType = float; // only MFMA_F32 instructions supported
// using CPUAccType = AccType;
using ScaleType = ck::e8m0_bexp_t; // biased exponent type
ck::mfma_type<static_cast<ck::MfmaInstr>(mfma)> mfma_instr;
constexpr auto BLOCK_M = mfma_instr.m_per_blk;
constexpr auto BLOCK_N = mfma_instr.n_per_blk;
constexpr auto BLOCK_K = mfma_instr.num_input_blks * mfma_instr.k_per_blk;
constexpr auto BLOCK_X = 32; // scaling vector size
const auto mx_mfma_kernel =
ck::matmul<AType, BType, ScaleType, CType, AccType, BLOCK_M, BLOCK_N, BLOCK_K, BLOCK_X>;
bool pass = true;
pass = ck::mxmfma_test::TestMXMFMA<decltype(mx_mfma_kernel),
AType,
BType,
ScaleType,
CType,
ALayout,
BLayout,
CLayout,
BLOCK_M,
BLOCK_N,
BLOCK_K,
BLOCK_X>{}(mx_mfma_kernel, init);
return pass;
}
TEST(MXMFMA, MXFP8MFMA16x16x128i2)
{
auto AB_init = 2;
auto pass = run_mxmfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::SCALE_F32_16x16x128>(AB_init);
EXPECT_TRUE(pass);
}
TEST(MXMFMA, MXFP8MFMA32x32x64i2)
{
auto AB_init = 2;
auto pass = run_mxmfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::SCALE_F32_32x32x64>(AB_init);
EXPECT_TRUE(pass);
}
TEST(MXMFMA, MXFP8MFMA16x16x128i3)
{
auto AB_init = 3;
auto pass = run_mxmfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::SCALE_F32_16x16x128>(AB_init);
EXPECT_TRUE(pass);
}
TEST(MXMFMA, MXFP8MFMA32x32x64i3)
{
auto AB_init = 3;
auto pass = run_mxmfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::SCALE_F32_32x32x64>(AB_init);
EXPECT_TRUE(pass);
}
TEST(MXMFMA, MXFP8MFMA16x16x128i4)
{
auto AB_init = 4;
auto pass = run_mxmfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::SCALE_F32_16x16x128>(AB_init);
EXPECT_TRUE(pass);
}
TEST(MXMFMA, MXFP8MFMA32x32x64i4)
{
auto AB_init = 4;
auto pass = run_mxmfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::SCALE_F32_32x32x64>(AB_init);
EXPECT_TRUE(pass);
}
TEST(MXMFMA, MXFP8MFMA32x32x64i5)
{
auto AB_init = 5;
auto pass = run_mxmfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::SCALE_F32_32x32x64>(AB_init);
EXPECT_TRUE(pass);
}
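For context on BLOCK_X = 32 in run_mxmfma_test above: OCP microscaling (MX) formats attach one shared e8m0 scale to every 32 consecutive elements along K, so the reference value of a scaled dot product is the per-block partial sum weighted by 2^(Ea - 127) * 2^(Eb - 127). A hedged host-side reference (hypothetical helper, not the actual ck::mxmfma_test utilities):

#include <cmath>
#include <cstdint>
#include <vector>

// Scalar reference for an MX-scaled dot product along K.
// a and b hold already-decoded FP8 values as float; scale_a and scale_b hold
// one e8m0 biased exponent per 32-element block, so a.size() == b.size() ==
// 32 * scale_a.size() is assumed.
float mx_dot_reference(const std::vector<float>& a,
                       const std::vector<float>& b,
                       const std::vector<uint8_t>& scale_a,
                       const std::vector<uint8_t>& scale_b)
{
    constexpr int block_x = 32; // matches BLOCK_X in the tests above
    float acc = 0.0f;
    for(size_t blk = 0; blk < scale_a.size(); ++blk)
    {
        float partial = 0.0f;
        for(int i = 0; i < block_x; ++i)
        {
            partial += a[blk * block_x + i] * b[blk * block_x + i];
        }
        acc += std::ldexp(partial, (scale_a[blk] - 127) + (scale_b[blk] - 127));
    }
    return acc;
}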
#include <hip/hip_ext.h>
#include <hip/hip_runtime.h>
__global__ void kernel()
{
using dataAB = uint8_t __attribute__((ext_vector_type(32)));
using dataC = float __attribute__((ext_vector_type(16)));
using dataX = int32_t __attribute__((ext_vector_type(2)));
dataAB regA(0x38);
dataAB regB(0x38);
dataC regC(1.0f);
// dataC regCin(1.0f);
#if 1
// dataX xa{127, 127}; // 1.0
dataX xa(127 & 0xFF); // 1.0
dataX xb(127 & 0xFF); // 1.0
#else
dataX xa(0);
dataX xb(0);
#endif
#if 0
if(threadIdx.x == 0)
{
// xa = 127; // 1.0
for(size_t i = 0; i < 32; i++)
{
regA[i] = 0x38; // 1.0
}
for(size_t i = 0; i < 32; i++)
{
regB[i] = 0x38; // 1.0
}
printf("thread: %u -- xA: %x\n", threadIdx.x, xa[threadIdx.x / 32]);
printf("thread: %u -- xB: %x\n", threadIdx.x, xb[threadIdx.x / 32]);
}
if(threadIdx.x == 32)
{
// xa = 126; // 0.5
for(size_t i = 0; i < 32; i++)
{
regA[i] = 0xC0; // -2.0
}
for(size_t i = 0; i < 32; i++)
{
regB[i] = 0x38; // 1.0
}
printf("thread: %u -- xA: %x\n", threadIdx.x, xa[threadIdx.x / 32]);
printf("thread: %u -- xB: %x\n", threadIdx.x, xb[threadIdx.x / 32]);
}
#endif
__syncthreads();
printf("thread: %u -- regA: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x "
"%x %x %x %x %x %x %x %x %x %x\n",
threadIdx.x,
regA[0],
regA[1],
regA[2],
regA[3],
regA[4],
regA[5],
regA[6],
regA[7],
regA[8],
regA[9],
regA[10],
regA[11],
regA[12],
regA[13],
regA[14],
regA[15],
regA[16],
regA[17],
regA[18],
regA[19],
regA[20],
regA[21],
regA[22],
regA[23],
regA[24],
regA[25],
regA[26],
regA[27],
regA[28],
regA[29],
regA[30],
regA[31]);
printf("thread: %u -- regB: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x "
"%x %x %x %x %x %x %x %x %x %x\n",
threadIdx.x,
regB[0],
regB[1],
regB[2],
regB[3],
regB[4],
regB[5],
regB[6],
regB[7],
regB[8],
regB[9],
regB[10],
regB[11],
regB[12],
regB[13],
regB[14],
regB[15],
regB[16],
regB[17],
regB[18],
regB[19],
regB[20],
regB[21],
regB[22],
regB[23],
regB[24],
regB[25],
regB[26],
regB[27],
regB[28],
regB[29],
regB[30],
regB[31]);
//__builtin_amdgcn_mfma_ld_scale_b32(xb[threadIdx.x / 32], 0, 0);
regC = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(regA,
regB,
regC,
0, // cbsz
0, // blgp
0,
xa[threadIdx.x / 32],
0,
xb[threadIdx.x / 32]);
__syncthreads();
printf("thread: %u -- regC: %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f\n",
threadIdx.x,
regC[0],
regC[1],
regC[2],
regC[3],
regC[4],
regC[5],
regC[6],
regC[7],
regC[8],
regC[9],
regC[10],
regC[11],
regC[12],
regC[13],
regC[14],
regC[15]);
// printf("thread: %u -- regCin: %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f\n",
// threadIdx.x,
// regCin[0],
// regCin[1],
// regCin[2],
// regCin[3],
// regCin[4],
// regCin[5],
// regCin[6],
// regCin[7],
// regCin[8],
// regCin[9],
// regCin[10],
// regCin[11],
// regCin[12],
// regCin[13],
// regCin[14],
// regCin[15]);
}
int main()
{
kernel<<<1, 64>>>();
return 0;
}
\ No newline at end of file
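A quick sanity check for the scratch kernel above: 0x38 is +1.0 in FP8 E4M3 (sign 0, exponent 0111 with bias 7, mantissa 000), and an e8m0 byte of 127 encodes 2^0 = 1.0, so with every A and B element equal to 1.0 and unit scales, each of the 16 per-lane accumulators of the 32x32x64 instruction should reduce 64 products of 1.0 and land at 64 * (1.0 * 1.0) + 1.0 (the initial regC) = 65.0, assuming the operand layout is correct. The disabled #if 0 branch instead overwrites lane 32's A fragment with 0xC0 (-2.0 in E4M3), which would flip the sign and double the magnitude of the contributions fed by that lane's slice of K.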