"vscode:/vscode.git/clone" did not exist on "a11cb45c830147339c2ea52a3080201fa57bb35d"
Commit 6aee6155 authored by Jing Zhang's avatar Jing Zhang
Browse files

fixed scale logic

parent 3ba485b6
@@ -82,4 +82,4 @@ if(result EQUAL 0)
 endif()
-add_example_executable(gemm_xdl_input_i16_comp_i8_scale_ab gemm_xdl_input_i16_comp_i8_scale_ab.cpp)
+add_example_executable(example_gemm_xdl_input_i16_comp_i8_scale_ab gemm_xdl_input_i16_comp_i8_scale_ab.cpp)
@@ -44,16 +44,34 @@ struct i32_to_i8
 {
     __host__ __device__ void operator()(I8& y, const I32& x) const
     {
-        y = ck::type_convert<I8>(ck::type_convert<float>(x) * reduced_amex_scale);
+        float scale = (1.0 / reduced_amax) * int8_max;
+        y = ck::type_convert<I8>(ck::type_convert<float>(x) * scale);
     }
 
+    static constexpr float int8_max = 127;
+
     // this attribute will trigger a reduction op of the tensor to get the true amax scalue
-    float reduced_amex_scale = 1.0;
+    float reduced_amax = 1.0;
+};
+
+struct i8_to_i32
+{
+    __host__ __device__ void operator()(I32& y, const I8& x) const
+    {
+        float a_scale = (1.0 / a_reduced_amax) * int8_max;
+        float b_scale = (1.0 / b_reduced_amax) * int8_max;
+        float c_scale = (1.0 / (a_scale * b_scale));
+
+        y = ck::type_convert<I32>(ck::type_convert<float>(x) * c_scale);
+    }
+
+    static constexpr float int8_max = 127;
+
+    float a_reduced_amax = 1.0;
+    float b_reduced_amax = 1.0;
 };
 
 using AElementOp   = i32_to_i8;
 using BElementOp   = i32_to_i8;
-using CDEElementOp = PassThrough;
+using CDEElementOp = i8_to_i32;
 
 static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
@@ -261,17 +279,18 @@ int main(int argc, char* argv[])
         auto ref_argument = ref_gemm.MakeArgument(a_m_k,
                                                   b_k_n,
                                                   c_m_n,
-                                                  AElementOp{static_cast<float>(1.0) / amax},
-                                                  BElementOp{static_cast<float>(1.0) / bmax},
+                                                  AElementOp{static_cast<float>(amax)},
+                                                  BElementOp{static_cast<float>(bmax)},
                                                   PassThrough{});
 
         ref_invoker.Run(ref_argument);
 
+        auto cde_element_op_ = CDEElementOp{static_cast<float>(amax), static_cast<float>(bmax)};
+
         for(int m = 0; m < M; ++m)
         {
             for(int n = 0; n < N; ++n)
             {
-                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n));
+                cde_element_op_(e_m_n_host_result(m, n), c_m_n(m, n));
            }
         }
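For context, the corrected scale logic above is the usual symmetric int8 scheme: each input is scaled by int8_max / amax before conversion to int8, and the int32 accumulator is rescaled by 1 / (a_scale * b_scale) in the epilogue to recover the original magnitude, which is why CDEElementOp can no longer be PassThrough. The standalone sketch below (plain C++, no CK dependencies; the amax values and variable names are illustrative, not taken from the commit) walks one scalar product through that round trip.

// Minimal sketch of the quantize/dequantize math used by i32_to_i8 / i8_to_i32 (illustrative only).
#include <cmath>
#include <cstdint>
#include <cstdio>

int main()
{
    constexpr float int8_max = 127.0f;

    // Assumed per-tensor absolute maxima; in the real flow these come from a device-side amax reduction.
    const float a_reduced_amax = 4.7f;
    const float b_reduced_amax = 2.3f;

    const float a_scale = (1.0f / a_reduced_amax) * int8_max; // i32_to_i8 scale for A
    const float b_scale = (1.0f / b_reduced_amax) * int8_max; // i32_to_i8 scale for B
    const float c_scale = 1.0f / (a_scale * b_scale);         // i8_to_i32 epilogue scale

    // One scalar product as a stand-in for a GEMM accumulator entry.
    const float a = 3.1f, b = -1.4f;
    const int8_t a_q  = static_cast<int8_t>(std::lround(a * a_scale));
    const int8_t b_q  = static_cast<int8_t>(std::lround(b * b_scale));
    const int32_t acc = static_cast<int32_t>(a_q) * static_cast<int32_t>(b_q);

    const float recovered = static_cast<float>(acc) * c_scale; // undoes both input scales
    std::printf("exact %.4f  recovered %.4f\n", a * b, recovered);
    return 0;
}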
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ADataType = F16;
using BDataType = F16;
using AccDataType = F32;
using CShuffleDataType = F32;
using DDataType = F16;
using EDataType = F16;
using ALayout = Row;
using BLayout = Col;
using DLayout = Row;
using ELayout = Row;
struct AddScale
{
static constexpr auto I0 = ck::Number<0>{};
static constexpr auto I1 = ck::Number<1>{};
static constexpr auto I2 = ck::Number<2>{};
static constexpr auto I3 = ck::Number<3>{};
__host__ __device__ constexpr void
operator()(ck::half4_t& a, const ck::half4_t& a0, const ck::half4_t& a1) const
{
const auto a0_v_t = ck::vector_type<ck::half_t, 4>{a0};
const auto a1_v_t = ck::vector_type<ck::half_t, 4>{a1};
auto r_v_t = ck::vector_type<ck::half_t, 4>{};
r_v_t.AsType<ck::half_t>()(I0) =
scale * (a0_v_t.AsType<ck::half_t>()[I0] + a1_v_t.AsType<ck::half_t>()[I0]);
r_v_t.AsType<ck::half_t>()(I1) =
scale * (a0_v_t.AsType<ck::half_t>()[I1] + a1_v_t.AsType<ck::half_t>()[I1]);
r_v_t.AsType<ck::half_t>()(I2) =
scale * (a0_v_t.AsType<ck::half_t>()[I2] + a1_v_t.AsType<ck::half_t>()[I2]);
r_v_t.AsType<ck::half_t>()(I3) =
scale * (a0_v_t.AsType<ck::half_t>()[I3] + a1_v_t.AsType<ck::half_t>()[I3]);
a = r_v_t.AsType<ck::half4_t>()[I0];
}
__host__ __device__ constexpr void
operator()(ck::half_t& a, const ck::half_t& a0, const ck::half_t& a1) const
{
a = scale * (a0 + a1);
}
// this attribute will force copy_function applying element_wise with vector_type
static constexpr ck::index_t vec_len = 4;
float scale = 1.0;
};
struct AlphaBetaAdd
{
AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta){};
template <typename E, typename C, typename D>
__host__ __device__ constexpr void operator()(E& e, const C& c, const D& d) const;
template <>
__host__ __device__ constexpr void operator()<ck::half_t, float, ck::half_t>(
ck::half_t& e, const float& c, const ck::half_t& d) const
{
e = ck::type_convert<ck::half_t>(alpha_ * c + beta_ * ck::type_convert<float>(d));
};
float alpha_;
float beta_;
};
using AElementOp = AddScale;
using BElementOp = PassThrough;
using CDEElementOp = AlphaBetaAdd;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleABD_Xdl_CShuffle<
ck::Tuple<ALayout, ALayout>,
ck::Tuple<BLayout>,
ck::Tuple<DLayout>,
ELayout,
ck::Tuple<ADataType, ADataType>,
ck::Tuple<BDataType>,
AccDataType,
CShuffleDataType,
ck::Tuple<DDataType>,
EDataType,
AElementOp,
BElementOp,
CDEElementOp,
GemmSpec,
1,
256,
256,
128,
32,
8,
8,
32,
32,
4,
2,
S<4, 64, 1>,
S<1, 0, 2>,
S<1, 0, 2>,
2,
8,
8,
1,
S<4, 64, 1>,
S<1, 0, 2>,
S<1, 0, 2>,
2,
8,
8,
1,
1,
1,
S<1, 32, 1, 8>,
8>;
int main(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
ck::index_t N = 4096;
ck::index_t K = 4096;
ck::index_t StrideA = 4096;
ck::index_t StrideB = 4096;
ck::index_t StrideD = 4096;
ck::index_t StrideE = 4096;
float alpha = 1.0f;
float beta = 1.0f;
if(argc == 1)
{
// use default case
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 6)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
alpha = std::stof(argv[4]);
beta = std::stof(argv[5]);
}
else if(argc == 13)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideD = std::stoi(argv[9]);
StrideE = std::stoi(argv[10]);
alpha = std::stof(argv[11]);
beta = std::stof(argv[12]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, alpha, "
"beta\n");
exit(0);
}
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<ADataType> a1_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, DLayout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
std::cout << "a1_m_k: " << a1_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a0_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
a1_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
d_m_n.GenerateTensorValue(GeneratorTensor_2<DDataType>{-5, 5});
break;
default:
a0_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
a1_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
d_m_n.GenerateTensorValue(GeneratorTensor_3<DDataType>{-0.5, 0.5});
}
DeviceMem a0_device_buf(sizeof(ADataType) * a0_m_k.mDesc.GetElementSpaceSize());
DeviceMem a1_device_buf(sizeof(ADataType) * a1_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
a0_device_buf.ToDevice(a0_m_k.mData.data());
a1_device_buf.ToDevice(a1_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
d_device_buf.ToDevice(d_m_n.mData.data());
e_device_buf.ToDevice(e_m_n_device_result.mData.data());
auto a_element_op = AElementOp{0.2};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{alpha, beta};
// do GEMM
auto device_op = DeviceOpInstance{};
auto invoker = device_op.MakeInvoker();
auto argument =
device_op.MakeArgument(std::array<const void*, 2>{a0_device_buf.GetDeviceBuffer(),
a1_device_buf.GetDeviceBuffer()},
std::array<const void*, 1>{b_device_buf.GetDeviceBuffer()},
std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
std::array<ck::index_t, 2>{StrideA, StrideA},
std::array<ck::index_t, 1>{StrideB},
std::array<ck::index_t, 1>{StrideD},
StrideE,
a_element_op,
b_element_op,
cde_element_op);
if(!device_op.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem");
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
if(do_verification)
{
Tensor<CShuffleDataType> c_m_n({M, N});
Tensor<ADataType> a_m_k({M, K});
for(int m = 0; m < M; ++m)
{
for(int k = 0; k < K; ++k)
{
a_element_op(a_m_k(m, k), a0_m_k(m, k), a1_m_k(m, k));
}
}
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
CShuffleDataType,
AccDataType,
PassThrough,
BElementOp,
PassThrough>;
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument =
ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, PassThrough{}, b_element_op, PassThrough{});
ref_invoker.Run(ref_argument);
for(int m = 0; m < M; ++m)
{
for(int n = 0; n < N; ++n)
{
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d_m_n(m, n));
}
}
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
}
return 0;
}
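The verification path in this example folds AddScale into A before the reference GEMM and then applies AlphaBetaAdd per output element, i.e. E(m,n) = alpha * sum_k scale*(A0(m,k)+A1(m,k)) * B(k,n) + beta * D(m,n). The toy-sized, CK-free sketch below (float instead of half_t, arbitrary values, hypothetical names) spells out how the two element-wise ops compose; it is a sketch of the reference math, not the device kernel.

// Toy reference for one small problem; mirrors the host verification loop above.
#include <cstdio>

int main()
{
    constexpr int M = 2, N = 2, K = 2;
    const float scale = 0.2f, alpha = 1.0f, beta = 1.0f;

    const float a0[M][K] = {{1, 2}, {3, 4}};
    const float a1[M][K] = {{5, 6}, {7, 8}};
    const float b[K][N]  = {{1, 0}, {0, 1}};          // layout ignored in this sketch
    const float d[M][N]  = {{0.5f, 0.5f}, {0.5f, 0.5f}};

    for(int m = 0; m < M; ++m)
        for(int n = 0; n < N; ++n)
        {
            float c = 0.f;
            for(int k = 0; k < K; ++k)
                c += scale * (a0[m][k] + a1[m][k]) * b[k][n]; // AElementOp (AddScale) folded into A
            const float e = alpha * c + beta * d[m][n];       // CDEElementOp (AlphaBetaAdd)
            std::printf("e(%d,%d) = %.3f\n", m, n, e);
        }
    return 0;
}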
@@ -571,7 +571,7 @@ struct DeviceGemmMultipleDScaleAB_Xdl_CShuffle
         using Argument = DeviceOp::Argument;
 
         template <typename T>
-        using has_reduced_amex_scale = decltype(std::declval<T&>().reduced_amex_scale);
+        using has_reduced_amex = decltype(std::declval<T&>().reduced_amax);
 
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
@@ -586,9 +586,11 @@ struct DeviceGemmMultipleDScaleAB_Xdl_CShuffle
             float kern_time = 0;
 
-            AElementwiseOperation a_element_op_ = arg.a_element_op_;
+            auto a_element_op_ = arg.a_element_op_;
+            auto b_element_op_ = arg.b_element_op_;
+            auto cde_element_op_ = arg.cde_element_op_;
 
-            if constexpr(is_detected<has_reduced_amex_scale, AElementwiseOperation>::value)
+            if constexpr(is_detected<has_reduced_amex, AElementwiseOperation>::value)
             {
                 ADataType amax_a;
@@ -608,19 +610,18 @@ struct DeviceGemmMultipleDScaleAB_Xdl_CShuffle
                                     hipMemcpyDeviceToHost,
                                     stream_config.stream_id_));
 
-                static_assert(is_same<decltype(arg.a_element_op_.reduced_amex_scale), float>::value,
+                static_assert(is_same<decltype(arg.a_element_op_.reduced_amax), float>::value,
                               "scale is not float!");
-                a_element_op_.reduced_amex_scale = 1.0 / amax_a;
+                a_element_op_.reduced_amax = amax_a;
                 // std::cout << " amax_a: " << amax_a << std::endl;
+                cde_element_op_.a_reduced_amax = a_element_op_.reduced_amax;
             }
 
-            BElementwiseOperation b_element_op_ = arg.b_element_op_;
-            if constexpr(is_detected<has_reduced_amex_scale, BElementwiseOperation>::value)
+            if constexpr(is_detected<has_reduced_amex, BElementwiseOperation>::value)
             {
-                ADataType amax_b;
+                BDataType amax_b;
 
                 auto reduce_b = Reduce2D<BDataType, BLayout>{};
                 kern_time += reduce_b.Run({arg.KRaw_, arg.NRaw_},
@@ -638,12 +639,13 @@ struct DeviceGemmMultipleDScaleAB_Xdl_CShuffle
                                     hipMemcpyDeviceToHost,
                                     stream_config.stream_id_));
 
-                static_assert(is_same<decltype(arg.b_element_op_.reduced_amex_scale), float>::value,
+                static_assert(is_same<decltype(arg.b_element_op_.reduced_amax), float>::value,
                               "scale is not float!");
-                b_element_op_.reduced_amex_scale = 1.0 / amax_b;
+                b_element_op_.reduced_amax = amax_b;
                 // std::cout << " amax_b: " << amax_b << std::endl;
+                cde_element_op_.b_reduced_amax = b_element_op_.reduced_amax;
             }
 
             const index_t grid_size =
@@ -679,7 +681,7 @@ struct DeviceGemmMultipleDScaleAB_Xdl_CShuffle
                 arg.p_e_grid_,
                 a_element_op_,
                 b_element_op_,
-                arg.cde_element_op_,
+                cde_element_op_,
                 arg.a_grid_desc_ak0_m_ak1_,
                 arg.b_grid_desc_bk0_n_bk1_,
                 arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
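The runner change above also depends on detecting, at compile time, whether an element-wise op actually exposes a reduced_amax member before assigning the device-reduced amax to it. A minimal, CK-free sketch of that detection idiom follows, using std::void_t in place of CK's is_detected; the op types and the configure helper are hypothetical and exist only for illustration.

// Member-detection sketch: only ops that expose reduced_amax receive the reduced value.
#include <cstdio>
#include <type_traits>
#include <utility>

template <typename, typename = void>
struct has_reduced_amax : std::false_type {};

template <typename T>
struct has_reduced_amax<T, std::void_t<decltype(std::declval<T&>().reduced_amax)>>
    : std::true_type {};

struct ScaledOp { float reduced_amax = 1.0f; }; // analogous to i32_to_i8
struct PassOp   {};                             // analogous to PassThrough

template <typename Op>
void configure(Op& op, float amax_from_device)
{
    // Compiles to a no-op for ops without the member.
    if constexpr(has_reduced_amax<Op>::value)
        op.reduced_amax = amax_from_device;
}

int main()
{
    ScaledOp s;
    PassOp p;
    configure(s, 4.7f); // field updated
    configure(p, 4.7f); // discarded branch, nothing to update
    std::printf("ScaledOp amax = %.2f, PassOp has field: %d\n",
                s.reduced_amax, has_reduced_amax<PassOp>::value);
    return 0;
}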