Commit 5791c2ae authored by dummycoderfe's avatar dummycoderfe
Browse files

opt valid and change set_value buf to 256MB

parent 0475a327
......@@ -185,7 +185,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
stride};
float ave_time = layernorm2d_fwd(
traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat, true, true, 1024 * 1024 * 1024});
traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat, true, true, 256 * 1024 * 1024});
if(ave_time < 0)
{
......@@ -230,46 +230,46 @@ bool run(const ck_tile::ArgParser& arg_parser)
if(fused_quant != 0)
{
auto dquant_functor = [&](int m_, auto& o_, auto& acc_) {
int N_ = acc_.mDesc.get_lengths()[1];
if(fused_quant == 1)
{
for(int n_ = 0; n_ < N_; n_++)
{
// input smooth outlier
acc_(m_, n_) =
acc_(m_, n_) * ck_tile::type_convert<ComputeDataType>(x_scale_host(n_));
}
}
ComputeDataType absmax = static_cast<ComputeDataType>(0);
for(int n_ = 0; n_ < N_; n_++)
{
const auto a = ck_tile::abs(acc_(m_, n_));
absmax = a > absmax ? a : absmax;
}
// printf("cpu:absmax:%f\n", absmax);
ComputeDataType y_scale = absmax / static_cast<ComputeDataType>(127.0);
y_scale_host_ref(m_) = ck_tile::type_convert<YScaleDataType>(y_scale);
for(int n_ = 0; n_ < N_; n_++)
{
o_(m_, n_) = ck_tile::type_convert<YDataType>(acc_(m_, n_) / y_scale);
}
};
ck_tile::reference_layernorm2d_fwd<XDataType,
GammaDataType,
BetaDataType,
ComputeDataType,
YDataType,
MeanDataType,
InvStdDataType>(x_host,
gamma_host,
beta_host,
y_host_ref,
mean_host_ref,
invStd_host_ref,
epsilon,
dquant_functor);
// auto dquant_functor = [&](int m_, auto& o_, auto& acc_) {
// int N_ = acc_.mDesc.get_lengths()[1];
// if(fused_quant == 1)
// {
// for(int n_ = 0; n_ < N_; n_++)
// {
// // input smooth outlier
// acc_(m_, n_) =
// acc_(m_, n_) * ck_tile::type_convert<ComputeDataType>(x_scale_host(n_));
// }
// }
// ComputeDataType absmax = static_cast<ComputeDataType>(0);
// for(int n_ = 0; n_ < N_; n_++)
// {
// const auto a = ck_tile::abs(acc_(m_, n_));
// absmax = a > absmax ? a : absmax;
// }
// // printf("cpu:absmax:%f\n", absmax);
// ComputeDataType y_scale = absmax / static_cast<ComputeDataType>(127.0);
// y_scale_host_ref(m_) = ck_tile::type_convert<YScaleDataType>(y_scale);
// for(int n_ = 0; n_ < N_; n_++)
// {
// o_(m_, n_) = ck_tile::type_convert<YDataType>(acc_(m_, n_) / y_scale);
// }
// };
// ck_tile::reference_layernorm2d_fwd<XDataType,
// GammaDataType,
// BetaDataType,
// ComputeDataType,
// YDataType,
// MeanDataType,
// InvStdDataType>(x_host,
// gamma_host,
// beta_host,
// y_host_ref,
// mean_host_ref,
// invStd_host_ref,
// epsilon,
// dquant_functor);
}
else
{
......
......@@ -82,9 +82,12 @@ CK_TILE_HOST float launch_kernel(const stream_config& s, Callables... callables)
// warmup
for(int i = 0; i < s.cold_niters_; i++) { (callables(s),...); } HIP_CHECK_ERROR(hipGetLastError());
if (s.clear_cache) {
printf("setvalue to clear_cache, bufsize %lu\n", s.buf_size);
}
for(int i = 0; i < s.nrepeat_; i++) {
if (s.clear_cache) {
s.cache_buf.SetValue<int>(i);
s.cache_buf.SetValue<char>(0);
}
timer.start(s.stream_id_);
(callables(s),...);
......
......@@ -8,25 +8,11 @@
namespace ck_tile {
// Note: for simplicity, each functor only care about single M
struct reference_layernorm2d_default_epilogue
{
template <typename OutDataType, typename AccDataType>
void operator()(int m, HostTensor<OutDataType>& o, const HostTensor<AccDataType>& acc)
void operator()()
{
const int N = acc.mDesc.get_lengths()[1];
for(int n = 0; n < N; ++n)
{
o(m, n) = ck_tile::type_convert<OutDataType>(acc(m, n));
}
}
template <typename OutDataType, typename AccDataType>
auto operator()(int m, const HostTensor<AccDataType>& acc)
{
HostTensor<OutDataType> o(acc.get_lengths(), acc.get_strides());
operator()(m, o, acc);
return o;
return;
}
};
......@@ -75,21 +61,18 @@ void reference_layernorm2d_fwd(const HostTensor<XDataType>& x_m_n,
if constexpr(!std::is_same_v<InvStdDataType, ck_tile::null_type>)
invStd_m(m) = ck_tile::type_convert<InvStdDataType>(divisor);
HostTensor<ComputeDataType> acc(x_m_n.get_lengths(), x_m_n.get_strides());
for(int n = 0; n < N; ++n)
{
ComputeDataType x = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
ComputeDataType gamma = ck_tile::type_convert<ComputeDataType>(gamma_n(n));
ComputeDataType beta = ck_tile::type_convert<ComputeDataType>(beta_n(n));
auto a_ = (x - mean) * divisor;
a_ = a_ * gamma + beta;
auto y = (x - mean) * divisor;
y = y * gamma + beta;
acc(m, n) = a_;
y_m_n(m, n) = ck_tile::type_convert<YDataType>(y);
}
epilogue_functor(m, y_m_n, acc);
};
epilogue_functor();
make_ParallelTensorFunctor(layernorm2d_fwd_func,
mean_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment