Commit 66d93ae5 authored by rocking

Rename Reduce -> R

parent 63914743
@@ -23,14 +23,14 @@ namespace instance {
 using F32 = float;
 using F16 = ck::half_t;
-using ReducePtrsGlobal = ck::Tuple<F32*, F32*>;
+using RPtrsGlobal = ck::Tuple<F32*, F32*>;
 using Identity = ck::tensor_operation::element_wise::PassThrough;
 using Square = ck::tensor_operation::element_wise::UnarySquare;
 using ReduceInElementOps = ck::Tuple<Identity, Square>;
 using ReduceOutElementOps = ck::Tuple<Identity, Identity>;

 using DeviceGemmReduceNoOpPtr =
-    ck::tensor_operation::device::DeviceGemmReducePtr<0, ReducePtrsGlobal::Size()>;
+    ck::tensor_operation::device::DeviceGemmReducePtr<0, RPtrsGlobal::Size()>;

 void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances(
     std::vector<DeviceGemmReduceNoOpPtr>&);
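
Aside: the `add_device_*_instances` declarations above follow composable_kernel's instance-registry pattern: each such function appends concrete kernel instances to a caller-owned vector, and the profiler then iterates that vector, running and timing every instance. A minimal sketch of the pattern with simplified stand-in types (nothing below is the actual ck API):

```cpp
// Illustrative stand-ins only: DeviceOp, DeviceOpPtr and add_example_instances
// are hypothetical simplifications of DeviceGemmReducePtr and the
// add_device_*_instances functions declared above.
#include <cstdio>
#include <memory>
#include <string>
#include <vector>

struct DeviceOp
{
    std::string name; // a real ck instance carries a full kernel configuration
};

using DeviceOpPtr = std::unique_ptr<DeviceOp>;

// Each registry function appends its instances to the caller's vector.
void add_example_instances(std::vector<DeviceOpPtr>& ops)
{
    ops.push_back(std::make_unique<DeviceOp>(DeviceOp{"instance_a"}));
    ops.push_back(std::make_unique<DeviceOp>(DeviceOp{"instance_b"}));
}

int main()
{
    std::vector<DeviceOpPtr> ops;
    add_example_instances(ops);
    for(const auto& op : ops)
        std::printf("would profile: %s\n", op->name.c_str()); // time each one
}
```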
@@ -55,7 +55,7 @@ namespace profiler {
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
-          typename ReduceDataType,
+          typename RDataType,
           typename ALayout,
           typename BLayout,
           typename CLayout>
@@ -95,16 +95,16 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
     Tensor<CDataType> c_g_m_n_host_result(
         f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> d0_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
+    Tensor<RDataType> d0_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
         {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> d1_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
+    Tensor<RDataType> d1_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
         {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
     Tensor<CDataType> c_g_m_n_device_result(
         f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> d0_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
+    Tensor<RDataType> d0_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
         {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> d1_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
+    Tensor<RDataType> d1_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
         {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));

     std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
@@ -159,7 +159,7 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
                                                           BElementOp,
                                                           CElementOp>;

-        using ReduceAccDataType = ReduceDataType;
+        using RAccDataType = RDataType;

         auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
         auto ref_invoker = ref_batched_gemm.MakeInvoker();
@@ -173,22 +173,22 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
         {
             for(int m = 0; m < M; ++m)
             {
-                auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
-                auto reduce1_acc = reduce1_op.GetIdentityValue<ReduceAccDataType>();
+                auto reduce0_acc = reduce0_op.GetIdentityValue<RAccDataType>();
+                auto reduce1_acc = reduce1_op.GetIdentityValue<RAccDataType>();

                 for(int n = 0; n < N; ++n)
                 {
-                    ReduceAccDataType d0_val =
-                        ck::type_convert<ReduceAccDataType>(c_g_m_n_host_result(batch, m, n));
-                    ReduceAccDataType d1_val;
+                    RAccDataType d0_val =
+                        ck::type_convert<RAccDataType>(c_g_m_n_host_result(batch, m, n));
+                    RAccDataType d1_val;
                     square(d1_val, d0_val);

                     reduce0_op(reduce0_acc, d0_val);
                     reduce1_op(reduce1_acc, d1_val);
                 }

-                d0_g_m_host_result(batch, m) = ck::type_convert<ReduceDataType>(reduce0_acc);
-                d1_g_m_host_result(batch, m) = ck::type_convert<ReduceDataType>(reduce1_acc);
+                d0_g_m_host_result(batch, m) = ck::type_convert<RDataType>(reduce0_acc);
+                d1_g_m_host_result(batch, m) = ck::type_convert<RDataType>(reduce1_acc);
             }
         }
     }
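
For context, the loop in this hunk is the host-side reference that the device kernel is verified against: for each (batch, m) row it accumulates a plain sum (Identity in-element op) and a sum of squares (Square in-element op) over n, matching `ReduceInElementOps = ck::Tuple<Identity, Square>` above. A standalone sketch with plain-C++ stand-ins for the ck tensors and ops (illustrative only):

```cpp
// Plain-C++ sketch of the reference reduction; Tensor, type_convert and the
// reduce ops are replaced by simple equivalents.
#include <cstdio>
#include <vector>

int main()
{
    const int BatchCount = 2, M = 3, N = 4;
    std::vector<float> c(BatchCount * M * N, 1.5f); // stand-in GEMM output [g][m][n]
    std::vector<float> d0(BatchCount * M);          // per-row sum
    std::vector<float> d1(BatchCount * M);          // per-row sum of squares

    for(int g = 0; g < BatchCount; ++g)
        for(int m = 0; m < M; ++m)
        {
            float reduce0_acc = 0.0f; // identity value of the Add reduction
            float reduce1_acc = 0.0f;
            for(int n = 0; n < N; ++n)
            {
                float v = c[(g * M + m) * N + n];
                reduce0_acc += v;     // Identity in-op, then accumulate
                reduce1_acc += v * v; // Square in-op, then accumulate
            }
            d0[g * M + m] = reduce0_acc;
            d1[g * M + m] = reduce1_acc;
        }

    std::printf("d0[0] = %f, d1[0] = %f\n", d0[0], d1[0]); // 6.0 and 9.0
}
```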
@@ -196,10 +196,8 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
     DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpace());
     DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpace());
     DeviceMem c_device_buf(sizeof(CDataType) * c_g_m_n_device_result.mDesc.GetElementSpace());
-    DeviceMem reduce0_device_buf(sizeof(ReduceDataType) *
-                                 d0_g_m_device_result.mDesc.GetElementSpace());
-    DeviceMem reduce1_device_buf(sizeof(ReduceDataType) *
-                                 d1_g_m_device_result.mDesc.GetElementSpace());
+    DeviceMem reduce0_device_buf(sizeof(RDataType) * d0_g_m_device_result.mDesc.GetElementSpace());
+    DeviceMem reduce1_device_buf(sizeof(RDataType) * d1_g_m_device_result.mDesc.GetElementSpace());

     std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(),
                                       reduce1_device_buf.GetDeviceBuffer()};
...
@@ -23,7 +23,7 @@ namespace instance {
 using F32 = float;
 using F16 = ck::half_t;
-using ReducePtrsGlobal = ck::Tuple<F32*, F32*>;
+using RPtrsGlobal = ck::Tuple<F32*, F32*>;
 using Div = ck::tensor_operation::element_wise::UnaryDivide;
 using Identity = ck::tensor_operation::element_wise::PassThrough;
 using Square = ck::tensor_operation::element_wise::UnarySquare;
@@ -31,7 +31,7 @@ using ReduceInElementOps = ck::Tuple<Identity, Square>;
 using ReduceOutElementOps = ck::Tuple<Div, Div>;

 using DeviceGemmBiasAddReduceNoOpPtr =
-    ck::tensor_operation::device::DeviceGemmReducePtr<1, ReducePtrsGlobal::Size()>;
+    ck::tensor_operation::device::DeviceGemmReducePtr<1, RPtrsGlobal::Size()>;

 void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_kn_mn_instances(
     std::vector<DeviceGemmBiasAddReduceNoOpPtr>&);
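
Note the difference from the batched profiler above: here `ReduceOutElementOps` is `ck::Tuple<Div, Div>`, so each accumulated sum is divided on output, presumably by N, which would turn the two results into the mean and mean-of-squares that the `mean_squaremean` instance name advertises. A tiny numeric sketch of that semantics (assumed divisor N; plain C++, not the ck API):

```cpp
// Sum and sum-of-squares followed by a UnaryDivide-style out-op.
#include <cstdio>

int main()
{
    const int N = 4;
    float row[N] = {1.0f, 2.0f, 3.0f, 4.0f}; // one output row of the GEMM

    float sum = 0.0f, sumsq = 0.0f;
    for(int n = 0; n < N; ++n)
    {
        sum += row[n];            // Identity in-op + Add
        sumsq += row[n] * row[n]; // Square in-op + Add
    }

    // Div out-op: divide each accumulator by N (assumed divisor).
    float mean   = sum / N;   // 2.5
    float meansq = sumsq / N; // 7.5

    std::printf("mean = %f, mean of squares = %f\n", mean, meansq);
}
```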
@@ -58,7 +58,7 @@ template <typename ADataType,
           typename CDataType,
           typename BiasDataType,
           typename D0DataType,
-          typename ReduceDataType,
+          typename RDataType,
           typename ALayout,
           typename BLayout,
           typename CLayout>
@@ -99,15 +99,15 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
     Tensor<BiasDataType> bias_n(f_host_tensor_descriptor1d(N, 1));
     Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_host_result(
+    Tensor<RDataType> reduce0_m_host_result(
         HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> reduce1_m_host_result(
+    Tensor<RDataType> reduce1_m_host_result(
         HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
     Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_device_result(
+    Tensor<RDataType> reduce0_m_device_result(
         HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> reduce1_m_device_result(
+    Tensor<RDataType> reduce1_m_device_result(
         HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));

     std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
@@ -166,12 +166,12 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
         using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                 BDataType,
                                                                                 CDataType,
-                                                                                ReduceDataType,
+                                                                                RDataType,
                                                                                 AElementOp,
                                                                                 BElementOp,
                                                                                 CElementOp>;

-        using ReduceAccDataType = ReduceDataType;
+        using RAccDataType = RDataType;

         auto ref_gemm = ReferenceGemmInstance{};
         auto ref_invoker = ref_gemm.MakeInvoker();
@@ -184,10 +184,10 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
         for(int m = 0; m < M; ++m)
             for(int n = 0; n < N; ++n)
             {
-                ReduceAccDataType acc = static_cast<ReduceAccDataType>(c_m_n_host_result(m, n)) +
-                                        static_cast<ReduceAccDataType>(bias_n(n));
+                RAccDataType acc = static_cast<RAccDataType>(c_m_n_host_result(m, n)) +
+                                   static_cast<RAccDataType>(bias_n(n));

-                ReduceAccDataType d0 = static_cast<ReduceAccDataType>(d0_m_n(m, n));
+                RAccDataType d0 = static_cast<RAccDataType>(d0_m_n(m, n));

                 c_element_op(acc, acc);
                 d0_element_op(d0, d0);
                 acc += d0;
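
The epilogue verified by this hunk is, per element, acc = c_element_op(c + bias) + d0_element_op(d0); the row reductions then run on that result. A minimal standalone sketch, treating both element ops as pass-through for simplicity (illustrative names, not the ck API):

```cpp
// Bias add plus residual add, with both element ops taken as pass-through.
#include <cstdio>

int main()
{
    const int M = 2, N = 3;
    float c[M][N]  = {{1, 2, 3}, {4, 5, 6}}; // stand-in GEMM output
    float bias[N]  = {0.5f, 0.5f, 0.5f};     // per-column bias
    float d0[M][N] = {{1, 1, 1}, {1, 1, 1}}; // residual tensor
    float out[M][N];

    for(int m = 0; m < M; ++m)
        for(int n = 0; n < N; ++n)
        {
            float acc = c[m][n] + bias[n]; // bias add
            // c_element_op(acc, acc) and d0_element_op(d0, d0) would apply here.
            acc += d0[m][n];               // residual add
            out[m][n] = acc;
        }

    std::printf("out[0][0] = %f\n", out[0][0]); // 2.5
}
```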
@@ -196,14 +196,13 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
         for(int m = 0; m < M; ++m)
         {
-            auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
-            auto reduce1_acc = reduce1_op.GetIdentityValue<ReduceAccDataType>();
+            auto reduce0_acc = reduce0_op.GetIdentityValue<RAccDataType>();
+            auto reduce1_acc = reduce1_op.GetIdentityValue<RAccDataType>();

             for(int n = 0; n < N; ++n)
             {
-                ReduceAccDataType d0_val =
-                    ck::type_convert<ReduceAccDataType>(c_m_n_host_result(m, n));
-                ReduceAccDataType d1_val;
+                RAccDataType d0_val = ck::type_convert<RAccDataType>(c_m_n_host_result(m, n));
+                RAccDataType d1_val;
                 square(d1_val, d0_val);

                 reduce0_op(reduce0_acc, d0_val);
@@ -212,8 +211,8 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
             div(reduce0_acc, reduce0_acc);
             div(reduce1_acc, reduce1_acc);

-            reduce0_m_host_result(m) = ck::type_convert<ReduceDataType>(reduce0_acc);
-            reduce1_m_host_result(m) = ck::type_convert<ReduceDataType>(reduce1_acc);
+            reduce0_m_host_result(m) = ck::type_convert<RDataType>(reduce0_acc);
+            reduce1_m_host_result(m) = ck::type_convert<RDataType>(reduce1_acc);
         }
     }
@@ -222,9 +221,9 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
     DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
     DeviceMem bias_device_buf(sizeof(BiasDataType) * bias_n.mDesc.GetElementSpace());
     DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpace());
-    DeviceMem reduce0_device_buf(sizeof(ReduceDataType) *
+    DeviceMem reduce0_device_buf(sizeof(RDataType) *
                                  reduce0_m_device_result.mDesc.GetElementSpace());
-    DeviceMem reduce1_device_buf(sizeof(ReduceDataType) *
+    DeviceMem reduce1_device_buf(sizeof(RDataType) *
                                  reduce1_m_device_result.mDesc.GetElementSpace());

     std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(),
@@ -323,8 +322,8 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
         std::size_t num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
                                sizeof(CDataType) * M * N + sizeof(BiasDataType) * M * N +
-                               sizeof(D0DataType) * M * N + sizeof(ReduceDataType) * M +
-                               sizeof(ReduceDataType) * M;
+                               sizeof(D0DataType) * M * N + sizeof(RDataType) * M +
+                               sizeof(RDataType) * M;

         float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
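
The rate arithmetic on the last line works because `ave_time` appears to be in milliseconds, as ck's profilers report it, so `flop / 1.E9 / ave_time` is GFLOP per millisecond, which equals TFLOP/s. A quick standalone check, assuming the standard 2·M·N·K GEMM flop count (the `flop` definition sits outside this hunk):

```cpp
// Sanity check: GFLOP per millisecond == TFLOP/s.
#include <cstddef>
#include <cstdio>

int main()
{
    const std::size_t M = 4096, N = 4096, K = 4096;
    const std::size_t flop = 2ull * M * N * K; // assumed: one mul + one add per MAC
    const float ave_time = 1.0f;               // milliseconds

    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
    std::printf("%.3f TFLOPS\n", tflops); // ~137.439
}
```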
...
@@ -23,7 +23,7 @@ namespace instance {
 using F32 = float;
 using F16 = ck::half_t;
-using ReducePtrsGlobal = ck::Tuple<F32*, F32*>;
+using RPtrsGlobal = ck::Tuple<F32*, F32*>;
 using Div = ck::tensor_operation::element_wise::UnaryDivide;
 using Identity = ck::tensor_operation::element_wise::PassThrough;
 using Square = ck::tensor_operation::element_wise::UnarySquare;
@@ -31,7 +31,7 @@ using ReduceInElementOps = ck::Tuple<Identity, Square>;
 using ReduceOutElementOps = ck::Tuple<Div, Div>;

 using DeviceGemmReduceNoOpPtr =
-    ck::tensor_operation::device::DeviceGemmReducePtr<0, ReducePtrsGlobal::Size()>;
+    ck::tensor_operation::device::DeviceGemmReducePtr<0, RPtrsGlobal::Size()>;

 void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instances(
     std::vector<DeviceGemmReduceNoOpPtr>&);
@@ -56,7 +56,7 @@ namespace profiler {
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
-          typename ReduceDataType,
+          typename RDataType,
           typename ALayout,
           typename BLayout,
           typename CLayout>
@@ -91,15 +91,15 @@ bool profile_gemm_reduce_impl(int do_verification,
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_host_result(
+    Tensor<RDataType> reduce0_m_host_result(
         HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> reduce1_m_host_result(
+    Tensor<RDataType> reduce1_m_host_result(
         HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
     Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_device_result(
+    Tensor<RDataType> reduce0_m_device_result(
         HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> reduce1_m_device_result(
+    Tensor<RDataType> reduce1_m_device_result(
         HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));

     std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
@@ -151,12 +151,12 @@ bool profile_gemm_reduce_impl(int do_verification,
         using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                 BDataType,
                                                                                 CDataType,
-                                                                                ReduceDataType,
+                                                                                RDataType,
                                                                                 AElementOp,
                                                                                 BElementOp,
                                                                                 CElementOp>;

-        using ReduceAccDataType = ReduceDataType;
+        using RAccDataType = RDataType;

         auto ref_gemm = ReferenceGemmInstance{};
         auto ref_invoker = ref_gemm.MakeInvoker();
@@ -168,14 +168,13 @@ bool profile_gemm_reduce_impl(int do_verification,
         for(int m = 0; m < M; ++m)
         {
-            auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
-            auto reduce1_acc = reduce1_op.GetIdentityValue<ReduceAccDataType>();
+            auto reduce0_acc = reduce0_op.GetIdentityValue<RAccDataType>();
+            auto reduce1_acc = reduce1_op.GetIdentityValue<RAccDataType>();

             for(int n = 0; n < N; ++n)
             {
-                ReduceAccDataType d0_val =
-                    ck::type_convert<ReduceAccDataType>(c_m_n_host_result(m, n));
-                ReduceAccDataType d1_val;
+                RAccDataType d0_val = ck::type_convert<RAccDataType>(c_m_n_host_result(m, n));
+                RAccDataType d1_val;
                 square(d1_val, d0_val);

                 reduce0_op(reduce0_acc, d0_val);
@@ -184,17 +183,17 @@ bool profile_gemm_reduce_impl(int do_verification,
             div(reduce0_acc, reduce0_acc);
             div(reduce1_acc, reduce1_acc);

-            reduce0_m_host_result(m) = ck::type_convert<ReduceDataType>(reduce0_acc);
-            reduce1_m_host_result(m) = ck::type_convert<ReduceDataType>(reduce1_acc);
+            reduce0_m_host_result(m) = ck::type_convert<RDataType>(reduce0_acc);
+            reduce1_m_host_result(m) = ck::type_convert<RDataType>(reduce1_acc);
         }
     }

     DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
     DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
     DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
-    DeviceMem reduce0_device_buf(sizeof(ReduceDataType) *
+    DeviceMem reduce0_device_buf(sizeof(RDataType) *
                                  reduce0_m_device_result.mDesc.GetElementSpace());
-    DeviceMem reduce1_device_buf(sizeof(ReduceDataType) *
+    DeviceMem reduce1_device_buf(sizeof(RDataType) *
                                  reduce1_m_device_result.mDesc.GetElementSpace());

     std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(),
...