Commit 4899c20f authored by Chao Liu's avatar Chao Liu
Browse files

update profiler

parent 5729c23c
...@@ -17,17 +17,25 @@ namespace tensor_operation { ...@@ -17,17 +17,25 @@ namespace tensor_operation {
namespace device { namespace device {
namespace instance { namespace instance {
void add_device_layernorm_f16_rank2_instances( // FP16
std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, PassThrough, 2, 1>>&); void add_device_layernorm_rank_2_1_f16_instances(
std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, PassThrough, 2, 1>>>&);
void add_device_layernorm_f16_rank4_instances( void add_device_layernorm_rank_4_3_f16_instances(
std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, PassThrough, 4, 3>>&); std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, PassThrough, 4, 3>>>&);
void add_device_layernorm_f32_rank2_instances( void add_device_layernorm_rank_5_3_f16_instances(
std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, PassThrough, 2, 1>>&); std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, PassThrough, 5, 3>>>&);
void add_device_layernorm_f32_rank4_instances( // FP32
std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, PassThrough, 4, 3>>&); void add_device_layernorm_rank_2_1_f32_instances(
std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, PassThrough, 2, 1>>>&);
void add_device_layernorm_rank_4_3_f32_instances(
std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, PassThrough, 4, 3>>>&);
void add_device_layernorm_rank_5_3_f32_instances(
std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, PassThrough, 5, 3>>>&);
template <typename XDataType, template <typename XDataType,
typename GammaDataType, typename GammaDataType,
...@@ -62,17 +70,33 @@ struct DeviceOperationInstanceFactory< ...@@ -62,17 +70,33 @@ struct DeviceOperationInstanceFactory<
is_same_v<BetaDataType, F16> && is_same_v<YDataType, F16>) is_same_v<BetaDataType, F16> && is_same_v<YDataType, F16>)
{ {
if constexpr(Rank == 2 && NumReduceDim == 1) if constexpr(Rank == 2 && NumReduceDim == 1)
add_device_layernorm_f16_rank2_instances(op_ptrs); {
add_device_layernorm_rank_2_1_f16_instances(op_ptrs);
}
else if constexpr(Rank == 4 && NumReduceDim == 3) else if constexpr(Rank == 4 && NumReduceDim == 3)
add_device_layernorm_f16_rank4_instances(op_ptrs); {
add_device_layernorm_rank_4_3_f16_instances(op_ptrs);
}
else if constexpr(Rank == 5 && NumReduceDim == 3)
{
add_device_layernorm_rank_5_3_f16_instances(op_ptrs);
}
} }
else if constexpr(is_same_v<XDataType, F32> && is_same_v<GammaDataType, F32> && else if constexpr(is_same_v<XDataType, F32> && is_same_v<GammaDataType, F32> &&
is_same_v<BetaDataType, F32> && is_same_v<YDataType, F32>) is_same_v<BetaDataType, F32> && is_same_v<YDataType, F32>)
{ {
if constexpr(Rank == 2 && NumReduceDim == 1) if constexpr(Rank == 2 && NumReduceDim == 1)
add_device_layernorm_f32_rank2_instances(op_ptrs); {
add_device_layernorm_rank_2_1_f32_instances(op_ptrs);
}
else if constexpr(Rank == 4 && NumReduceDim == 3) else if constexpr(Rank == 4 && NumReduceDim == 3)
add_device_layernorm_f32_rank4_instances(op_ptrs); {
add_device_layernorm_rank_4_3_f32_instances(op_ptrs);
}
else if constexpr(Rank == 5 && NumReduceDim == 3)
{
add_device_layernorm_rank_5_3_f32_instances(op_ptrs);
}
} }
return op_ptrs; return op_ptrs;
......
...@@ -15,8 +15,7 @@ namespace instance { ...@@ -15,8 +15,7 @@ namespace instance {
using F16 = ck::half_t; using F16 = ck::half_t;
using F32 = float; using F32 = float;
using Pass = ck::tensor_operation::element_wise::PassThrough; using Pass = ck::tensor_operation::element_wise::PassThrough;
using Sigmoid = ck::tensor_operation::element_wise::Sigmoid;
template <typename OutElementwise, index_t Rank, index_t Reduce> template <typename OutElementwise, index_t Rank, index_t Reduce>
using device_layernorm_f16_instances = std::tuple< using device_layernorm_f16_instances = std::tuple<
...@@ -36,22 +35,22 @@ using device_layernorm_f16_instances = std::tuple< ...@@ -36,22 +35,22 @@ using device_layernorm_f16_instances = std::tuple<
// clang-format on // clang-format on
>; >;
void add_device_layernorm_f16_rank2_instances( void add_device_layernorm_rank_2_1_f16_instances(
std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, Pass, 2, 1>>& instances) std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, Pass, 2, 1>>>& instances)
{ {
add_device_operation_instances(instances, device_layernorm_f16_instances<Pass, 2, 1>{}); add_device_operation_instances(instances, device_layernorm_f16_instances<Pass, 2, 1>{});
} }
void add_device_layernorm_f16_rank4_instances( void add_device_layernorm_rank_4_3_f16_instances(
std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, Pass, 4, 3>>& instances) std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, Pass, 4, 3>>>& instances)
{ {
add_device_operation_instances(instances, device_layernorm_f16_instances<Pass, 4, 3>{}); add_device_operation_instances(instances, device_layernorm_f16_instances<Pass, 4, 3>{});
} }
void add_device_groupnorm_f16_instances( void add_device_layernorm_rank_5_3_f16_instances(
std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, Sigmoid, 5, 3>>& instances) std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, Pass, 5, 3>>>& instances)
{ {
add_device_operation_instances(instances, device_layernorm_f16_instances<Sigmoid, 5, 3>{}); add_device_operation_instances(instances, device_layernorm_f16_instances<Pass, 5, 3>{});
} }
} // namespace instance } // namespace instance
......
...@@ -14,8 +14,7 @@ namespace instance { ...@@ -14,8 +14,7 @@ namespace instance {
using F32 = float; using F32 = float;
using Pass = ck::tensor_operation::element_wise::PassThrough; using Pass = ck::tensor_operation::element_wise::PassThrough;
using Sigmoid = ck::tensor_operation::element_wise::Sigmoid;
template <typename OutElementwise, index_t Rank, index_t Reduce> template <typename OutElementwise, index_t Rank, index_t Reduce>
using device_layernorm_f32_instances = std::tuple< using device_layernorm_f32_instances = std::tuple<
...@@ -34,22 +33,22 @@ using device_layernorm_f32_instances = std::tuple< ...@@ -34,22 +33,22 @@ using device_layernorm_f32_instances = std::tuple<
// clang-format on // clang-format on
>; >;
void add_device_layernorm_f32_rank2_instances( void add_device_layernorm_rank_2_1_f32_instances(
std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, Pass, 2, 1>>& instances) std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, Pass, 2, 1>>>& instances)
{ {
add_device_operation_instances(instances, device_layernorm_f32_instances<Pass, 2, 1>{}); add_device_operation_instances(instances, device_layernorm_f32_instances<Pass, 2, 1>{});
} }
void add_device_layernorm_f32_rank4_instances( void add_device_layernorm_rank_4_3_f32_instances(
std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, Pass, 4, 3>>& instances) std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, Pass, 4, 3>>>& instances)
{ {
add_device_operation_instances(instances, device_layernorm_f32_instances<Pass, 4, 3>{}); add_device_operation_instances(instances, device_layernorm_f32_instances<Pass, 4, 3>{});
} }
void add_device_groupnorm_f32_instances( void add_device_layernorm_rank_5_3_f32_instances(
std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, Sigmoid, 5, 3>>& instances) std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, Pass, 5, 3>>>& instances)
{ {
add_device_operation_instances(instances, device_layernorm_f32_instances<Sigmoid, 5, 3>{}); add_device_operation_instances(instances, device_layernorm_f32_instances<Pass, 5, 3>{});
} }
} // namespace instance } // namespace instance
......
...@@ -56,4 +56,3 @@ target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance) ...@@ -56,4 +56,3 @@ target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance) target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
target_link_libraries(ckProfiler PRIVATE device_normalization_instance) target_link_libraries(ckProfiler PRIVATE device_normalization_instance)
target_link_libraries(ckProfiler PRIVATE device_reduce_instance) target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
...@@ -6,8 +6,8 @@ ...@@ -6,8 +6,8 @@
#include <iomanip> #include <iomanip>
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "profiler/include/data_type_enum.hpp"
#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp" #include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
...@@ -15,35 +15,9 @@ ...@@ -15,35 +15,9 @@
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using F16 = ck::half_t;
using F32 = float;
using Sigmoid = ck::tensor_operation::element_wise::Sigmoid;
void add_device_groupnorm_f16_instances(
std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, Sigmoid, 5, 3>>&);
void add_device_groupnorm_f32_instances(
std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, Sigmoid, 5, 3>>&);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
namespace ck { namespace ck {
namespace profiler { namespace profiler {
enum struct ElementwiseOpEnum
{
ePassthrough = 0,
eSigmoid = 1
};
template <typename XDataType, template <typename XDataType,
typename GammaDataType, typename GammaDataType,
typename BetaDataType, typename BetaDataType,
...@@ -53,12 +27,9 @@ bool profile_groupnorm_impl(int do_verification, ...@@ -53,12 +27,9 @@ bool profile_groupnorm_impl(int do_verification,
int init_method, int init_method,
bool do_log, bool do_log,
bool time_kernel, bool time_kernel,
std::vector<index_t> length, std::vector<index_t> length)
ElementwiseOpEnum OutelementwiseOp)
{ {
using F16 = ck::half_t; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using F32 = float;
using Sigmoid = ck::tensor_operation::element_wise::Sigmoid;
if(length.size() != 5) if(length.size() != 5)
return false; return false;
...@@ -104,35 +75,21 @@ bool profile_groupnorm_impl(int do_verification, ...@@ -104,35 +75,21 @@ bool profile_groupnorm_impl(int do_verification,
beta_dev.ToDevice(beta.mData.data()); beta_dev.ToDevice(beta.mData.data());
// add device normalization instances // add device normalization instances
std::vector<tensor_operation::device::DeviceLayernormPtr<XDataType, using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
GammaDataType, GammaDataType,
BetaDataType, BetaDataType,
AccDataType, AccDataType,
YDataType, YDataType,
Sigmoid, PassThrough,
5, 5,
3>> 3>;
instances;
// get device op instances
if constexpr(is_same<XDataType, F16>::value && is_same<GammaDataType, F16>::value && const auto instance_ptrs =
is_same<BetaDataType, F16>::value && is_same<YDataType, F16>::value && ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
is_same<AccDataType, F32>::value) DeviceOp>::GetInstances();
{
if(OutelementwiseOp == ElementwiseOpEnum::eSigmoid) std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
tensor_operation::device::instance::add_device_groupnorm_f16_instances(instances);
}
else if constexpr(is_same<XDataType, F32>::value && is_same<GammaDataType, F32>::value &&
is_same<BetaDataType, F32>::value && is_same<YDataType, F32>::value &&
is_same<AccDataType, F32>::value)
{
if(OutelementwiseOp == ElementwiseOpEnum::eSigmoid)
tensor_operation::device::instance::add_device_groupnorm_f32_instances(instances);
}
if(instances.size() <= 0)
{
throw std::runtime_error("wrong! no device normalization instance found");
}
std::string best_instance_name; std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max(); float best_avg_time = std::numeric_limits<float>::max();
...@@ -140,25 +97,22 @@ bool profile_groupnorm_impl(int do_verification, ...@@ -140,25 +97,22 @@ bool profile_groupnorm_impl(int do_verification,
if(do_verification) if(do_verification)
{ {
if(OutelementwiseOp == ElementwiseOpEnum::eSigmoid) using ReferenceInstance = ck::tensor_operation::host::ReferenceGroupnorm<XDataType,
{ GammaDataType,
using ReferenceInstance = ck::tensor_operation::host::ReferenceGroupnorm<XDataType, BetaDataType,
GammaDataType, YDataType,
BetaDataType, AccDataType,
YDataType, PassThrough>;
AccDataType,
Sigmoid>; ReferenceInstance ref;
auto ref_argument = ref.MakeArgument(x, gamma, beta, host_y, PassThrough{}, length, 1e-6);
ReferenceInstance ref; auto ref_invoker = ref.MakeInvoker();
auto ref_argument = ref.MakeArgument(x, gamma, beta, host_y, Sigmoid{}, length, 1e-6); ref_invoker.Run(ref_argument);
auto ref_invoker = ref.MakeInvoker();
ref_invoker.Run(ref_argument);
}
} }
int num_kernel = 0; int num_kernel = 0;
for(auto& inst_ptr : instances) for(auto& inst_ptr : instance_ptrs)
{ {
auto argument_ptr = inst_ptr->MakeArgumentPointer( auto argument_ptr = inst_ptr->MakeArgumentPointer(
length, length,
...@@ -172,7 +126,7 @@ bool profile_groupnorm_impl(int do_verification, ...@@ -172,7 +126,7 @@ bool profile_groupnorm_impl(int do_verification,
gamma_dev.GetDeviceBuffer(), gamma_dev.GetDeviceBuffer(),
beta_dev.GetDeviceBuffer(), beta_dev.GetDeviceBuffer(),
y_dev.GetDeviceBuffer(), y_dev.GetDeviceBuffer(),
Sigmoid{}); PassThrough{});
if(inst_ptr->IsSupportedArgument(argument_ptr.get())) if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
......
...@@ -6,8 +6,8 @@ ...@@ -6,8 +6,8 @@
#include <iomanip> #include <iomanip>
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "profiler/include/data_type_enum.hpp"
#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp" #include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
...@@ -15,26 +15,6 @@ ...@@ -15,26 +15,6 @@
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using F16 = ck::half_t;
using F32 = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
void add_device_layernorm_f16_rank2_instances(
std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, PassThrough, 2, 1>>&);
void add_device_layernorm_f32_rank2_instances(
std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, PassThrough, 2, 1>>&);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
namespace ck { namespace ck {
namespace profiler { namespace profiler {
...@@ -53,8 +33,6 @@ void profile_layernorm_impl(int do_verification, ...@@ -53,8 +33,6 @@ void profile_layernorm_impl(int do_verification,
std::vector<index_t> strideGamma, std::vector<index_t> strideGamma,
std::vector<index_t> strideBeta) std::vector<index_t> strideBeta)
{ {
using F16 = ck::half_t;
using F32 = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
if(length.size() < 2) if(length.size() < 2)
...@@ -103,37 +81,24 @@ void profile_layernorm_impl(int do_verification, ...@@ -103,37 +81,24 @@ void profile_layernorm_impl(int do_verification,
gamma_dev.ToDevice(gamma.mData.data()); gamma_dev.ToDevice(gamma.mData.data());
beta_dev.ToDevice(beta.mData.data()); beta_dev.ToDevice(beta.mData.data());
// add device normalization instances
constexpr int NumReduceDim = Rank - 1; constexpr int NumReduceDim = Rank - 1;
std::vector<tensor_operation::device::DeviceLayernormPtr<XDataType,
GammaDataType,
BetaDataType,
AccDataType,
YDataType,
PassThrough,
Rank,
NumReduceDim>>
instances;
if constexpr(is_same<XDataType, F16>::value && is_same<GammaDataType, F16>::value &&
is_same<BetaDataType, F16>::value && is_same<YDataType, F16>::value &&
is_same<AccDataType, F32>::value)
{
if(length.size() == 2)
tensor_operation::device::instance::add_device_layernorm_f16_rank2_instances(instances);
}
else if constexpr(is_same<XDataType, F32>::value && is_same<GammaDataType, F32>::value &&
is_same<BetaDataType, F32>::value && is_same<YDataType, F32>::value &&
is_same<AccDataType, F32>::value)
{
if(length.size() == 2)
tensor_operation::device::instance::add_device_layernorm_f32_rank2_instances(instances);
}
if(instances.size() <= 0) // add device normalization instances
{ using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
throw std::runtime_error("wrong! no device normalization instance found"); GammaDataType,
} BetaDataType,
AccDataType,
YDataType,
PassThrough,
Rank,
NumReduceDim>;
// get device op instances
const auto instance_ptrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
std::string best_instance_name; std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max(); float best_avg_time = std::numeric_limits<float>::max();
...@@ -157,7 +122,7 @@ void profile_layernorm_impl(int do_verification, ...@@ -157,7 +122,7 @@ void profile_layernorm_impl(int do_verification,
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
} }
for(auto& inst_ptr : instances) for(auto& inst_ptr : instance_ptrs)
{ {
auto argument_ptr = inst_ptr->MakeArgumentPointer(length, auto argument_ptr = inst_ptr->MakeArgumentPointer(length,
strideXY, strideXY,
...@@ -175,9 +140,9 @@ void profile_layernorm_impl(int do_verification, ...@@ -175,9 +140,9 @@ void profile_layernorm_impl(int do_verification,
if(!inst_ptr->IsSupportedArgument(argument_ptr.get())) if(!inst_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: "; std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
LogRange(std::cout << "input lengths = [", length, "], ") << std::endl; LogRange(std::cout << "input lengths = ", length, ", ") << std::endl;
return; continue;
} }
auto invoker_ptr = inst_ptr->MakeInvokerPointer(); auto invoker_ptr = inst_ptr->MakeInvokerPointer();
......
...@@ -5,10 +5,10 @@ ...@@ -5,10 +5,10 @@
#include <vector> #include <vector>
#include <unordered_map> #include <unordered_map>
#include "profiler/include/data_type_enum.hpp"
#include "profiler/include/profile_groupnorm_impl.hpp" #include "profiler/include/profile_groupnorm_impl.hpp"
using ck::index_t; using ck::index_t;
using ck::profiler::ElementwiseOpEnum;
struct GroupnormArgParser struct GroupnormArgParser
{ {
...@@ -50,23 +50,21 @@ void print_help_groupnorm() ...@@ -50,23 +50,21 @@ void print_help_groupnorm()
<< "arg3: verification (0: no; 1: yes)\n" << "arg3: verification (0: no; 1: yes)\n"
<< "arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n" << "arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n"
<< "arg5: print tensor value (0: no; 1: yes)\n" << "arg5: print tensor value (0: no; 1: yes)\n"
<< "arg6: time kernel (0=n0, 1=yes)\n" << "arg6: time kernel (0=no, 1=yes)\n"
<< "arg7: out elementwise op (0=passthrough, 1=sigmoid)\n"
<< "--length: tensor extents (e.g, --length 1 16 16 32 40) \n" << "--length: tensor extents (e.g, --length 1 16 16 32 40) \n"
<< std::endl; << std::endl;
} }
int profile_groupnorm(int argc, char* argv[]) int profile_groupnorm(int argc, char* argv[])
{ {
ck::DataTypeEnum data_type = ck::DataTypeEnum::Half; ck::DataTypeEnum data_type = ck::DataTypeEnum::Half;
bool do_verification = false; bool do_verification = false;
int init_method = 0; int init_method = 0;
bool do_log = 0; bool do_log = 0;
bool time_kernel = 1; bool time_kernel = 1;
ElementwiseOpEnum outElementwiseOp = ElementwiseOpEnum::eSigmoid; std::vector<index_t> length = {64, 16, 16, 32, 40};
std::vector<index_t> length = {1, 16, 16, 32, 40};
if(argc != 1 && argc != 14) if(argc != 1 && argc != 13)
{ {
print_help_groupnorm(); print_help_groupnorm();
return 0; return 0;
...@@ -74,12 +72,11 @@ int profile_groupnorm(int argc, char* argv[]) ...@@ -74,12 +72,11 @@ int profile_groupnorm(int argc, char* argv[])
if(argc == 14) if(argc == 14)
{ {
data_type = static_cast<ck::DataTypeEnum>(std::stoi(argv[2])); data_type = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
do_verification = std::stoi(argv[3]); do_verification = std::stoi(argv[3]);
init_method = std::stoi(argv[4]); init_method = std::stoi(argv[4]);
do_log = std::stoi(argv[5]); do_log = std::stoi(argv[5]);
time_kernel = std::stoi(argv[6]); time_kernel = std::stoi(argv[6]);
outElementwiseOp = static_cast<ElementwiseOpEnum>(std::stoi(argv[7]));
// parse the long options // parse the long options
GroupnormArgParser arg_parser; GroupnormArgParser arg_parser;
...@@ -90,10 +87,15 @@ int profile_groupnorm(int argc, char* argv[]) ...@@ -90,10 +87,15 @@ int profile_groupnorm(int argc, char* argv[])
using F16 = ck::half_t; using F16 = ck::half_t;
using F32 = float; using F32 = float;
if(data_type == ck::DataTypeEnum::Half && outElementwiseOp == ElementwiseOpEnum::eSigmoid) if(data_type == ck::DataTypeEnum::Float)
{
ck::profiler::profile_groupnorm_impl<F32, F32, F32, F32, F32>(
do_verification, init_method, do_log, time_kernel, length);
}
else if(data_type == ck::DataTypeEnum::Half)
{ {
ck::profiler::profile_groupnorm_impl<F16, F16, F16, F32, F16>( ck::profiler::profile_groupnorm_impl<F16, F16, F16, F32, F16>(
do_verification, init_method, do_log, time_kernel, length, outElementwiseOp); do_verification, init_method, do_log, time_kernel, length);
} }
else else
{ {
...@@ -102,10 +104,3 @@ int profile_groupnorm(int argc, char* argv[]) ...@@ -102,10 +104,3 @@ int profile_groupnorm(int argc, char* argv[])
return 0; return 0;
} }
// hijack main() for quick debugging
// int main(int argc, char* argv[])
// {
// profile_groupnorm(argc, argv);
// return 0;
// }
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#include <vector> #include <vector>
#include <unordered_map> #include <unordered_map>
#include "profiler/include/data_type_enum.hpp"
#include "profiler/include/profile_layernorm_impl.hpp" #include "profiler/include/profile_layernorm_impl.hpp"
using ck::index_t; using ck::index_t;
...@@ -49,7 +50,7 @@ void print_help_layernorm() ...@@ -49,7 +50,7 @@ void print_help_layernorm()
<< "arg2: verification (0: no; 1: yes)\n" << "arg2: verification (0: no; 1: yes)\n"
<< "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n" << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
<< "arg4: print tensor value (0: no; 1: yes)\n" << "arg4: print tensor value (0: no; 1: yes)\n"
<< "arg5: time kernel (0=n0, 1=yes)\n" << "arg5: time kernel (0=no, 1=yes)\n"
<< "--length: tensor extents (e.g, --length 1024 1024) \n" << "--length: tensor extents (e.g, --length 1024 1024) \n"
<< "--strideXY: tensor strides (e.g, --strideXY 1024 1)\n" << "--strideXY: tensor strides (e.g, --strideXY 1024 1)\n"
<< "--strideGamma: tensor strides (e.g, --strideGamma 1)\n" << "--strideGamma: tensor strides (e.g, --strideGamma 1)\n"
...@@ -114,10 +115,3 @@ int profile_layernorm(int argc, char* argv[]) ...@@ -114,10 +115,3 @@ int profile_layernorm(int argc, char* argv[])
return 0; return 0;
} }
// hijack main() for quick debugging
// int main(int argc, char* argv[])
// {
// profile_layernorm(argc, argv);
// return 0;
// }
...@@ -3,26 +3,27 @@ ...@@ -3,26 +3,27 @@
#include <cstring> #include <cstring>
int profile_gemm(int, char*[]); // int profile_gemm(int, char*[]);
int profile_gemm_splitk(int, char*[]); // int profile_gemm_splitk(int, char*[]);
int profile_gemm_bilinear(int, char*[]); // int profile_gemm_bilinear(int, char*[]);
int profile_gemm_add_add_fastgelu(int, char*[]); // int profile_gemm_add_add_fastgelu(int, char*[]);
int profile_gemm_reduce(int, char*[]); // int profile_gemm_reduce(int, char*[]);
int profile_gemm_bias_add_reduce(int, char*[]); // int profile_gemm_bias_add_reduce(int, char*[]);
int profile_batched_gemm(int, char*[]); // int profile_batched_gemm(int, char*[]);
int profile_batched_gemm_gemm(int, char*[]); // int profile_batched_gemm_gemm(int, char*[]);
int profile_batched_gemm_add_relu_gemm_add(int, char*[]); // int profile_batched_gemm_add_relu_gemm_add(int, char*[]);
int profile_batched_gemm_reduce(int, char*[]); // int profile_batched_gemm_reduce(int, char*[]);
int profile_grouped_gemm(int, char*[]); // int profile_grouped_gemm(int, char*[]);
int profile_conv_fwd(int, char*[]); // int profile_conv_fwd(int, char*[]);
int profile_conv_fwd_bias_relu(int, char*[]); // int profile_conv_fwd_bias_relu(int, char*[]);
int profile_conv_fwd_bias_relu_add(int, char*[]); // int profile_conv_fwd_bias_relu_add(int, char*[]);
int profile_conv_bwd_data(int, char*[]); // int profile_conv_bwd_data(int, char*[]);
int profile_conv_bwd_weight(int, char*[]); // int profile_conv_bwd_weight(int, char*[]);
int profile_grouped_conv_fwd(int, char*[]); // int profile_grouped_conv_fwd(int, char*[]);
int profile_normalization(int, char*[]); // int profile_normalization(int, char*[]);
int profile_layernorm(int, char*[]); int profile_layernorm(int, char*[]);
int profile_reduce(int, char*[]); int profile_groupnorm(int, char*[]);
// int profile_reduce(int, char*[]);
static void print_helper_message() static void print_helper_message()
{ {
...@@ -56,6 +57,7 @@ int main(int argc, char* argv[]) ...@@ -56,6 +57,7 @@ int main(int argc, char* argv[])
return 0; return 0;
} }
#if 0
else if(strcmp(argv[1], "gemm") == 0) else if(strcmp(argv[1], "gemm") == 0)
{ {
return profile_gemm(argc, argv); return profile_gemm(argc, argv);
...@@ -132,6 +134,7 @@ int main(int argc, char* argv[]) ...@@ -132,6 +134,7 @@ int main(int argc, char* argv[])
{ {
return profile_normalization(argc, argv); return profile_normalization(argc, argv);
} }
#endif
else if(strcmp(argv[1], "layernorm") == 0) else if(strcmp(argv[1], "layernorm") == 0)
{ {
return profile_layernorm(argc, argv); return profile_layernorm(argc, argv);
......
...@@ -7,7 +7,6 @@ ...@@ -7,7 +7,6 @@
using F16 = ck::half_t; using F16 = ck::half_t;
using F32 = float; using F32 = float;
using ck::index_t; using ck::index_t;
using ck::profiler::ElementwiseOpEnum;
template <typename Tuple> template <typename Tuple>
class TestGroupnorm : public ::testing::Test class TestGroupnorm : public ::testing::Test
...@@ -31,12 +30,12 @@ class TestGroupnorm : public ::testing::Test ...@@ -31,12 +30,12 @@ class TestGroupnorm : public ::testing::Test
for(auto length : lengths) for(auto length : lengths)
{ {
bool success = ck::profiler::profile_groupnorm_impl<XDataType, bool success =
GammaDataType, ck::profiler::profile_groupnorm_impl<XDataType,
BetaDataType, GammaDataType,
AccDataType, BetaDataType,
YDataType>( AccDataType,
true, 2, false, false, length, ElementwiseOpEnum::eSigmoid); YDataType>(true, 2, false, false, length);
EXPECT_TRUE(success); EXPECT_TRUE(success);
} }
} }
......
...@@ -7,7 +7,6 @@ ...@@ -7,7 +7,6 @@
using F16 = ck::half_t; using F16 = ck::half_t;
using F32 = float; using F32 = float;
using ck::index_t; using ck::index_t;
using ck::profiler::ElementwiseOpEnum;
template <typename Tuple> template <typename Tuple>
class TestGroupnorm : public ::testing::Test class TestGroupnorm : public ::testing::Test
...@@ -31,12 +30,12 @@ class TestGroupnorm : public ::testing::Test ...@@ -31,12 +30,12 @@ class TestGroupnorm : public ::testing::Test
for(auto length : lengths) for(auto length : lengths)
{ {
bool success = ck::profiler::profile_groupnorm_impl<XDataType, bool success =
GammaDataType, ck::profiler::profile_groupnorm_impl<XDataType,
BetaDataType, GammaDataType,
AccDataType, BetaDataType,
YDataType>( AccDataType,
true, 2, false, false, length, ElementwiseOpEnum::eSigmoid); YDataType>(true, 2, false, false, length);
EXPECT_TRUE(success); EXPECT_TRUE(success);
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment