"include/vscode:/vscode.git/clone" did not exist on "c6891e127523d35726159530d7cfbb636d7461b3"
Commit e00a943e authored by myamlak's avatar myamlak
Browse files

Merge remote-tracking branch 'origin/develop' into myamlak/cgemm

parents ffe12e2e 9f71ff48
...@@ -14,7 +14,7 @@ set(DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE ...@@ -14,7 +14,7 @@ set(DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE
device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp; device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp;
) )
add_library(device_convnd_bwd_data_instance SHARED ${DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE}) add_library(device_convnd_bwd_data_instance OBJECT ${DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE})
target_compile_features(device_convnd_bwd_data_instance PUBLIC) target_compile_features(device_convnd_bwd_data_instance PUBLIC)
set_target_properties(device_convnd_bwd_data_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(device_convnd_bwd_data_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS device_convnd_bwd_data_instance LIBRARY DESTINATION lib) install(TARGETS device_convnd_bwd_data_instance LIBRARY DESTINATION lib)
......
#include <stdlib.h>
#include "config.hpp"
#include "device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation.hpp"
#include "device_operation_instance.hpp"
#include "host_interface.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_conv2d_fwd_instance {
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
} // namespace device_conv2d_fwd_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
// Private implementation type of DeviceConvFwdPtr_t.
//
// It owns one CK conv-forward instance (`el`) and forwards every call to it.
// `el` must remain the only data member: the add_*_instances_t functions in
// this file build this struct with aggregate initialisation from a single
// instance pointer.
struct DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl
{
    // Forwards all arguments to the wrapped instance and appends three
    // PassThrough objects (presumably the input / weight / output element-wise
    // operations -- TODO confirm against DeviceConvFwd::MakeArgumentPointer).
    //
    // The vector parameters are taken by value, so this call owns its copies;
    // they are moved into the forwarded call instead of being copied again.
    std::unique_ptr<DeviceConvFwdPtr_t::BaseArgument>
    MakeArgumentPointer(void* in_ptr,
                        void* wei_ptr,
                        void* out_ptr,
                        size_t N,
                        size_t K,
                        size_t C,
                        std::vector<ck::index_t> input_spatial_lengths,
                        std::vector<ck::index_t> filter_spatial_lengths,
                        std::vector<ck::index_t> output_spatial_lengths,
                        std::vector<ck::index_t> conv_filter_strides,
                        std::vector<ck::index_t> conv_filter_dilations,
                        std::vector<ck::index_t> input_left_pads,
                        std::vector<ck::index_t> input_right_pads) const
    {
        return el->MakeArgumentPointer(in_ptr,
                                       wei_ptr,
                                       out_ptr,
                                       N,
                                       K,
                                       C,
                                       std::move(input_spatial_lengths),
                                       std::move(filter_spatial_lengths),
                                       std::move(output_spatial_lengths),
                                       std::move(conv_filter_strides),
                                       std::move(conv_filter_dilations),
                                       std::move(input_left_pads),
                                       std::move(input_right_pads),
                                       PassThrough{},
                                       PassThrough{},
                                       PassThrough{});
    }

    // Returns the invoker object created by the wrapped instance.
    std::unique_ptr<DeviceConvFwdPtr_t::BaseInvoker> MakeInvokerPointer() const
    {
        return el->MakeInvokerPointer();
    }

    // Returns the type string reported by the wrapped instance.
    std::string GetTypeString() { return el->GetTypeString(); }

    // Asks the wrapped instance whether it supports the given argument object.
    bool IsSupportedArgument(const DeviceConvFwdPtr_t::BaseArgument* arg)
    {
        return el->IsSupportedArgument(arg);
    }

    // The wrapped CK device instance; every method above dereferences it.
    ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough> el;
};
DeviceConvFwdPtr_t::DeviceConvFwdPtr_t() : pImpl(nullptr) {}
DeviceConvFwdPtr_t::~DeviceConvFwdPtr_t() = default;
DeviceConvFwdPtr_t::DeviceConvFwdPtr_t(DeviceConvFwdPtr_t&&) = default;
DeviceConvFwdPtr_t::DeviceConvFwdPtr_t(DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl& other)
: pImpl(std::make_unique<DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl>(std::move(other)))
{
}
std::unique_ptr<DeviceConvFwdPtr_t::BaseArgument>
DeviceConvFwdPtr_t::MakeArgumentPointer(void* in_ptr,
void* wei_ptr,
void* out_ptr,
size_t N,
size_t K,
size_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads) const
{
return pImpl->MakeArgumentPointer(in_ptr,
wei_ptr,
out_ptr,
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
}
// Returns the invoker object produced by the implementation object.
// Requires a non-null pImpl (a default-constructed wrapper has none).
std::unique_ptr<DeviceConvFwdPtr_t::BaseInvoker> DeviceConvFwdPtr_t::MakeInvokerPointer() const
{
    std::unique_ptr<DeviceConvFwdPtr_t::BaseInvoker> invoker = pImpl->MakeInvokerPointer();
    return invoker;
}
// Returns the type string reported by the implementation object.
// Requires a non-null pImpl (a default-constructed wrapper has none).
std::string DeviceConvFwdPtr_t::GetTypeString()
{
    std::string type_name = pImpl->GetTypeString();
    return type_name;
}
// Reports whether the implementation object supports the given argument
// object. Requires a non-null pImpl (a default-constructed wrapper has none).
bool DeviceConvFwdPtr_t::IsSupportedArgument(const DeviceConvFwdPtr_t::BaseArgument* arg_ptr)
{
    const bool supported = pImpl->IsSupportedArgument(arg_ptr);
    return supported;
}
using namespace ck::tensor_operation::device::device_conv2d_fwd_instance;
void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t(
std::vector<DeviceConvFwdPtr_t>& instances)
{
std::vector<
ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
local_instances;
add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(local_instances);
for(auto& kinder : local_instances)
{
DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
instances.emplace_back(tmp);
}
return;
}
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t(
std::vector<DeviceConvFwdPtr_t>& instances)
{
std::vector<
ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
local_instances;
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(local_instances);
for(auto& kinder : local_instances)
{
DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
instances.emplace_back(tmp); // Perhaps we can do better
}
return;
}
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t(
std::vector<DeviceConvFwdPtr_t>& instances)
{
std::vector<
ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
local_instances;
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(local_instances);
for(auto& kinder : local_instances)
{
DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
instances.emplace_back(tmp); // Perhaps we can do better
}
return;
}
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t(
std::vector<DeviceConvFwdPtr_t>& instances)
{
std::vector<
ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
local_instances;
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(local_instances);
for(auto& kinder : local_instances)
{
DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
instances.emplace_back(tmp); // Perhaps we can do better
}
return;
}
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(
std::vector<DeviceConvFwdPtr_t>& instances)
{
std::vector<
ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
local_instances;
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(local_instances);
for(auto& kinder : local_instances)
{
DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
instances.emplace_back(tmp);
}
return;
}
...@@ -35,10 +35,9 @@ set(DEVICE_GEMM_INSTANCE_SOURCE ...@@ -35,10 +35,9 @@ set(DEVICE_GEMM_INSTANCE_SOURCE
device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp; device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp;
) )
add_library(device_gemm_instance SHARED ${DEVICE_GEMM_INSTANCE_SOURCE}) add_library(device_gemm_instance OBJECT ${DEVICE_GEMM_INSTANCE_SOURCE})
target_compile_features(device_gemm_instance PUBLIC) target_compile_features(device_gemm_instance PUBLIC)
set_target_properties(device_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(device_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS device_gemm_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_gemm_instance) clang_tidy_check(device_gemm_instance)
...@@ -10,9 +10,7 @@ set(DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE ...@@ -10,9 +10,7 @@ set(DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE
device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_nk_mn_instance.cpp; device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_nk_mn_instance.cpp;
) )
add_library(device_gemm_bias2d_instance SHARED ${DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE}) add_library(device_gemm_bias2d_instance OBJECT ${DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE})
target_compile_features(device_gemm_bias2d_instance PUBLIC)
set_target_properties(device_gemm_bias2d_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(device_gemm_bias2d_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS device_gemm_bias2d_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_gemm_bias2d_instance) clang_tidy_check(device_gemm_bias2d_instance)
...@@ -6,9 +6,7 @@ set(DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE ...@@ -6,9 +6,7 @@ set(DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE
device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_nk_mn_instance.cpp; device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_nk_mn_instance.cpp;
) )
add_library(device_gemm_bias_relu_instance SHARED ${DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE}) add_library(device_gemm_bias_relu_instance OBJECT ${DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE})
target_compile_features(device_gemm_bias_relu_instance PUBLIC)
set_target_properties(device_gemm_bias_relu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(device_gemm_bias_relu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS device_gemm_bias_relu_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_gemm_bias_relu_instance) clang_tidy_check(device_gemm_bias_relu_instance)
...@@ -6,9 +6,7 @@ set(DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE ...@@ -6,9 +6,7 @@ set(DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE
device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_nk_mn_instance.cpp; device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_nk_mn_instance.cpp;
) )
add_library(device_gemm_bias_relu_add_instance SHARED ${DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE}) add_library(device_gemm_bias_relu_add_instance OBJECT ${DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE})
target_compile_features(device_gemm_bias_relu_add_instance PUBLIC)
set_target_properties(device_gemm_bias_relu_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(device_gemm_bias_relu_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS device_gemm_bias_relu_add_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_gemm_bias_relu_add_instance) clang_tidy_check(device_gemm_bias_relu_add_instance)
...@@ -6,7 +6,7 @@ set(DEVICE_GROUPED_GEMM_INSTANCE_SOURCE ...@@ -6,7 +6,7 @@ set(DEVICE_GROUPED_GEMM_INSTANCE_SOURCE
device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp; device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp;
) )
add_library(device_grouped_gemm_instance SHARED ${DEVICE_GROUPED_GEMM_INSTANCE_SOURCE}) add_library(device_grouped_gemm_instance OBJECT ${DEVICE_GROUPED_GEMM_INSTANCE_SOURCE})
target_compile_features(device_grouped_gemm_instance PUBLIC) target_compile_features(device_grouped_gemm_instance PUBLIC)
set_target_properties(device_grouped_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(device_grouped_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
......
...@@ -38,9 +38,7 @@ set(DEVICE_REDUCE_INSTANCE_SOURCE ...@@ -38,9 +38,7 @@ set(DEVICE_REDUCE_INSTANCE_SOURCE
device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp; device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp;
) )
add_library(device_reduce_instance SHARED ${DEVICE_REDUCE_INSTANCE_SOURCE}) add_library(device_reduce_instance OBJECT ${DEVICE_REDUCE_INSTANCE_SOURCE})
target_compile_features(device_reduce_instance PUBLIC)
set_target_properties(device_reduce_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(device_reduce_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS device_reduce_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_reduce_instance) clang_tidy_check(device_reduce_instance)
...@@ -8,14 +8,14 @@ include_directories(BEFORE ...@@ -8,14 +8,14 @@ include_directories(BEFORE
${PROJECT_SOURCE_DIR}/library/include/ck/library/utility ${PROJECT_SOURCE_DIR}/library/include/ck/library/utility
) )
set(CONV_FWD_UTIL_SOURCE set(CONV_UTIL_SOURCE
conv_fwd_util.cpp conv_util.cpp
) )
add_library(conv_fwd_util SHARED ${CONV_FWD_UTIL_SOURCE}) add_library(conv_util SHARED ${CONV_UTIL_SOURCE})
target_link_libraries(conv_fwd_util PRIVATE host_tensor) target_link_libraries(conv_util PRIVATE host_tensor)
target_compile_features(conv_fwd_util PUBLIC) target_compile_features(conv_util PUBLIC)
set_target_properties(conv_fwd_util PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(conv_util PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(conv_fwd_util SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>) target_include_directories(conv_util SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
clang_tidy_check(conv_fwd_util) clang_tidy_check(conv_util)
#include "conv_fwd_util.hpp" #include "conv_util.hpp"
namespace ck { namespace ck {
namespace utils { namespace utils {
...@@ -37,16 +37,16 @@ std::size_t get_flops(ck::index_t N, ...@@ -37,16 +37,16 @@ std::size_t get_flops(ck::index_t N,
} }
ConvParams::ConvParams() ConvParams::ConvParams()
: num_dim_spatial(2), : num_dim_spatial_(2),
N(128), N_(128),
K(256), K_(256),
C(192), C_(192),
filter_spatial_lengths(2, 3), filter_spatial_lengths_(2, 3),
input_spatial_lengths(2, 71), input_spatial_lengths_(2, 71),
conv_filter_strides(2, 2), conv_filter_strides_(2, 2),
conv_filter_dilations(2, 1), conv_filter_dilations_(2, 1),
input_left_pads(2, 1), input_left_pads_(2, 1),
input_right_pads(2, 1) input_right_pads_(2, 1)
{ {
} }
...@@ -60,23 +60,23 @@ ConvParams::ConvParams(ck::index_t n_dim, ...@@ -60,23 +60,23 @@ ConvParams::ConvParams(ck::index_t n_dim,
const std::vector<ck::index_t>& dilations, const std::vector<ck::index_t>& dilations,
const std::vector<ck::index_t>& left_pads, const std::vector<ck::index_t>& left_pads,
const std::vector<ck::index_t>& right_pads) const std::vector<ck::index_t>& right_pads)
: num_dim_spatial(n_dim), : num_dim_spatial_(n_dim),
N(n_batch), N_(n_batch),
K(n_out_channels), K_(n_out_channels),
C(n_in_channels), C_(n_in_channels),
filter_spatial_lengths(filters_len), filter_spatial_lengths_(filters_len),
input_spatial_lengths(input_len), input_spatial_lengths_(input_len),
conv_filter_strides(strides), conv_filter_strides_(strides),
conv_filter_dilations(dilations), conv_filter_dilations_(dilations),
input_left_pads(left_pads), input_left_pads_(left_pads),
input_right_pads(right_pads) input_right_pads_(right_pads)
{ {
if(ck::type_convert<ck::index_t>(filter_spatial_lengths.size()) != num_dim_spatial || if(ck::type_convert<ck::index_t>(filter_spatial_lengths_.size()) != num_dim_spatial_ ||
ck::type_convert<ck::index_t>(input_spatial_lengths.size()) != num_dim_spatial || ck::type_convert<ck::index_t>(input_spatial_lengths_.size()) != num_dim_spatial_ ||
ck::type_convert<ck::index_t>(conv_filter_strides.size()) != num_dim_spatial || ck::type_convert<ck::index_t>(conv_filter_strides_.size()) != num_dim_spatial_ ||
ck::type_convert<ck::index_t>(conv_filter_dilations.size()) != num_dim_spatial || ck::type_convert<ck::index_t>(conv_filter_dilations_.size()) != num_dim_spatial_ ||
ck::type_convert<ck::index_t>(input_left_pads.size()) != num_dim_spatial || ck::type_convert<ck::index_t>(input_left_pads_.size()) != num_dim_spatial_ ||
ck::type_convert<ck::index_t>(input_right_pads.size()) != num_dim_spatial) ck::type_convert<ck::index_t>(input_right_pads_.size()) != num_dim_spatial_)
{ {
throw( throw(
std::runtime_error("ConvParams::GetOutputSpatialLengths: " std::runtime_error("ConvParams::GetOutputSpatialLengths: "
...@@ -86,27 +86,28 @@ ConvParams::ConvParams(ck::index_t n_dim, ...@@ -86,27 +86,28 @@ ConvParams::ConvParams(ck::index_t n_dim,
std::vector<ck::index_t> ConvParams::GetOutputSpatialLengths() const std::vector<ck::index_t> ConvParams::GetOutputSpatialLengths() const
{ {
if(ck::type_convert<ck::index_t>(filter_spatial_lengths.size()) != num_dim_spatial || if(ck::type_convert<ck::index_t>(filter_spatial_lengths_.size()) != num_dim_spatial_ ||
ck::type_convert<ck::index_t>(input_spatial_lengths.size()) != num_dim_spatial || ck::type_convert<ck::index_t>(input_spatial_lengths_.size()) != num_dim_spatial_ ||
ck::type_convert<ck::index_t>(conv_filter_strides.size()) != num_dim_spatial || ck::type_convert<ck::index_t>(conv_filter_strides_.size()) != num_dim_spatial_ ||
ck::type_convert<ck::index_t>(conv_filter_dilations.size()) != num_dim_spatial || ck::type_convert<ck::index_t>(conv_filter_dilations_.size()) != num_dim_spatial_ ||
ck::type_convert<ck::index_t>(input_left_pads.size()) != num_dim_spatial || ck::type_convert<ck::index_t>(input_left_pads_.size()) != num_dim_spatial_ ||
ck::type_convert<ck::index_t>(input_right_pads.size()) != num_dim_spatial) ck::type_convert<ck::index_t>(input_right_pads_.size()) != num_dim_spatial_)
{ {
throw( throw(
std::runtime_error("ConvParams::GetOutputSpatialLengths: " std::runtime_error("ConvParams::GetOutputSpatialLengths: "
"parameter size is different from number of declared dimensions!")); "parameter size is different from number of declared dimensions!"));
} }
std::vector<ck::index_t> out_spatial_len(num_dim_spatial, 0); std::vector<ck::index_t> out_spatial_len(num_dim_spatial_, 0);
for(ck::index_t i = 0; i < num_dim_spatial; ++i) for(ck::index_t i = 0; i < num_dim_spatial_; ++i)
{ {
// XEff = (X - 1) * conv_dilation_w + 1; // XEff = (X - 1) * conv_dilation_w + 1;
// Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
const ck::index_t idx_eff = (filter_spatial_lengths[i] - 1) * conv_filter_dilations[i] + 1; const ck::index_t idx_eff =
(filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1;
out_spatial_len[i] = out_spatial_len[i] =
(input_spatial_lengths[i] + input_left_pads[i] + input_right_pads[i] - idx_eff) / (input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - idx_eff) /
conv_filter_strides[i] + conv_filter_strides_[i] +
1; 1;
} }
return out_spatial_len; return out_spatial_len;
...@@ -116,40 +117,40 @@ ConvParams parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[ ...@@ -116,40 +117,40 @@ ConvParams parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[
{ {
ck::utils::conv::ConvParams params; ck::utils::conv::ConvParams params;
params.num_dim_spatial = num_dim_spatial; params.num_dim_spatial_ = num_dim_spatial;
params.N = std::stoi(argv[arg_idx++]); params.N_ = std::stoi(argv[arg_idx++]);
params.K = std::stoi(argv[arg_idx++]); params.K_ = std::stoi(argv[arg_idx++]);
params.C = std::stoi(argv[arg_idx++]); params.C_ = std::stoi(argv[arg_idx++]);
params.filter_spatial_lengths.resize(num_dim_spatial); params.filter_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]); params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
} }
params.input_spatial_lengths.resize(num_dim_spatial); params.input_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]); params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
} }
params.conv_filter_strides.resize(num_dim_spatial); params.conv_filter_strides_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]); params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
} }
params.conv_filter_dilations.resize(num_dim_spatial); params.conv_filter_dilations_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]); params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
} }
params.input_left_pads.resize(num_dim_spatial); params.input_left_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.input_left_pads[i] = std::stoi(argv[arg_idx++]); params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
} }
params.input_right_pads.resize(num_dim_spatial); params.input_right_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.input_right_pads[i] = std::stoi(argv[arg_idx++]); params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
} }
return params; return params;
...@@ -228,12 +229,12 @@ HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::siz ...@@ -228,12 +229,12 @@ HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::siz
std::ostream& operator<<(std::ostream& os, const ck::utils::conv::ConvParams& p) std::ostream& operator<<(std::ostream& os, const ck::utils::conv::ConvParams& p)
{ {
os << "ConvParams {" os << "ConvParams {"
<< "\nnum_dim_spatial: " << p.num_dim_spatial << "\nN: " << p.N << "\nK: " << p.K << "\nnum_dim_spatial: " << p.num_dim_spatial_ << "\nN: " << p.N_ << "\nK: " << p.K_
<< "\nC: " << p.C << "\nfilter_spatial_lengths: " << p.filter_spatial_lengths << "\nC: " << p.C_ << "\nfilter_spatial_lengths: " << p.filter_spatial_lengths_
<< "\ninput_spatial_lengths: " << p.input_spatial_lengths << "\ninput_spatial_lengths: " << p.input_spatial_lengths_
<< "\nconv_filter_strides: " << p.conv_filter_strides << "\nconv_filter_strides: " << p.conv_filter_strides_
<< "\nconv_filter_dilations: " << p.conv_filter_dilations << "\nconv_filter_dilations: " << p.conv_filter_dilations_
<< "\ninput_left_pads: " << p.input_left_pads << "\ninput_left_pads: " << p.input_left_pads_
<< "\ninput_right_pads: " << p.input_right_pads; << "\ninput_right_pads: " << p.input_right_pads_;
return os; return os;
} }
...@@ -43,7 +43,7 @@ set(PROFILER_SOURCE ...@@ -43,7 +43,7 @@ set(PROFILER_SOURCE
add_executable(ckProfiler ${PROFILER_SOURCE}) add_executable(ckProfiler ${PROFILER_SOURCE})
target_link_libraries(ckProfiler PRIVATE host_tensor) target_link_libraries(ckProfiler PRIVATE host_tensor)
target_link_libraries(ckProfiler PRIVATE conv_fwd_util) target_link_libraries(ckProfiler PRIVATE conv_util)
target_link_libraries(ckProfiler PRIVATE device_gemm_reduce_instance) target_link_libraries(ckProfiler PRIVATE device_gemm_reduce_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_instance) target_link_libraries(ckProfiler PRIVATE device_gemm_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_bias2d_instance) target_link_libraries(ckProfiler PRIVATE device_gemm_bias2d_instance)
......
...@@ -63,7 +63,7 @@ template <typename ADataType, ...@@ -63,7 +63,7 @@ template <typename ADataType,
bool profile_batched_gemm_impl(int do_verification, bool profile_batched_gemm_impl(int do_verification,
int init_method, int init_method,
bool do_log, bool do_log,
int nrepeat, bool time_kernel,
int M, int M,
int N, int N,
int K, int K,
...@@ -356,11 +356,12 @@ bool profile_batched_gemm_impl(int do_verification, ...@@ -356,11 +356,12 @@ bool profile_batched_gemm_impl(int do_verification,
{ {
std::string gemm_name = gemm_ptr->GetTypeString(); std::string gemm_name = gemm_ptr->GetTypeString();
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * BatchCount * M * N * K; std::size_t flop = std::size_t(2) * BatchCount * M * N * K;
std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * M + std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(CDataType) * M * N) * sizeof(CDataType) * M * N) *
BatchCount; BatchCount;
......
...@@ -53,7 +53,7 @@ template <typename ADataType, ...@@ -53,7 +53,7 @@ template <typename ADataType,
bool profile_batched_gemm_reduce_impl(int do_verification, bool profile_batched_gemm_reduce_impl(int do_verification,
int init_method, int init_method,
bool do_log, bool do_log,
int nrepeat, bool time_kernel,
int M, int M,
int N, int N,
int K, int K,
...@@ -259,30 +259,12 @@ bool profile_batched_gemm_reduce_impl(int do_verification, ...@@ -259,30 +259,12 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
// warm up // init DO, D1 to 0
invoker_ptr->Run(argument_ptr.get()); d0_device_buf.SetZero();
d1_device_buf.SetZero();
// timing float ave_time =
float total_time = 0; invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
for(int i = 0; i < nrepeat; ++i)
{
// init DO, D1 to 0
d0_device_buf.SetZero();
d1_device_buf.SetZero();
KernelTimer timer;
timer.Start();
invoker_ptr->Run(argument_ptr.get());
timer.End();
total_time += timer.GetElapsedTime();
}
float ave_time = total_time / nrepeat;
std::string gemm_name = gemm_ptr->GetTypeString(); std::string gemm_name = gemm_ptr->GetTypeString();
......
...@@ -51,7 +51,7 @@ template <int NDimSpatial, ...@@ -51,7 +51,7 @@ template <int NDimSpatial,
void profile_conv_bwd_data_impl(int do_verification, void profile_conv_bwd_data_impl(int do_verification,
int init_method, int init_method,
bool do_log, bool do_log,
int nrepeat, bool time_kernel,
ck::index_t N, ck::index_t N,
ck::index_t K, ck::index_t K,
ck::index_t C, ck::index_t C,
...@@ -228,7 +228,8 @@ void profile_conv_bwd_data_impl(int do_verification, ...@@ -228,7 +228,8 @@ void profile_conv_bwd_data_impl(int do_verification,
{ {
std::string conv_name = conv_ptr->GetTypeString(); std::string conv_name = conv_ptr->GetTypeString();
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamControl{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
......
#pragma once #pragma once
#include "stream_config.hpp"
#include "config.hpp" #include "config.hpp"
#include "device.hpp" #include "device.hpp"
#include "host_tensor.hpp" #include "host_tensor.hpp"
...@@ -43,7 +45,7 @@ template <int NDimSpatial, ...@@ -43,7 +45,7 @@ template <int NDimSpatial,
bool profile_conv_bwd_weight_impl(int do_verification, bool profile_conv_bwd_weight_impl(int do_verification,
int init_method, int init_method,
bool do_log, bool do_log,
int nrepeat, bool time_kernel,
ck::index_t N, ck::index_t N,
ck::index_t K, ck::index_t K,
ck::index_t C, ck::index_t C,
...@@ -182,6 +184,7 @@ bool profile_conv_bwd_weight_impl(int do_verification, ...@@ -182,6 +184,7 @@ bool profile_conv_bwd_weight_impl(int do_verification,
// profile device Conv instances // profile device Conv instances
bool pass = true; bool pass = true;
for(auto& conv_ptr : conv_ptrs) for(auto& conv_ptr : conv_ptrs)
{ {
// using atomic, so need to reset input // using atomic, so need to reset input
...@@ -189,6 +192,7 @@ bool profile_conv_bwd_weight_impl(int do_verification, ...@@ -189,6 +192,7 @@ bool profile_conv_bwd_weight_impl(int do_verification,
{ {
wei_device_buf.SetZero(); wei_device_buf.SetZero();
} }
auto argument_ptr = conv_ptr->MakeArgumentPointer( auto argument_ptr = conv_ptr->MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()), static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()), static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
...@@ -214,7 +218,8 @@ bool profile_conv_bwd_weight_impl(int do_verification, ...@@ -214,7 +218,8 @@ bool profile_conv_bwd_weight_impl(int do_verification,
{ {
std::string conv_name = conv_ptr->GetTypeString(); std::string conv_name = conv_ptr->GetTypeString();
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
...@@ -242,6 +247,7 @@ bool profile_conv_bwd_weight_impl(int do_verification, ...@@ -242,6 +247,7 @@ bool profile_conv_bwd_weight_impl(int do_verification,
wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data()); wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data());
float max_error = check_error(wei_k_c_y_x_host_result, wei_k_c_y_x_device_result); float max_error = check_error(wei_k_c_y_x_host_result, wei_k_c_y_x_device_result);
if(max_error > 8) if(max_error > 8)
{ {
pass = false; pass = false;
......
...@@ -42,7 +42,7 @@ template <int NDimSpatial, ...@@ -42,7 +42,7 @@ template <int NDimSpatial,
void profile_conv_fwd_bias_relu_add_impl(int do_verification, void profile_conv_fwd_bias_relu_add_impl(int do_verification,
int init_method, int init_method,
bool do_log, bool do_log,
int nrepeat, bool time_kernel,
ck::index_t N, ck::index_t N,
ck::index_t K, ck::index_t K,
ck::index_t C, ck::index_t C,
...@@ -219,7 +219,8 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification, ...@@ -219,7 +219,8 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
{ {
std::string conv_name = op_ptr->GetTypeString(); std::string conv_name = op_ptr->GetTypeString();
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
......
...@@ -119,7 +119,7 @@ template <int NDimSpatial, ...@@ -119,7 +119,7 @@ template <int NDimSpatial,
void profile_conv_fwd_bias_relu_atomic_add_impl(int do_verification, void profile_conv_fwd_bias_relu_atomic_add_impl(int do_verification,
int init_method, int init_method,
bool do_log, bool do_log,
int nrepeat, bool time_kernel,
ck::index_t N, ck::index_t N,
ck::index_t K, ck::index_t K,
ck::index_t C, ck::index_t C,
...@@ -275,7 +275,8 @@ void profile_conv_fwd_bias_relu_atomic_add_impl(int do_verification, ...@@ -275,7 +275,8 @@ void profile_conv_fwd_bias_relu_atomic_add_impl(int do_verification,
{ {
std::string conv_name = op_ptr->GetTypeString(); std::string conv_name = op_ptr->GetTypeString();
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
......
...@@ -41,7 +41,7 @@ template <int NDimSpatial, ...@@ -41,7 +41,7 @@ template <int NDimSpatial,
void profile_conv_fwd_bias_relu_impl(int do_verification, void profile_conv_fwd_bias_relu_impl(int do_verification,
int init_method, int init_method,
bool do_log, bool do_log,
int nrepeat, bool time_kernel,
ck::index_t N, ck::index_t N,
ck::index_t K, ck::index_t K,
ck::index_t C, ck::index_t C,
...@@ -207,7 +207,8 @@ void profile_conv_fwd_bias_relu_impl(int do_verification, ...@@ -207,7 +207,8 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
{ {
std::string conv_name = op_ptr->GetTypeString(); std::string conv_name = op_ptr->GetTypeString();
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
......
#pragma once #pragma once
#include "config.hpp" #include "config.hpp"
#include "device.hpp" #include "device.hpp"
#include "conv_fwd_util.hpp" #include "conv_util.hpp"
#include "host_tensor.hpp" #include "host_tensor.hpp"
#include "host_tensor_generator.hpp" #include "host_tensor_generator.hpp"
#include "tensor_layout.hpp" #include "tensor_layout.hpp"
...@@ -269,7 +269,7 @@ template <int NDimSpatial, ...@@ -269,7 +269,7 @@ template <int NDimSpatial,
bool profile_convnd_bwd_data_impl(int do_verification, bool profile_convnd_bwd_data_impl(int do_verification,
int init_method, int init_method,
bool do_log, bool do_log,
int nrepeat, bool time_kernel,
ck::index_t N, ck::index_t N,
ck::index_t K, ck::index_t K,
ck::index_t C, ck::index_t C,
...@@ -410,7 +410,8 @@ bool profile_convnd_bwd_data_impl(int do_verification, ...@@ -410,7 +410,8 @@ bool profile_convnd_bwd_data_impl(int do_verification,
{ {
std::string conv_name = conv_ptr->GetTypeString(); std::string conv_name = conv_ptr->GetTypeString();
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t flop =
ck::utils::conv::get_flops(N, C, K, filter_spatial_lengths, output_spatial_lengths); ck::utils::conv::get_flops(N, C, K, filter_spatial_lengths, output_spatial_lengths);
......
...@@ -65,7 +65,7 @@ template <typename ADataType, ...@@ -65,7 +65,7 @@ template <typename ADataType,
void profile_gemm_bias_2d_impl(int do_verification, void profile_gemm_bias_2d_impl(int do_verification,
int init_method, int init_method,
bool do_log, bool do_log,
int nrepeat, bool time_kernel,
int M, int M,
int N, int N,
int K, int K,
...@@ -259,7 +259,8 @@ void profile_gemm_bias_2d_impl(int do_verification, ...@@ -259,7 +259,8 @@ void profile_gemm_bias_2d_impl(int do_verification,
{ {
std::string gemm_name = gemm_ptr->GetTypeString(); std::string gemm_name = gemm_ptr->GetTypeString();
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K; std::size_t flop = std::size_t(2) * M * N * K;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment