Commit c6b52884 authored by wangshaojie6

add unary type convert to bwd-weight example

parent c4b6b9b1
@@ -297,7 +297,7 @@ int main(int argc, char* argv[])
                                               split_k);

     // alloc work space
     float ave_time = 0.f;
     if(!conv->IsSupportedArgument(argument.get()))
     {
         std::cout << "wrong! device_conv with the specified compilation parameters does "
......
@@ -15,6 +15,7 @@
 #include "device_tensor.hpp"
 #include "tensor_layout.hpp"
 #include "element_wise_operation.hpp"
+#include "device_unary_elementwise.hpp"
 #include "device_convnd_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
 #include "reference_conv_backward_weight.hpp"
@@ -30,6 +31,11 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough;
 using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::PassThrough;

+using UnaryTypeConvert = ck::tensor_operation::element_wise::UnaryTypeConvert<ck::bhalf_t, float>;
+using DeviceUnaryElementwiseTypeConvertInstance = ck::tensor_operation::device::
+    DeviceUnaryElementwise<AccDataType, WeiDataType, UnaryTypeConvert, 1, 4>;
+
 static constexpr auto ConvBwdWeightDefault =
     ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default;
@@ -95,7 +101,7 @@ void host_elementwise(HostTensorB& B,
                       Functor functor)
 {
     size_t tensor_size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>{});
-    std::cout << __LINE__ << ":" << tensor_size << ", "<< A.mData[0] << std::endl;
+    std::cout << __LINE__ << ":" << tensor_size << ", " << A.mData[0] << std::endl;
     for(std::size_t n = 0; n < tensor_size; ++n)
     {
         B.mData[n] = functor(A.mData[n]);
@@ -318,7 +324,8 @@ int main(int argc, char* argv[])
     // alloc work space
     size_t bwd_weight_workspace_size = conv->GetWorkSpaceSize(argument.get());
-    float ave_time = 0.f;
+    float conv_ave_time         = 0.f;
+    float type_convert_ave_time = 0.f;
     DeviceMem wei_work_space_device_buf(bwd_weight_workspace_size);
     wei_work_space_device_buf.SetZero();
@@ -349,17 +356,42 @@ int main(int argc, char* argv[])
         return 1;
     }

-    ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
+    conv_ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
+
+    // do type convert
+    auto type_convert         = DeviceUnaryElementwiseTypeConvertInstance{};
+    auto type_convert_invoker = type_convert.MakeInvokerPointer();
+    int tensor_size =
+        std::accumulate(filter_dims.begin(), filter_dims.end(), 1, std::multiplies<int>{});
+    auto type_convert_argument =
+        type_convert.MakeArgumentPointer(wei_work_space_device_buf.GetDeviceBuffer(),
+                                         wei_device_buf.GetDeviceBuffer(),
+                                         {tensor_size},
+                                         {1},
+                                         {1},
+                                         UnaryTypeConvert{});
+
+    if(!type_convert.IsSupportedArgument(type_convert_argument.get()))
+    {
+        std::cout << "wrong! device_type_convert with the specified compilation parameters does "
+                     "not support this convert problem"
+                  << std::endl;
+        return 1;
+    }
+
+    type_convert_ave_time =
+        type_convert_invoker->Run(type_convert_argument.get(), StreamConfig{nullptr, time_kernel});

     // host code to check if conv give me a right result
-    Tensor<AccDataType> wei_k_c_y_x_device_result_fp32(
-        ck::utils::conv::get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
-    wei_work_space_device_buf.FromDevice(wei_k_c_y_x_device_result_fp32.mData.data());
-    const auto type_cvt_functor = [&](AccDataType a) {
-        return ck::type_convert<WeiDataType, AccDataType>(a);
-    };
-    host_elementwise<Tensor<WeiDataType>, Tensor<AccDataType>, decltype(type_cvt_functor)>(
-        wei_k_c_y_x_device_result, wei_k_c_y_x_device_result_fp32, filter_dims, type_cvt_functor);
+    // Tensor<AccDataType> wei_k_c_y_x_device_result_fp32(
+    //     ck::utils::conv::get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
+    // wei_work_space_device_buf.FromDevice(wei_k_c_y_x_device_result_fp32.mData.data());
+    // const auto type_cvt_functor = [&](AccDataType a) {
+    //     return ck::type_convert<WeiDataType, AccDataType>(a);
+    // };
+    // host_elementwise<Tensor<WeiDataType>, Tensor<AccDataType>, decltype(type_cvt_functor)>(
+    //     wei_k_c_y_x_device_result, wei_k_c_y_x_device_result_fp32, filter_dims,
+    //     type_cvt_functor);

     std::size_t flop = ck::utils::conv::get_flops(
         params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
@@ -371,12 +403,12 @@ int main(int argc, char* argv[])
                                                  params.filter_spatial_lengths_,
                                                  output_spatial_lengths);

-    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_btype / 1.E6 / ave_time;
+    float tflops     = static_cast<float>(flop) / 1.E9 / conv_ave_time;
+    float gb_per_sec = num_btype / 1.E6 / conv_ave_time;

-    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
-              << std::endl;
+    std::cout << "Perf: conv: " << conv_ave_time << " ms, type_convert: " << type_convert_ave_time
+              << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" << std::endl;

     if(do_verification)
     {
@@ -396,7 +428,7 @@ int main(int argc, char* argv[])
     ref_invoker.Run(ref_argument);

-    //wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data());
+    wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data());

     if(do_log)
     {
......
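Note: the example change above splits the bf16 backward-weight path into two device launches: the conv kernel accumulates into an fp32 workspace (`wei_work_space_device_buf`), and a `DeviceUnaryElementwise` pass then converts that workspace into the bf16 weight buffer (`wei_device_buf`), presumably so partial sums are never rounded to bf16 mid-accumulation. A minimal host-only sketch of the same two-stage idea, in plain C++ with a bit-truncation stand-in for `ck::type_convert` (the helper names and the truncation rounding are illustrative assumptions, not CK's exact behavior):

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Hypothetical stand-in for ck::bhalf_t: bf16 stored as the upper 16 bits of a float.
using bhalf_t = std::uint16_t;

// Illustrative float -> bf16 convert by truncation (CK may round differently).
bhalf_t float_to_bhalf(float x)
{
    std::uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    return static_cast<bhalf_t>(bits >> 16);
}

int main()
{
    // Stage 1 stand-in: pretend the conv kernel produced fp32 partial sums here.
    std::vector<float> fp32_workspace = {0.5f, -1.25f, 3.0f};

    // Stage 2 stand-in: the unary elementwise pass converts fp32 -> bf16 once, at the end.
    std::vector<bhalf_t> bf16_weights(fp32_workspace.size());
    for(std::size_t i = 0; i < fp32_workspace.size(); ++i)
        bf16_weights[i] = float_to_bhalf(fp32_workspace[i]);

    return 0;
}
```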
@@ -12,7 +12,6 @@ namespace device {
 template <typename ADataType,
           typename BDataType,
-          typename ComputeDataType,
           typename ElementwiseFunctor,
           index_t Dim,
           index_t ScalarPerVector>
@@ -62,11 +61,10 @@ struct DeviceUnaryElementwise : public BaseOperator
     using GridDesc_M0 = decltype(MakeDescriptor_M0({1, 1}, {1, 1}, 1, 1));

     using GridwiseBinEltwise = GridwiseUnaryElementwise_1D<ADataType,
                                                            BDataType,
-                                                           ComputeDataType,
                                                            GridDesc_M0,
                                                            ElementwiseFunctor,
                                                            ScalarPerVector>;

     struct Argument : public BaseArgument
     {
@@ -81,7 +79,7 @@ struct DeviceUnaryElementwise : public BaseOperator
               shape_(shape),
               functor_(functor),
               blockSize_(256),
-              gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future
+              gridSize_(240) // FIXME - Calculate the grid size by number of CU in the future
         {
             a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_, blockSize_);
             b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_, blockSize_);
@@ -102,10 +100,10 @@ struct DeviceUnaryElementwise : public BaseOperator
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             const auto kernel = kernel_unary_elementwise_1d<GridwiseBinEltwise,
                                                             ADataType,
                                                             BDataType,
                                                             GridDesc_M0,
                                                             ElementwiseFunctor>;

             float elapsed_time = launch_and_time_kernel(stream_config,
                                                         kernel,
......
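The header above follows CK's usual device-op shape: construct the op, build an argument pointer, check `IsSupportedArgument`, then run through an invoker. A condensed usage sketch of that sequence for this unary op, mirroring the example diff earlier in this commit (the wrapper function, buffer parameters, and the assumption that `StreamConfig` comes in transitively through these headers are mine):

```cpp
#include "device_unary_elementwise.hpp"
#include "element_wise_operation.hpp"

using Convert = ck::tensor_operation::element_wise::UnaryTypeConvert<ck::bhalf_t, float>;
using DeviceConvert =
    ck::tensor_operation::device::DeviceUnaryElementwise<float, ck::bhalf_t, Convert, 1, 4>;

// Sketch: convert a flat device buffer of `length` floats into bf16.
// Returns false if the op rejects the argument (e.g. vector-width constraints).
bool run_fp32_to_bf16(void* p_src, void* p_dst, int length)
{
    auto op  = DeviceConvert{};
    auto arg = op.MakeArgumentPointer(p_src, p_dst, {length}, {1}, {1}, Convert{});

    if(!op.IsSupportedArgument(arg.get()))
        return false;

    auto invoker = op.MakeInvokerPointer();
    invoker->Run(arg.get(), StreamConfig{nullptr, /*time_kernel=*/false});
    return true;
}
```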
@@ -335,11 +335,20 @@ struct UnaryTypeConvert;
 template <>
 struct UnaryTypeConvert<float, ck::bhalf_t>
 {
-    __host__ __device__ UnaryTypeConvert(const int32_t divider = 1) { (void)divider; };
-
-    __host__ __device__ void operator()(float& y, ck::bhalf_t& x) const { y = ck::type_convert<float, ck::bhalf_t>(x); };
+    __host__ __device__ void operator()(float& y, ck::bhalf_t& x) const
+    {
+        y = ck::type_convert<float, ck::bhalf_t>(x);
+    };
 };

+template <>
+struct UnaryTypeConvert<ck::bhalf_t, float>
+{
+    __host__ __device__ void operator()(ck::bhalf_t& y, float& x) const
+    {
+        y = ck::type_convert<ck::bhalf_t, float>(x);
+    };
+};
+
 } // namespace element_wise
 } // namespace tensor_operation
......
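These functors follow the out-parameter convention used throughout this header: `operator()(Dst& y, Src& x)` converts `x` and writes the result into `y`, which is what the `functor(b, a)` call sites in the gridwise kernel below rely on. A host-only analogue for illustration (the `TypeConvert` template here is a stand-in, not the CK type):

```cpp
#include <iostream>

// Illustrative analogue of the UnaryTypeConvert functor family:
// operator()(Dst& y, Src& x) converts x and writes the result into y.
template <typename Dst, typename Src>
struct TypeConvert
{
    void operator()(Dst& y, Src& x) const { y = static_cast<Dst>(x); }
};

int main()
{
    TypeConvert<float, int> cvt{};
    int x   = 7;
    float y = 0.f;
    cvt(y, x); // y == 7.0f, matching the functor(b, a) call in the gridwise kernel
    std::cout << y << std::endl;
    return 0;
}
```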
@@ -13,21 +13,16 @@ template <typename GridwiseUEltwise,
           typename GridDesc_M0,
           typename ElementwiseFunctor>
 __global__ void kernel_unary_elementwise_1d(const ADataType* __restrict__ p_a_global,
                                             BDataType* __restrict__ p_b_global,
                                             const GridDesc_M0 a_grid_desc_m0,
                                             const GridDesc_M0 b_grid_desc_m0,
                                             const ElementwiseFunctor functor)
 {
-    GridwiseUEltwise::Run(p_a_global,
-                          p_b_global,
-                          a_grid_desc_m0,
-                          b_grid_desc_m0,
-                          functor);
+    GridwiseUEltwise::Run(p_a_global, p_b_global, a_grid_desc_m0, b_grid_desc_m0, functor);
 }

 template <typename ADataType,
           typename BDataType,
-          typename ComputeDataType,
           typename GridDesc_M0,
           typename ElementwiseFunctor,
           index_t ScalarPerVector>
@@ -46,11 +41,9 @@ struct GridwiseUnaryElementwise_1D
     }

     __device__ static void Run(const ADataType* __restrict__ p_a_global,
-                               const BDataType* __restrict__ p_b_global,
-                               CDataType* __restrict__ p_c_global,
+                               BDataType* __restrict__ p_b_global,
                                const GridDesc_M0 a_grid_desc_m0,
                                const GridDesc_M0 b_grid_desc_m0,
-                               const GridDesc_M0 c_grid_desc_m0,
                                const ElementwiseFunctor functor)
     {
         const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
@@ -58,14 +51,14 @@ struct GridwiseUnaryElementwise_1D
         auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_b_global, b_grid_desc_m0.GetElementSpaceSize());

-        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, ScalarPerVector, true> a_thread_buf;
-        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, ScalarPerVector, true> b_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, ADataType, ScalarPerVector, true> a_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, BDataType, ScalarPerVector, true> b_thread_buf;

         const auto thread_store_global_offset = CalculateElementwiseIndex();

         auto a_global_load =
             ThreadwiseTensorSliceTransfer_v2<ADataType,
-                                             ComputeDataType,
+                                             ADataType,
                                              GridDesc_M0,
                                              decltype(thread_desc_m0),
                                              Sequence<ScalarPerVector>, // SliceLengths
@@ -76,7 +69,7 @@ struct GridwiseUnaryElementwise_1D
                                              false>{a_grid_desc_m0, thread_store_global_offset};

         auto b_global_write =
-            ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
+            ThreadwiseTensorSliceTransfer_v1r3<BDataType,
                                                BDataType,
                                                decltype(thread_desc_m0),
                                                GridDesc_M0,
@@ -92,7 +85,7 @@ struct GridwiseUnaryElementwise_1D
         const index_t blockSize    = get_block_size();
         const index_t blockPerGrid = get_grid_size();
-        const auto m0              = c_grid_desc_m0.GetLength(I0);
+        const auto m0              = b_grid_desc_m0.GetLength(I0);
         const index_t loop_step    = blockPerGrid * blockSize * ScalarPerVector;
         const auto loop_step_index = make_multi_index(loop_step);
@@ -105,8 +98,7 @@ struct GridwiseUnaryElementwise_1D
             static_for<0, ScalarPerVector, 1>{}([&](auto m) {
                 constexpr auto offset = thread_desc_m0.CalculateOffset(make_tuple(m));
-                functor(b_thread_buf(Number<offset>{}),
-                        a_thread_buf(Number<offset>{}));
+                functor(b_thread_buf(Number<offset>{}), a_thread_buf(Number<offset>{}));
             });

             b_global_write.Run(thread_desc_m0,
......
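The gridwise struct above implements a grid-stride loop: each thread handles `ScalarPerVector` contiguous elements per iteration and then advances by `blockPerGrid * blockSize * ScalarPerVector` until the 1-D length `m0` is exhausted. A stripped-down HIP-style sketch of that access pattern, with CK's tensor descriptors and `ThreadwiseTensorSliceTransfer` machinery replaced by a plain 1-D buffer (all names here are illustrative):

```cpp
#include <hip/hip_runtime.h>

// Minimal grid-stride sketch of the unary elementwise kernel, assuming a flat
// 1-D buffer instead of CK's descriptors and threadwise transfer classes.
template <typename ADataType, typename BDataType, typename Functor, int ScalarPerVector>
__global__ void unary_elementwise_1d_sketch(const ADataType* __restrict__ p_a,
                                            BDataType* __restrict__ p_b,
                                            int m0, // total element count
                                            Functor functor)
{
    const int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    const int loop_step = gridDim.x * blockDim.x * ScalarPerVector;

    // Each thread owns ScalarPerVector contiguous elements per iteration,
    // then strides over the footprint of the whole grid.
    for(int base = thread_id * ScalarPerVector; base < m0; base += loop_step)
    {
        for(int m = 0; m < ScalarPerVector; ++m)
        {
            const int idx = base + m;
            if(idx < m0)
                functor(p_b[idx], p_a[idx]); // out-parameter convention: functor(b, a)
        }
    }
}
```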
@@ -111,6 +111,15 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
     }
     else
     {
+        printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
+               __func__,
+               grid_dim.x,
+               grid_dim.y,
+               grid_dim.z,
+               block_dim.x,
+               block_dim.y,
+               block_dim.z);
+
         kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);

         return 0;
......
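On the hard-coded `gridSize_(240)` above and its FIXME ("Calculate the grid size by number of CU in the future"): one way to derive the grid size from the device's compute-unit count would be a HIP query like the sketch below. The `blocks_per_cu` occupancy factor is an illustrative assumption that would need per-architecture tuning; only `hipGetDevice`, `hipGetDeviceProperties`, and `multiProcessorCount` are real HIP API.

```cpp
#include <hip/hip_runtime.h>

// Sketch: size the 1-D grid from the number of compute units instead of a
// hard-coded constant like 120 or 240.
inline int calculate_grid_size(int blocks_per_cu = 2)
{
    int device = 0;
    hipGetDevice(&device);

    hipDeviceProp_t props{};
    hipGetDeviceProperties(&props, device);

    // e.g. a 120-CU device with blocks_per_cu == 2 yields gridSize == 240
    return props.multiProcessorCount * blocks_per_cu;
}
```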