Commit c6b52884 authored by wangshaojie6

add unary type convert to bwd-weight example

parent c4b6b9b1
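For orientation, here is a condensed sketch of the flow this commit introduces in the backward-weight example: the convolution kernel accumulates its fp32 result into a workspace buffer, and a 1-D DeviceUnaryElementwise kernel with a UnaryTypeConvert<ck::bhalf_t, float> functor then converts that workspace into the bf16 weight buffer. The snippet is condensed from the diff below and is not standalone; the setup of `filter_dims`, `wei_work_space_device_buf`, `wei_device_buf`, and the conv `invoker`/`argument` is assumed from the example.

```cpp
#include "element_wise_operation.hpp"
#include "device_unary_elementwise.hpp"

// Functor and device instance as declared in the example below.
using UnaryTypeConvert =
    ck::tensor_operation::element_wise::UnaryTypeConvert<ck::bhalf_t, float>;
using DeviceUnaryElementwiseTypeConvertInstance = ck::tensor_operation::device::
    DeviceUnaryElementwise<AccDataType, WeiDataType, UnaryTypeConvert, 1, 4>;

// 1. Run the backward-weight conv; it writes fp32 accumulation into the workspace buffer.
float conv_ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});

// 2. Convert the fp32 workspace to bf16 weights with a 1-D elementwise kernel.
auto type_convert         = DeviceUnaryElementwiseTypeConvertInstance{};
auto type_convert_invoker = type_convert.MakeInvokerPointer();
const int tensor_size =
    std::accumulate(filter_dims.begin(), filter_dims.end(), 1, std::multiplies<int>{});
auto type_convert_argument =
    type_convert.MakeArgumentPointer(wei_work_space_device_buf.GetDeviceBuffer(), // fp32 in
                                     wei_device_buf.GetDeviceBuffer(),            // bf16 out
                                     {tensor_size}, // flattened 1-D shape
                                     {1},           // input stride
                                     {1},           // output stride
                                     UnaryTypeConvert{});
float type_convert_ave_time =
    type_convert_invoker->Run(type_convert_argument.get(), StreamConfig{nullptr, time_kernel});
```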
......@@ -15,6 +15,7 @@
#include "device_tensor.hpp"
#include "tensor_layout.hpp"
#include "element_wise_operation.hpp"
#include "device_unary_elementwise.hpp"
#include "device_convnd_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
#include "reference_conv_backward_weight.hpp"
......@@ -30,6 +31,11 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
using UnaryTypeConvert = ck::tensor_operation::element_wise::UnaryTypeConvert<ck::bhalf_t, float>;
using DeviceUnaryElementwiseTypeConvertInstance = ck::tensor_operation::device::
DeviceUnaryElementwise<AccDataType, WeiDataType, UnaryTypeConvert, 1, 4>;
static constexpr auto ConvBwdWeightDefault =
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default;
......@@ -95,7 +101,7 @@ void host_elementwise(HostTensorB& B,
Functor functor)
{
size_t tensor_size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>{});
std::cout << __LINE__ << ":" << tensor_size << ", "<< A.mData[0] << std::endl;
std::cout << __LINE__ << ":" << tensor_size << ", " << A.mData[0] << std::endl;
for(std::size_t n = 0; n < tensor_size; ++n)
{
B.mData[n] = functor(A.mData[n]);
......@@ -318,7 +324,8 @@ int main(int argc, char* argv[])
// alloc work space
size_t bwd_weight_workspace_size = conv->GetWorkSpaceSize(argument.get());
float ave_time = 0.f;
float conv_ave_time = 0.f;
float type_convert_ave_time = 0.f;
DeviceMem wei_work_space_device_buf(bwd_weight_workspace_size);
wei_work_space_device_buf.SetZero();
......@@ -349,17 +356,42 @@ int main(int argc, char* argv[])
return 1;
}
ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
conv_ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
// do type convert
auto type_convert = DeviceUnaryElementwiseTypeConvertInstance{};
auto type_convert_invoker = type_convert.MakeInvokerPointer();
int tensor_size =
std::accumulate(filter_dims.begin(), filter_dims.end(), 1, std::multiplies<int>{});
auto type_convert_argument =
type_convert.MakeArgumentPointer(wei_work_space_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
{tensor_size},
{1},
{1},
UnaryTypeConvert{});
if(!type_convert.IsSupportedArgument(type_convert_argument.get()))
{
std::cout << "wrong! device_type_convert with the specified compilation parameters does "
"not support this convert problem"
<< std::endl;
return 1;
}
type_convert_ave_time =
type_convert_invoker->Run(type_convert_argument.get(), StreamConfig{nullptr, time_kernel});
// type_convert_invoker->Run(type_convert_argument.get(), StreamConfig{nullptr, time_kernel});
// host code to check if conv give me a right result
Tensor<AccDataType> wei_k_c_y_x_device_result_fp32(
ck::utils::conv::get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
wei_work_space_device_buf.FromDevice(wei_k_c_y_x_device_result_fp32.mData.data());
const auto type_cvt_functor = [&](AccDataType a) {
return ck::type_convert<WeiDataType, AccDataType>(a);
};
host_elementwise<Tensor<WeiDataType>, Tensor<AccDataType>, decltype(type_cvt_functor)>(
wei_k_c_y_x_device_result, wei_k_c_y_x_device_result_fp32, filter_dims, type_cvt_functor);
// Tensor<AccDataType> wei_k_c_y_x_device_result_fp32(
// ck::utils::conv::get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
// wei_work_space_device_buf.FromDevice(wei_k_c_y_x_device_result_fp32.mData.data());
// const auto type_cvt_functor = [&](AccDataType a) {
// return ck::type_convert<WeiDataType, AccDataType>(a);
// };
// host_elementwise<Tensor<WeiDataType>, Tensor<AccDataType>, decltype(type_cvt_functor)>(
// wei_k_c_y_x_device_result, wei_k_c_y_x_device_result_fp32, filter_dims,
// type_cvt_functor);
std::size_t flop = ck::utils::conv::get_flops(
params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
......@@ -371,12 +403,12 @@ int main(int argc, char* argv[])
params.filter_spatial_lengths_,
output_spatial_lengths);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float tflops = static_cast<float>(flop) / 1.E9 / conv_ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
float gb_per_sec = num_btype / 1.E6 / conv_ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
std::cout << "Perf: conv: " << conv_ave_time << " ms, type_convert: " << type_convert_ave_time
<< " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" << std::endl;
if(do_verification)
{
......@@ -396,7 +428,7 @@ int main(int argc, char* argv[])
ref_invoker.Run(ref_argument);
//wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data());
wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data());
if(do_log)
{
......
......@@ -12,7 +12,6 @@ namespace device {
template <typename ADataType,
typename BDataType,
typename ComputeDataType,
typename ElementwiseFunctor,
index_t Dim,
index_t ScalarPerVector>
......@@ -63,7 +62,6 @@ struct DeviceUnaryElementwise : public BaseOperator
using GridDesc_M0 = decltype(MakeDescriptor_M0({1, 1}, {1, 1}, 1, 1));
using GridwiseBinEltwise = GridwiseUnaryElementwise_1D<ADataType,
BDataType,
ComputeDataType,
GridDesc_M0,
ElementwiseFunctor,
ScalarPerVector>;
......@@ -81,7 +79,7 @@ struct DeviceUnaryElementwise : public BaseOperator
shape_(shape),
functor_(functor),
blockSize_(256),
gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future
gridSize_(240) // FIXME - Calculate the grid size by number of CU in the future
{
a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_, blockSize_);
b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_, blockSize_);
......
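The constructor above still hard-codes `gridSize_` (bumped from 120 to 240 here), and the FIXME notes it should eventually be derived from the number of compute units. One possible approach, not part of this commit and shown only as a sketch, would query the CU count through the HIP runtime:

```cpp
#include <hip/hip_runtime.h>

// Hypothetical helper (not in the commit): derive a grid size from the CU count,
// as the FIXME suggests, instead of hard-coding 240.
inline int GetGridSizeFromCUCount(int blocks_per_cu = 2)
{
    int device = 0;
    (void)hipGetDevice(&device);
    int cu_count = 0;
    (void)hipDeviceGetAttribute(&cu_count, hipDeviceAttributeMultiprocessorCount, device);
    return cu_count * blocks_per_cu; // e.g. 120 CUs * 2 = 240
}
```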
......@@ -335,11 +335,20 @@ struct UnaryTypeConvert;
template <>
struct UnaryTypeConvert<float, ck::bhalf_t>
{
__host__ __device__ UnaryTypeConvert(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(float& y, ck::bhalf_t& x) const { y = ck::type_convert<float, ck::bhalf_t>(x); };
__host__ __device__ void operator()(float& y, ck::bhalf_t& x) const
{
y = ck::type_convert<float, ck::bhalf_t>(x);
};
};
template <>
struct UnaryTypeConvert<ck::bhalf_t, float>
{
__host__ __device__ void operator()(ck::bhalf_t& y, float& x) const
{
y = ck::type_convert<ck::bhalf_t, float>(x);
};
};
} // namespace element_wise
} // namespace tensor_operation
......
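The new `UnaryTypeConvert<ck::bhalf_t, float>` specialization mirrors the existing float-from-bhalf_t one: `operator()(y, x)` writes the converted value into `y`. A minimal host-side usage sketch, assuming only the CK headers already included in the example (not part of the commit):

```cpp
// Convert a single fp32 value to bf16 with the new functor.
ck::tensor_operation::element_wise::UnaryTypeConvert<ck::bhalf_t, float> cvt{};
float x         = 1.5f;
ck::bhalf_t y{};
cvt(y, x); // y = ck::type_convert<ck::bhalf_t, float>(x)
```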
......@@ -18,16 +18,11 @@ __global__ void kernel_unary_elementwise_1d(const ADataType* __restrict__ p_a_gl
const GridDesc_M0 b_grid_desc_m0,
const ElementwiseFunctor functor)
{
GridwiseUEltwise::Run(p_a_global,
p_b_global,
a_grid_desc_m0,
b_grid_desc_m0,
functor);
GridwiseUEltwise::Run(p_a_global, p_b_global, a_grid_desc_m0, b_grid_desc_m0, functor);
}
template <typename ADataType,
typename BDataType,
typename ComputeDataType,
typename GridDesc_M0,
typename ElementwiseFunctor,
index_t ScalarPerVector>
......@@ -46,11 +41,9 @@ struct GridwiseUnaryElementwise_1D
}
__device__ static void Run(const ADataType* __restrict__ p_a_global,
const BDataType* __restrict__ p_b_global,
CDataType* __restrict__ p_c_global,
BDataType* __restrict__ p_b_global,
const GridDesc_M0 a_grid_desc_m0,
const GridDesc_M0 b_grid_desc_m0,
const GridDesc_M0 c_grid_desc_m0,
const ElementwiseFunctor functor)
{
const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
......@@ -58,14 +51,14 @@ struct GridwiseUnaryElementwise_1D
auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_b_global, b_grid_desc_m0.GetElementSpaceSize());
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, ScalarPerVector, true> a_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, ScalarPerVector, true> b_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, ADataType, ScalarPerVector, true> a_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, BDataType, ScalarPerVector, true> b_thread_buf;
const auto thread_store_global_offset = CalculateElementwiseIndex();
auto a_global_load =
ThreadwiseTensorSliceTransfer_v2<ADataType,
ComputeDataType,
ADataType,
GridDesc_M0,
decltype(thread_desc_m0),
Sequence<ScalarPerVector>, // SliceLengths
......@@ -76,7 +69,7 @@ struct GridwiseUnaryElementwise_1D
false>{a_grid_desc_m0, thread_store_global_offset};
auto b_global_write =
ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
ThreadwiseTensorSliceTransfer_v1r3<BDataType,
BDataType,
decltype(thread_desc_m0),
GridDesc_M0,
......@@ -92,7 +85,7 @@ struct GridwiseUnaryElementwise_1D
const index_t blockSize = get_block_size();
const index_t blockPerGrid = get_grid_size();
const auto m0 = c_grid_desc_m0.GetLength(I0);
const auto m0 = b_grid_desc_m0.GetLength(I0);
const index_t loop_step = blockPerGrid * blockSize * ScalarPerVector;
const auto loop_step_index = make_multi_index(loop_step);
......@@ -105,8 +98,7 @@ struct GridwiseUnaryElementwise_1D
static_for<0, ScalarPerVector, 1>{}([&](auto m) {
constexpr auto offset = thread_desc_m0.CalculateOffset(make_tuple(m));
functor(b_thread_buf(Number<offset>{}),
a_thread_buf(Number<offset>{}));
functor(b_thread_buf(Number<offset>{}), a_thread_buf(Number<offset>{}));
});
b_global_write.Run(thread_desc_m0,
......
......@@ -111,6 +111,15 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
}
else
{
printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
__func__,
grid_dim.x,
grid_dim.y,
grid_dim.z,
block_dim.x,
block_dim.y,
block_dim.z);
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
return 0;
......