Commit e8d6434d authored by wangshaojie6

1. Remove comments. 2. Add CheckValidity. 3. Add grid size computation.

parent f26fb605
@@ -325,7 +325,6 @@ int main(int argc, char* argv[])
// alloc work space
size_t bwd_weight_workspace_size = conv->GetWorkSpaceSize(argument.get());
float conv_ave_time = 0.f;
float type_convert_ave_time = 0.f;
DeviceMem wei_work_space_device_buf(bwd_weight_workspace_size);
wei_work_space_device_buf.SetZero();
@@ -341,42 +340,6 @@ int main(int argc, char* argv[])
conv_ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
#if 0
// do type convert
auto type_convert = DeviceUnaryElementwiseTypeConvertInstance{};
auto type_convert_invoker = type_convert.MakeInvokerPointer();
int tensor_size =
std::accumulate(filter_dims.begin(), filter_dims.end(), 1, std::multiplies<int>{});
auto type_convert_argument =
type_convert.MakeArgumentPointer(wei_work_space_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
{tensor_size},
{1},
{1},
UnaryTypeConvert{});
if(!type_convert.IsSupportedArgument(type_convert_argument.get()))
{
std::cout << "wrong! device_type_convert with the specified compilation parameters does "
"not support this convert problem"
<< std::endl;
return 1;
}
type_convert_ave_time =
type_convert_invoker->Run(type_convert_argument.get(), StreamConfig{nullptr, time_kernel});
// type_convert_invoker->Run(type_convert_argument.get(), StreamConfig{nullptr, time_kernel});
#endif
// host code to check if conv give me a right result
// Tensor<AccDataType> wei_k_c_y_x_device_result_fp32(
// ck::utils::conv::get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
// wei_work_space_device_buf.FromDevice(wei_k_c_y_x_device_result_fp32.mData.data());
// const auto type_cvt_functor = [&](AccDataType a) {
// return ck::type_convert<WeiDataType, AccDataType>(a);
// };
// host_elementwise<Tensor<WeiDataType>, Tensor<AccDataType>, decltype(type_cvt_functor)>(
// wei_k_c_y_x_device_result, wei_k_c_y_x_device_result_fp32, filter_dims,
// type_cvt_functor);
std::size_t flop = ck::utils::conv::get_flops(
params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
std::size_t num_btype = ck::utils::conv::get_btype<InDataType, WeiDataType, OutDataType>(
@@ -391,8 +354,8 @@ int main(int argc, char* argv[])
float gb_per_sec = num_btype / 1.E6 / conv_ave_time;
std::cout << "Perf: conv: " << conv_ave_time << " ms, type_convert: " << type_convert_ave_time
<< " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" << std::endl;
std::cout << "Perf: conv: " << conv_ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s" << std::endl;
if(do_verification)
{
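The removed #if 0 block and the commented-out verification above both reduce to the same idea: the backward-weight kernel accumulates into an AccDataType workspace, and a per-element type convert then produces the WeiDataType result. A minimal host-side analogue of that per-element convert is sketched below; double and float stand in for AccDataType and WeiDataType and the values are made up, so this is an illustration rather than the library's code.

#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    // Stand-ins for the AccDataType workspace and the WeiDataType output.
    const std::vector<double> wei_workspace{1.25, -3.5, 0.0625, 7.0};
    std::vector<float> wei_result(wei_workspace.size());

    // Per-element convert functor, analogous to the removed type_cvt_functor
    // built from ck::type_convert<WeiDataType, AccDataType>.
    const auto type_cvt_functor = [](double a) { return static_cast<float>(a); };
    std::transform(wei_workspace.begin(), wei_workspace.end(), wei_result.begin(), type_cvt_functor);

    for(float v : wei_result)
        std::printf("%g\n", v);
    return 0;
}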
@@ -1051,8 +1051,16 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
int tensor_size =
std::accumulate(filter_dims.begin(), filter_dims.end(), 1, std::multiplies<int>{});
GridDesc_M0 a_grid_desc_m0_ = MakeDescriptor_M0<1>({tensor_size}, {1}, 240, 256);
GridDesc_M0 b_grid_desc_m0_ = MakeDescriptor_M0<1>({tensor_size}, {1}, 240, 256);
const index_t type_convert_grid_size = GridwiseUEltwise::CalculateGridSize(tensor_size);
GridDesc_M0 a_grid_desc_m0_ =
MakeDescriptor_M0<1>({tensor_size}, {1}, type_convert_grid_size, 256);
GridDesc_M0 b_grid_desc_m0_ =
MakeDescriptor_M0<1>({tensor_size}, {1}, type_convert_grid_size, 256);
if(!GridwiseUEltwise::CheckValidity(a_grid_desc_m0_, b_grid_desc_m0_))
{
throw std::runtime_error("wrong! GridwiseUnaryElementwise_1D has invalid setting");
}
// run kernel for type conversion
void* p_c_grid_tmp_ = static_cast<void*>(arg.p_c_grid_);
@@ -1061,7 +1069,7 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
float elapsed_time =
launch_and_time_kernel(stream_config,
kernel,
dim3(240),
dim3(type_convert_grid_size),
dim3(256),
0,
static_cast<AccDataType*>(arg.p_c_workspace_grid_),
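The hunks above stop hard-coding a grid of 240 blocks for the type-convert kernel: the grid now comes from GridwiseUEltwise::CalculateGridSize over the flattened filter tensor, and the launch is guarded by CheckValidity. A standalone sketch of the same sizing arithmetic follows; the filter dimensions and the ScalarPerVector value are hypothetical, while the block size of 256 and the ceil division mirror the diff.

#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

int main()
{
    constexpr int kBlockSize       = 256; // matches dim3(256) in the launch above
    constexpr int kScalarPerVector = 8;   // hypothetical; a template parameter in the real kernel

    // Hypothetical K, C, Y, X; the real code accumulates filter_dims the same way.
    const std::vector<int> filter_dims{256, 192, 3, 3};
    const int tensor_size =
        std::accumulate(filter_dims.begin(), filter_dims.end(), 1, std::multiplies<int>{});

    // Same formula as GridwiseUEltwise::CalculateGridSize:
    // integer_divide_ceil(tensor_size, 256 * ScalarPerVector).
    const int elems_per_block = kBlockSize * kScalarPerVector;
    const int grid_size       = (tensor_size + elems_per_block - 1) / elems_per_block;

    std::printf("tensor_size = %d -> grid_size = %d\n", tensor_size, grid_size);
    // Prints: tensor_size = 442368 -> grid_size = 216
    return 0;
}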
@@ -60,7 +60,7 @@ struct DeviceUnaryElementwise : public BaseOperator
}
using GridDesc_M0 = decltype(MakeDescriptor_M0({1, 1}, {1, 1}, 1, 1));
using GridwiseBinEltwise = GridwiseUnaryElementwise_1D<ADataType,
using GridwiseUEltwise = GridwiseUnaryElementwise_1D<ADataType,
BDataType,
GridDesc_M0,
ElementwiseFunctor,
@@ -78,9 +78,11 @@ struct DeviceUnaryElementwise : public BaseOperator
p_b_(p_b),
shape_(shape),
functor_(functor),
blockSize_(256),
gridSize_(240) // FIXME - Calculate the grid size by number of CU in the future
blockSize_(256) // FIXME - Calculate the grid size by number of CU in the future
{
index_t tensor_size =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>{});
gridSize_ = GridwiseUEltwise::CalculateGridSize(tensor_size);
a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_, blockSize_);
b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_, blockSize_);
}
@@ -99,7 +101,7 @@ struct DeviceUnaryElementwise : public BaseOperator
{
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
const auto kernel = kernel_unary_elementwise_1d<GridwiseBinEltwise,
const auto kernel = kernel_unary_elementwise_1d<GridwiseUEltwise,
ADataType,
BDataType,
GridDesc_M0,
@@ -40,6 +40,19 @@ struct GridwiseUnaryElementwise_1D
return make_multi_index(global_thread_id * ScalarPerVector);
}
__host__ __device__ static constexpr bool CheckValidity(const GridDesc_M0 a_grid_desc_m0,
const GridDesc_M0 b_grid_desc_m0)
{
return a_grid_desc_m0.GetLength(I0) == b_grid_desc_m0.GetLength(I0);
}
__host__ __device__ static constexpr index_t CalculateGridSize(const index_t tensor_size)
{
const index_t grid_size = math::integer_divide_ceil(tensor_size, 256 * ScalarPerVector);
return grid_size;
}
__device__ static void Run(const ADataType* __restrict__ p_a_global,
BDataType* __restrict__ p_b_global,
const GridDesc_M0 a_grid_desc_m0,
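The divisor in CalculateGridSize follows directly from the indexing above: each of the 256 threads in a block starts at global_thread_id * ScalarPerVector and processes ScalarPerVector contiguous elements, so one block covers 256 * ScalarPerVector elements and the grid must be the ceiling of tensor_size over that. A small self-contained check, using a hypothetical ScalarPerVector of 8 and the same flattened filter size as in the sketch further up:

#include <cstdio>

int main()
{
    constexpr int kBlockSize       = 256;
    constexpr int kScalarPerVector = 8;      // hypothetical template value
    constexpr int kTensorSize      = 442368; // hypothetical flattened filter size

    // Ceil division, as in math::integer_divide_ceil(tensor_size, 256 * ScalarPerVector).
    constexpr int kElemsPerBlock = kBlockSize * kScalarPerVector;
    constexpr int kGridSize      = (kTensorSize + kElemsPerBlock - 1) / kElemsPerBlock;

    // The grid must cover every element index at least once.
    static_assert(kGridSize * kElemsPerBlock >= kTensorSize, "grid does not cover the tensor");

    // Offset of the last global thread's first element (tid * ScalarPerVector, as in Run).
    constexpr int kLastThreadOffset = (kGridSize * kBlockSize - 1) * kScalarPerVector;
    std::printf("grid = %d, last thread starts at %d of %d elements\n",
                kGridSize, kLastThreadOffset, kTensorSize);
    return 0;
}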