Unverified Commit 30072aec authored by Chao Liu's avatar Chao Liu Committed by GitHub
Browse files

Restructure gridwise and blockwise GEMM, add tensor contraction and FWD-v4r5 (#36)

* experimenting magic number division

* overhauling fwd-v4r4 to clearly reflect transformation graph

* added fwd-v4r5

* bug fix for make_dynamic_naive_tensor_descriptor_aligned_v2

* bug fix and added sanity-check in transform_dynamic_tensor_descriptor

* added conv_driver_v2
parent 71d6b19d
...@@ -74,7 +74,7 @@ __host__ __device__ constexpr auto integer_divide_floor(X x, Y y) ...@@ -74,7 +74,7 @@ __host__ __device__ constexpr auto integer_divide_floor(X x, Y y)
template <class X, class Y> template <class X, class Y>
__host__ __device__ constexpr auto integer_divide_ceil(X x, Y y) __host__ __device__ constexpr auto integer_divide_ceil(X x, Y y)
{ {
return (x + y - 1) / y; return (x + y - Number<1>{}) / y;
} }
template <class X, class Y> template <class X, class Y>
......
...@@ -26,5 +26,11 @@ __host__ __device__ constexpr auto generate_sequence_v2(F&& f, Number<N>) ...@@ -26,5 +26,11 @@ __host__ __device__ constexpr auto generate_sequence_v2(F&& f, Number<N>)
typename arithmetic_sequence_gen<0, N, 1>::type{}); typename arithmetic_sequence_gen<0, N, 1>::type{});
} }
// Convert a Tuple of compile-time Number<Is> wrappers into the equivalent
// Sequence<Is...> type (value-level tuple of integral constants ->
// type-level integer sequence). The argument is used only for deduction.
template <index_t... Is>
__host__ __device__ constexpr auto to_sequence(Tuple<Number<Is>...>)
{
    return Sequence<Is...>{};
}
} // namespace ck } // namespace ck
#endif #endif
...@@ -99,7 +99,8 @@ __host__ __device__ void print_multi_index(const Tuple<Xs...>& x) ...@@ -99,7 +99,8 @@ __host__ __device__ void print_multi_index(const Tuple<Xs...>& x)
printf("{"); printf("{");
printf("MultiIndex, "); printf("MultiIndex, ");
printf("size %d,", index_t{sizeof...(Xs)}); printf("size %d,", index_t{sizeof...(Xs)});
static_for<0, sizeof...(Xs), 1>{}([&](auto i) { printf("%d ", index_t{x.At(i)}); }); static_for<0, sizeof...(Xs), 1>{}(
[&](auto i) { printf("%d ", static_cast<index_t>(x.At(i))); });
printf("}"); printf("}");
} }
......
...@@ -16,6 +16,7 @@ install(TARGETS host LIBRARY DESTINATION lib) ...@@ -16,6 +16,7 @@ install(TARGETS host LIBRARY DESTINATION lib)
if(DEVICE_BACKEND STREQUAL "AMD") if(DEVICE_BACKEND STREQUAL "AMD")
set(CONV_SOURCE src/conv_driver.cpp) set(CONV_SOURCE src/conv_driver.cpp)
set(CONV_V2_SOURCE src/conv_driver_v2.cpp)
set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cpp) set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cpp)
elseif(DEVICE_BACKEND STREQUAL "NVIDIA") elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
set(CONV_SOURCE src/conv_driver.cu) set(CONV_SOURCE src/conv_driver.cu)
...@@ -23,7 +24,9 @@ elseif(DEVICE_BACKEND STREQUAL "NVIDIA") ...@@ -23,7 +24,9 @@ elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
endif() endif()
add_executable(conv_driver ${CONV_SOURCE}) add_executable(conv_driver ${CONV_SOURCE})
add_executable(conv_driver_v2 ${CONV_V2_SOURCE})
add_executable(conv_bwd_data_driver ${CONV_BWD_DATA_SOURCE}) add_executable(conv_bwd_data_driver ${CONV_BWD_DATA_SOURCE})
target_link_libraries(conv_driver PRIVATE host) target_link_libraries(conv_driver PRIVATE host)
target_link_libraries(conv_driver_v2 PRIVATE host)
target_link_libraries(conv_bwd_data_driver PRIVATE host) target_link_libraries(conv_bwd_data_driver PRIVATE host)
...@@ -2,15 +2,25 @@ ...@@ -2,15 +2,25 @@
#define CONV_COMMON_HPP #define CONV_COMMON_HPP
#include "tensor_descriptor.hpp" #include "tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor.hpp"
// Tensor memory layouts recognized by the convolution drivers.
// N = batch, C = channel, H/W = spatial height/width.
// NOTE(review): the trailing lower-case 'c' in NCHWc/NHWCc presumably denotes
// a vectorized sub-channel dimension (C split into C0 x c) — confirm against
// the kernels that consume these values.
enum ConvTensorLayout
{
    NCHW,
    NHWC,
    CHWN,
    NCHWc,
    NHWCc
};
template <class InDesc, template <class InDesc,
class WeiDesc, class WeiDesc,
class ConvStrides, class ConvStrides,
class ConvDilations, class ConvDilations,
class LowerPads, class LeftPads,
class UpperPads> class RightPads>
constexpr auto get_convolution_output_default_4d_tensor_descriptor( constexpr auto get_convolution_output_default_4d_tensor_descriptor(
InDesc, WeiDesc, ConvStrides, ConvDilations, LowerPads, UpperPads) InDesc, WeiDesc, ConvStrides, ConvDilations, LeftPads, RightPads)
{ {
using namespace ck; using namespace ck;
...@@ -35,21 +45,69 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor( ...@@ -35,21 +45,69 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor(
constexpr index_t Y = wei_desc.GetLength(I2); constexpr index_t Y = wei_desc.GetLength(I2);
constexpr index_t X = wei_desc.GetLength(I3); constexpr index_t X = wei_desc.GetLength(I3);
constexpr index_t HPadLow = LowerPads{}.Get(I0); constexpr index_t LeftPadH = LeftPads{}.Get(I0);
constexpr index_t WPadLow = LowerPads{}.Get(I1); constexpr index_t LeftPadW = LeftPads{}.Get(I1);
constexpr index_t HPadUp = UpperPads{}.Get(I0); constexpr index_t RightPadH = RightPads{}.Get(I0);
constexpr index_t WPadUp = UpperPads{}.Get(I1); constexpr index_t RightPadW = RightPads{}.Get(I1);
constexpr index_t YEff = (Y - 1) * ConvDilations{}[0] + 1; constexpr index_t YEff = (Y - 1) * ConvDilations{}[0] + 1;
constexpr index_t XEff = (X - 1) * ConvDilations{}[1] + 1; constexpr index_t XEff = (X - 1) * ConvDilations{}[1] + 1;
constexpr index_t Ho = (Hi + HPadLow + HPadUp - YEff) / ConvStrides{}[0] + 1; constexpr index_t Ho = (Hi + LeftPadH + RightPadH - YEff) / ConvStrides{}[0] + 1;
constexpr index_t Wo = (Wi + WPadLow + WPadUp - XEff) / ConvStrides{}[1] + 1; constexpr index_t Wo = (Wi + LeftPadW + RightPadW - XEff) / ConvStrides{}[1] + 1;
return make_native_tensor_descriptor_packed(Sequence<N, K, Ho, Wo>{}); return make_native_tensor_descriptor_packed(Sequence<N, K, Ho, Wo>{});
} }
// Compute the default 4D (N, K, Ho, Wo) output descriptor of a 2D convolution
// from dynamic (run-time) input/weight descriptors.
//
// in_desc        : input descriptor, dimension order [N, C, Hi, Wi]
// wei_desc       : weight descriptor, dimension order [K, C, Y, X]
// conv_strides   : [stride_h, stride_w]
// conv_dilations : [dilation_h, dilation_w]
// left_pads      : [left_pad_h, left_pad_w] applied before the input
// right_pads     : [right_pad_h, right_pad_w] applied after the input
//
// Returns a packed dynamic descriptor with lengths (N, K, Ho, Wo), where
//   YEff = (Y - 1) * dilation_h + 1
//   Ho   = (Hi + LeftPadH + RightPadH - YEff) / stride_h + 1
// and analogously for Wo.
template <typename... InDesc,
          typename... WeiDesc,
          typename ConvStrides,
          typename ConvDilations,
          typename LeftPads,
          typename RightPads>
constexpr auto get_convolution_output_default_4d_tensor_descriptor(
    const ck::DynamicTensorDescriptor<InDesc...>& in_desc,
    const ck::DynamicTensorDescriptor<WeiDesc...>& wei_desc,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations, // fixed: pass by const reference (was by value,
                                         // inconsistent with all sibling parameters)
    const LeftPads& left_pads,
    const RightPads& right_pads)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    // sanity checks: both tensors are 4D and agree on the channel count C
    assert(in_desc.GetNumOfDimension() == 4);
    assert(wei_desc.GetNumOfDimension() == 4);
    assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1));

    const auto N  = in_desc.GetLength(I0);
    const auto Hi = in_desc.GetLength(I2);
    const auto Wi = in_desc.GetLength(I3);

    const auto K = wei_desc.GetLength(I0);
    const auto Y = wei_desc.GetLength(I2);
    const auto X = wei_desc.GetLength(I3);

    const auto LeftPadH  = left_pads[I0];
    const auto LeftPadW  = left_pads[I1];
    const auto RightPadH = right_pads[I0];
    const auto RightPadW = right_pads[I1];

    // effective filter footprint after dilation
    const auto YEff = (Y - I1) * conv_dilations[I0] + I1;
    const auto XEff = (X - I1) * conv_dilations[I1] + I1;

    const auto Ho = (Hi + LeftPadH + RightPadH - YEff) / conv_strides[I0] + I1;
    const auto Wo = (Wi + LeftPadW + RightPadW - XEff) / conv_strides[I1] + I1;

    return make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho, Wo));
}
template <class InDesc, class WeiDesc, class OutDesc> template <class InDesc, class WeiDesc, class OutDesc>
constexpr std::size_t constexpr std::size_t
calculate_convolution_flops(const InDesc& in_desc, const WeiDesc& wei_desc, const OutDesc& out_desc) calculate_convolution_flops(const InDesc& in_desc, const WeiDesc& wei_desc, const OutDesc& out_desc)
......
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "transform_forward_convolution_into_gemm_v4r5_nchw_kcyx_nkhw.hpp"
#include "driver_dynamic_contraction_v1r1.hpp"
// Host-side driver: forward convolution (input NCHW, weights KCYX, output
// NKHW) implemented as an implicit-GEMM tensor contraction (v4r5 algorithm).
//
// Steps:
//   1. allocate device buffers and upload input/weight (and output) tensors,
//   2. build dynamic tensor descriptors from the given dimension lengths,
//   3. transform the convolution into a contraction problem,
//   4. run the tiled contraction kernel 5 times, printing average time and
//      TFlop/s per run (each run averages `nrepeat` launches),
//   5. copy the result back into out_n_k_ho_wo.
//
// TInWei  : element type of input and weight tensors
// TAcc    : accumulator type used inside the GEMM
// TOut    : element type of the output tensor
// nrepeat : number of kernel launches averaged per timing measurement
template <typename TInWei,
          typename TAcc,
          typename TOut,
          typename InLengths,
          typename WeiLengths,
          typename OutLengths,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw(
    const InLengths& in_n_c_hi_wi_lengths,
    const WeiLengths& wei_k_c_y_x_lengths,
    const OutLengths& out_n_k_ho_wo_lengths,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
    const InRightPads& in_right_pads,
    const Tensor<TInWei>& in_n_c_hi_wi,
    const Tensor<TInWei>& wei_k_c_y_x,
    Tensor<TOut>& out_n_k_ho_wo,
    ck::index_t nrepeat)
{
    using namespace ck;

    std::cout << __func__ << std::endl;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};

    // device buffers sized by element space of each host tensor descriptor
    DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
    DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
    DeviceMem out_n_k_ho_wo_device_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());

    in_n_c_hi_wi_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
    wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data());
    // output is uploaded too, so the device buffer starts with defined contents
    out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data());

    // dynamic (run-time) packed descriptors built from the dimension lengths
    const auto in_n_c_hi_wi_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
    const auto wei_k_c_y_x_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
    const auto out_n_k_ho_wo_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);

    // Compile-time tuning configurations; exactly one preprocessor branch is
    // active. Each selects block/thread tile sizes and transfer vector widths.
#if 1
    // cdata = 64, BlockSize = 256, [8, 1, 128] * [8, 4, 32] = [1, 128, 4, 32]
    constexpr index_t BlockSize = 256;

    constexpr index_t N0 = 4;

    constexpr index_t GemmGM1PerBlockGM11 = 128;
    constexpr index_t GemmGN1PerBlockGN11 = 32;
    constexpr index_t GemmKPerBlock = 8;

    constexpr index_t GemmM1PerThreadM111 = 4;
    constexpr index_t GemmN1PerThreadN111 = 4;
    constexpr index_t GemmKPerThread = 1;

    constexpr index_t GemmM11N11ThreadClusterM1101 = 2;
    constexpr index_t GemmM11N11ThreadClusterN1101 = 2;
    constexpr index_t GemmM11N11ThreadClusterM1100 = 8;
    constexpr index_t GemmM11N11ThreadClusterN1100 = 8;

    using GemmABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11   = Sequence<4, 1, 1, 1>;
    using GemmABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11 = Sequence<2, 1, 1, 128>;

    constexpr index_t GemmABlockTransferSrcScalarPerVector_GK   = 4;
    constexpr index_t GemmABlockTransferDstScalarPerVector_GM11 = 1;

    using GemmBBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11   = Sequence<1, 4, 1, 1>;
    using GemmBBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11 = Sequence<8, 1, 1, 32>;

    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GN11 = 1;
    constexpr index_t GemmBBlockTransferDstScalarPerVector_GN11 = 1;

    constexpr index_t GemmCThreadTransferDstScalarPerVector_BN1 = 1;
#elif 1
    // cdata = 64, BlockSize = 256, [8, 1, 128] * [8, 8, 16] = [1, 128, 8, 16]
    constexpr index_t BlockSize = 256;

    constexpr index_t N0 = 8;

    constexpr index_t GemmGM1PerBlockGM11 = 128;
    constexpr index_t GemmGN1PerBlockGN11 = 16;
    constexpr index_t GemmKPerBlock = 8;

    constexpr index_t GemmM1PerThreadM111 = 4;
    constexpr index_t GemmN1PerThreadN111 = 4;
    constexpr index_t GemmKPerThread = 1;

    constexpr index_t GemmM11N11ThreadClusterM1101 = 2;
    constexpr index_t GemmM11N11ThreadClusterN1101 = 2;
    constexpr index_t GemmM11N11ThreadClusterM1100 = 8;
    constexpr index_t GemmM11N11ThreadClusterN1100 = 8;

    using GemmABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11   = Sequence<4, 1, 1, 1>;
    using GemmABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11 = Sequence<2, 1, 1, 128>;

    constexpr index_t GemmABlockTransferSrcScalarPerVector_GK   = 4;
    constexpr index_t GemmABlockTransferDstScalarPerVector_GM11 = 1;

    using GemmBBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11   = Sequence<1, 4, 1, 1>;
    using GemmBBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11 = Sequence<8, 2, 1, 16>;

    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GN11 = 1;
    constexpr index_t GemmBBlockTransferDstScalarPerVector_GN11 = 1;

    constexpr index_t GemmCThreadTransferDstScalarPerVector_BN1 = 1;
#endif

    // rewrite the convolution as a tensor contraction; returns the three grid
    // descriptors (weight = A, input = B, output = C) as a tuple
    const auto descs = transform_forward_convolution_into_contraction_v4r5_nchw_kcyx_nkhw_pad<N0>(
        wei_k_c_y_x_desc,
        in_n_c_hi_wi_desc,
        out_n_k_ho_wo_desc,
        conv_strides,
        conv_dilations,
        in_left_pads,
        in_right_pads);

    const auto wei_gk_gm0_gm1_grid_desc     = descs[I0];
    const auto in_gk_gn0_gn1_grid_desc      = descs[I1];
    const auto out_gm0_gm1_gn0_gn1_grid_desc = descs[I2];

    // HACK: hacks that control index calculation when iterating over A, B, C matrix
    constexpr auto wei_gk_gm0_gm10_gm11_grid_iterator_hacks =
        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0>{},
                              Sequence<0, 0, 0, 0, 0, 0>{},
                              Sequence<0, 0, 0, 0, 0, 0>{},
                              Sequence<0, 0, 0, 0, 0, 0>{}),
                   make_tuple(Sequence<0, 0, 0, 0, 0, 0>{},
                              Sequence<0, 0, 0, 0, 0, 0>{},
                              Sequence<0, 0, 0, 0, 0, 0>{},
                              Sequence<0, 0, 0, 0, 0, 0>{}));

    constexpr auto in_gk_gn0_gn10_gn11_grid_iterator_hacks =
        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{},
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{},
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}));

    constexpr auto out_gm10_bm0_bm1_gn10_bn0_bn1_grid_iterator_hacks = make_tuple(
        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}),
        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}));

    constexpr auto wei_gk_gm0_gm10_gm11_grid_move_slice_window_iterator_hacks =
        Sequence<0, 0, 0, 0, 0, 0>{};

    constexpr auto in_gk_gn0_gn10_gn11_grid_move_slice_window_iterator_hacks =
        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0>{};

    // run the contraction 5 times; each call internally averages `nrepeat`
    // launches and returns the average time in milliseconds
    for(index_t i = 0; i < 5; ++i)
    {
        float ave_time = driver_dynamic_contraction_v1r1<
            BlockSize,
            TInWei,
            TAcc,
            TOut,
            InMemoryDataOperation::Set,
            decltype(wei_gk_gm0_gm1_grid_desc),
            decltype(in_gk_gn0_gn1_grid_desc),
            decltype(out_gm0_gm1_gn0_gn1_grid_desc),
            GemmGM1PerBlockGM11,
            GemmGN1PerBlockGN11,
            GemmKPerBlock,
            GemmM1PerThreadM111,
            GemmN1PerThreadN111,
            GemmKPerThread,
            GemmM11N11ThreadClusterM1100,
            GemmM11N11ThreadClusterN1100,
            GemmM11N11ThreadClusterM1101,
            GemmM11N11ThreadClusterN1101,
            GemmABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11,
            GemmABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11,
            Sequence<3, 2, 1, 0>, // ABlockTransferThreadClusterArrangeOrder
            Sequence<3, 2, 1, 0>, // ABlockTransferSrcAccessOrder
            0,                    // ABlockTransferSrcVectorDim
            GemmABlockTransferSrcScalarPerVector_GK,
            GemmABlockTransferDstScalarPerVector_GM11,
            false, // don't move back src coordinate after threadwise copy
            GemmBBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11,
            GemmBBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11,
            Sequence<0, 3, 2, 1>, // BBlockTransferThreadClusterArrangeOrder
            Sequence<0, 3, 2, 1>, // BBlockTransferSrcAccessOrder
            3,                    // BBlockTransferSrcVectorDim
            GemmBBlockTransferSrcScalarPerVector_GN11,
            GemmBBlockTransferDstScalarPerVector_GN11,
            false, // don't move back src coordinate after threadwise copy
            Sequence<3, 4, 5, 0, 1, 2>, // CThreadTransferSrcDstAccessOrder
            5,                          // CThreadTransferSrcDstVectorDim
            GemmCThreadTransferDstScalarPerVector_BN1,
            decltype(wei_gk_gm0_gm10_gm11_grid_iterator_hacks),
            decltype(in_gk_gn0_gn10_gn11_grid_iterator_hacks),
            decltype(out_gm10_bm0_bm1_gn10_bn0_bn1_grid_iterator_hacks),
            decltype(wei_gk_gm0_gm10_gm11_grid_move_slice_window_iterator_hacks),
            decltype(in_gk_gn0_gn10_gn11_grid_move_slice_window_iterator_hacks)>(
            static_cast<TInWei*>(wei_k_c_y_x_device_buf.GetDeviceBuffer()),
            static_cast<TInWei*>(in_n_c_hi_wi_device_buf.GetDeviceBuffer()),
            static_cast<TOut*>(out_n_k_ho_wo_device_buf.GetDeviceBuffer()),
            wei_gk_gm0_gm1_grid_desc,
            in_gk_gn0_gn1_grid_desc,
            out_gm0_gm1_gn0_gn1_grid_desc,
            wei_gk_gm0_gm10_gm11_grid_iterator_hacks,
            in_gk_gn0_gn10_gn11_grid_iterator_hacks,
            out_gm10_bm0_bm1_gn10_bn0_bn1_grid_iterator_hacks,
            wei_gk_gm0_gm10_gm11_grid_move_slice_window_iterator_hacks,
            in_gk_gn0_gn10_gn11_grid_move_slice_window_iterator_hacks,
            nrepeat);

        // TFlop/s = flops / 1e9 / milliseconds
        float perf = (float)calculate_convolution_flops(
                         in_n_c_hi_wi_desc, wei_k_c_y_x_desc, out_n_k_ho_wo_desc) /
                     (std::size_t(1000) * 1000 * 1000) / ave_time;

        std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
    }

    // copy result back to host
    out_n_k_ho_wo_device_buf.FromDevice(out_n_k_ho_wo.mData.data());
}
...@@ -4,97 +4,64 @@ ...@@ -4,97 +4,64 @@
#include "driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp" #include "driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
#include "driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp" #include "driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp"
template <class TInWei, template <typename TInWei,
ck::index_t InWeiVectorSize, ck::index_t InWeiVectorSize,
class TAcc, typename TAcc,
class TOut, typename TOut,
class InDesc, typename InLengths,
class WeiDesc, typename WeiLengths,
class OutDesc, typename OutLengths,
class ConvStrides, typename ConvStrides,
class ConvDilations, typename ConvDilations,
class InLeftPads, typename InLeftPads,
class InRightPads> typename InRightPads>
void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw( void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
InDesc, const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
const OutLengths& out_n_k_ho_wo_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c_hi_wi, const Tensor<TInWei>& in_n_c_hi_wi,
WeiDesc,
const Tensor<TInWei>& wei_k_c_y_x, const Tensor<TInWei>& wei_k_c_y_x,
OutDesc,
Tensor<TOut>& out_n_k_ho_wo, Tensor<TOut>& out_n_k_ho_wo,
ConvStrides,
ConvDilations,
InLeftPads,
InRightPads,
ck::index_t nrepeat) ck::index_t nrepeat)
{ {
using namespace ck; using namespace ck;
std::cout << "device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw" std::cout << __func__ << std::endl;
<< std::endl;
DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_n_k_ho_wo_device_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{}; constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{}; constexpr auto I3 = Number<3>{};
constexpr auto N = OutDesc::GetLengths()[I0]; const auto N = out_n_k_ho_wo_lengths[I0];
constexpr auto K = OutDesc::GetLengths()[I1]; const auto K = out_n_k_ho_wo_lengths[I1];
constexpr auto C = WeiDesc::GetLengths()[I1]; const auto C = wei_k_c_y_x_lengths[I1];
constexpr auto Hi = InDesc::GetLengths()[I2]; const auto Hi = in_n_c_hi_wi_lengths[I2];
constexpr auto Wi = InDesc::GetLengths()[I3]; const auto Wi = in_n_c_hi_wi_lengths[I3];
constexpr auto Ho = OutDesc::GetLengths()[I2]; const auto Ho = out_n_k_ho_wo_lengths[I2];
constexpr auto Wo = OutDesc::GetLengths()[I3]; const auto Wo = out_n_k_ho_wo_lengths[I3];
constexpr auto Y = WeiDesc::GetLengths()[I2]; const auto Y = wei_k_c_y_x_lengths[I2];
constexpr auto X = WeiDesc::GetLengths()[I3]; const auto X = wei_k_c_y_x_lengths[I3];
constexpr auto C0 = C / Number<InWeiVectorSize>{}; const auto C0 = C / Number<InWeiVectorSize>{};
constexpr auto C1 = Number<InWeiVectorSize>{}; const auto C1 = Number<InWeiVectorSize>{};
constexpr auto K0 = K / Number<InWeiVectorSize>{}; const auto K0 = K / Number<InWeiVectorSize>{};
constexpr auto K1 = Number<InWeiVectorSize>{}; const auto K1 = Number<InWeiVectorSize>{};
#if 0 Tensor<TInWei> in_n_c0_hi_wi_c1(
// run-time variables HostTensorDescriptor(std::initializer_list<index_t>{N, C0, Hi, Wi, C1}));
const auto in_n_c0_hi_wi_desc = Tensor<TInWei> wei_k_c0_y_x_c1(
make_dynamic_naive_tensor_descriptor_packed_v2(make_multi_index(N, C0, Hi, Wi)); HostTensorDescriptor(std::initializer_list<index_t>{K, C0, Y, X, C1}));
const auto wei_k_c0_y_x_desc = Tensor<TOut> out_n_k0_ho_wo_k1(
make_dynamic_naive_tensor_descriptor_packed_v2(make_multi_index(K, C0, Y, X)); HostTensorDescriptor(std::initializer_list<index_t>{N, K0, Ho, Wo, K1}));
const auto out_n_k0_ho_wo_k1_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_multi_index(N, K0, Ho, Wo, K1));
const auto conv_strides = to_multi_index(ConvStrides{});
const auto conv_dilations = to_multi_index(ConvDilations{});
const auto in_left_pads = to_multi_index(InLeftPads{});
const auto in_right_pads = to_multi_index(InRightPads{});
#else
// compile-time variables
const auto in_n_c0_hi_wi_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, C0, Hi, Wi));
const auto wei_k_c0_y_x_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C0, Y, X));
const auto out_n_k0_ho_wo_k1_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1));
const auto conv_strides = sequence_to_tuple_of_number(ConvStrides{});
const auto conv_dilations = sequence_to_tuple_of_number(ConvDilations{});
const auto in_left_pads = sequence_to_tuple_of_number(InLeftPads{});
const auto in_right_pads = sequence_to_tuple_of_number(InRightPads{});
#endif
Tensor<TInWei> in_n_c0_hi_wi_c1(make_HostTensorDescriptor(
make_native_tensor_descriptor_packed(Sequence<N, C0, Hi, Wi, C1>{})));
Tensor<TInWei> wei_k_c0_y_x_c1(make_HostTensorDescriptor(
make_native_tensor_descriptor_packed(Sequence<K, C0, Y, X, C1>{})));
Tensor<TOut> out_n_k0_ho_wo_k1(make_HostTensorDescriptor(
make_native_tensor_descriptor_packed(Sequence<N, K0, Ho, Wo, K1>{})));
auto f_nchw2nc0hwc1 = [&](auto n, auto hi, auto wi, auto c) { auto f_nchw2nc0hwc1 = [&](auto n, auto hi, auto wi, auto c) {
in_n_c0_hi_wi_c1(n, c / InWeiVectorSize, hi, wi, c % InWeiVectorSize) = in_n_c0_hi_wi_c1(n, c / InWeiVectorSize, hi, wi, c % InWeiVectorSize) =
...@@ -109,17 +76,30 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw( ...@@ -109,17 +76,30 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
make_ParallelTensorFunctor(f_nchw2nc0hwc1, N, Hi, Wi, C)(); make_ParallelTensorFunctor(f_nchw2nc0hwc1, N, Hi, Wi, C)();
make_ParallelTensorFunctor(f_kcyx2kc0yxc1, K, Y, X, C)(); make_ParallelTensorFunctor(f_kcyx2kc0yxc1, K, Y, X, C)();
in_n_c_hi_wi_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data()); DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
wei_k_c_y_x_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data()); in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) *
out_n_k0_ho_wo_k1.mDesc.GetElementSpace());
in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
const auto in_n_c0_hi_wi_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, C0, Hi, Wi));
const auto wei_k_c0_y_x_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C0, Y, X));
const auto out_n_k0_ho_wo_k1_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1));
#if 1 #if 1
// cdata = 64, BlockSize = 64, 16x8x32x4 // cdata = 64, BlockSize = 64, 16x8x32x4
constexpr index_t BlockSize = 64; constexpr index_t BlockSize = 64;
constexpr index_t KPerBlock = K; constexpr index_t KPerBlock = 16;
constexpr index_t HoPerBlock = 8; constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 32; constexpr index_t WoPerBlock = 32;
constexpr index_t EPerBlock = C0; constexpr index_t EPerBlock = 1;
constexpr index_t KPerThread = KPerBlock; constexpr index_t KPerThread = KPerBlock;
constexpr index_t HoPerThread = 2; constexpr index_t HoPerThread = 2;
...@@ -134,7 +114,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw( ...@@ -134,7 +114,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
constexpr index_t BThreadTransferSrcScalarPerVector_W = 1; constexpr index_t BThreadTransferSrcScalarPerVector_W = 1;
constexpr index_t CThreadTransferDstScalarPerVector_W = K1; constexpr index_t CThreadTransferDstScalarPerVector_W = 16;
static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, ""); static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, "");
#else #else
...@@ -165,17 +145,28 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw( ...@@ -165,17 +145,28 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
constexpr auto conv_driver = constexpr auto conv_driver =
#if 0 #if 0
DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad< DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
#else #else
DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad< DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
#endif #endif
BlockSize, <BlockSize,
typename vector_type<TInWei, InWeiVectorSize>::type, TAcc, TOut, KPerBlock, typename vector_type<TInWei, InWeiVectorSize>::type,
HoPerBlock, WoPerBlock, EPerBlock, KPerThread, HoPerThread, WoPerThread, TAcc,
EPerThread, ABlockTransferThreadSliceLengths_E_K, TOut,
ABlockTransferThreadClusterLengths_E_K, ABlockTransferSrcScalarPerVector_E, KPerBlock,
ABlockTransferDstScalarPerVector_K, BThreadTransferSrcScalarPerVector_W, HoPerBlock,
CThreadTransferDstScalarPerVector_W > {}; WoPerBlock,
EPerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E_K,
ABlockTransferThreadClusterLengths_E_K,
ABlockTransferSrcScalarPerVector_E,
ABlockTransferDstScalarPerVector_K,
BThreadTransferSrcScalarPerVector_W,
CThreadTransferDstScalarPerVector_W>{};
conv_driver.Run(wei_k_c0_y_x_desc, conv_driver.Run(wei_k_c0_y_x_desc,
in_n_c0_hi_wi_desc, in_n_c0_hi_wi_desc,
...@@ -185,12 +176,12 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw( ...@@ -185,12 +176,12 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
in_left_pads, in_left_pads,
in_right_pads, in_right_pads,
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>( static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
wei_k_c_y_x_device_buf.GetDeviceBuffer()), wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>( static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
in_n_c_hi_wi_device_buf.GetDeviceBuffer()), in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_n_k_ho_wo_device_buf.GetDeviceBuffer())); static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()));
out_n_k_ho_wo_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data()); out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());
auto f_nk0hwk1_to_nkhw = [&](auto n, auto k, auto ho, auto wo) { auto f_nk0hwk1_to_nkhw = [&](auto n, auto k, auto ho, auto wo) {
out_n_k_ho_wo(n, k, ho, wo) = out_n_k_ho_wo(n, k, ho, wo) =
......
...@@ -6,58 +6,94 @@ template <class TIn, ...@@ -6,58 +6,94 @@ template <class TIn,
class TOut, class TOut,
class ConvStrides, class ConvStrides,
class ConvDilations, class ConvDilations,
class LowerPads, class InLeftPads,
class UpperPads> class InRightPads>
void host_direct_convolution(const Tensor<TIn>& in_nchw, void host_direct_convolution(const Tensor<TIn>& in,
const Tensor<TWei>& wei_kcyx, const Tensor<TWei>& wei,
Tensor<TOut>& out_nkhw, Tensor<TOut>& out,
ConvStrides, const ConvStrides& conv_strides,
ConvDilations, const ConvDilations& conv_dilations,
LowerPads, const InLeftPads& in_left_pads,
UpperPads) const InRightPads& in_right_pads,
const ConvTensorLayout layout = ConvTensorLayout::NCHW)
{ {
using namespace ck; using namespace ck;
index_t h_pad_low = LowerPads{}.Get(Number<0>{}); constexpr auto I0 = Number<0>{};
index_t w_pad_low = LowerPads{}.Get(Number<1>{}); constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
auto f = [&](auto n, auto k, auto ho, auto wo) { auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
double v = 0; double v = 0;
for(int c = 0; c < wei_kcyx.mDesc.GetLengths()[1]; ++c) for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
{ {
for(int y = 0; y < wei_kcyx.mDesc.GetLengths()[2]; ++y) for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
{ {
int hi = ho * ConvStrides{}[0] + y * ConvDilations{}[0] - h_pad_low; int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int x = 0; x < wei_kcyx.mDesc.GetLengths()[3]; ++x) for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
{ {
int wi = wo * ConvStrides{}[1] + x * ConvDilations{}[1] - w_pad_low; int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in_nchw.mDesc.GetLengths()[2] && wi >= 0 && if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in_nchw.mDesc.GetLengths()[3]) wi < in.mDesc.GetLengths()[3])
{ {
v += static_cast<const double>(in_nchw(n, c, hi, wi)) * v += static_cast<const double>(in(n, c, hi, wi)) *
static_cast<const double>(wei_kcyx(k, c, y, x)); static_cast<const double>(wei(k, c, y, x));
} }
} }
} }
} }
out_nkhw(n, k, ho, wo) = v; out(n, k, ho, wo) = v;
}; };
auto f_par = make_ParallelTensorFunctor(f, auto f_nhwc = [&](auto n, auto ho, auto wo, auto k) {
out_nkhw.mDesc.GetLengths()[0], double v = 0;
out_nkhw.mDesc.GetLengths()[1], for(int c = 0; c < wei.mDesc.GetLengths()[3]; ++c)
out_nkhw.mDesc.GetLengths()[2], {
out_nkhw.mDesc.GetLengths()[3]); for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
wi < in.mDesc.GetLengths()[2])
{
v += static_cast<const double>(in(n, hi, wi, c)) *
static_cast<const double>(wei(k, y, x, c));
}
}
}
}
out(n, ho, wo, k) = v;
};
f_par(std::thread::hardware_concurrency()); switch(layout)
{
case ConvTensorLayout::NCHW:
make_ParallelTensorFunctor(f_nchw,
out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
break;
case ConvTensorLayout::NHWC:
make_ParallelTensorFunctor(f_nhwc,
out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
break;
default: throw std::runtime_error("wrong! not supported layout");
}
} }
template <class TIn, class TWei, class TOut, class LowerPads, class UpperPads> template <class TIn, class TWei, class TOut, class InLeftPads, class InRightPads>
void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw, void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
const Tensor<TWei>& wei_kcyx, const Tensor<TWei>& wei_kcyx,
Tensor<TOut>& out_nkhw, Tensor<TOut>& out_nkhw,
LowerPads, InLeftPads,
UpperPads) InRightPads)
{ {
using namespace ck; using namespace ck;
...@@ -76,8 +112,8 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw, ...@@ -76,8 +112,8 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
std::size_t HO = out_nkhw.mDesc.GetLengths()[2]; std::size_t HO = out_nkhw.mDesc.GetLengths()[2];
std::size_t WO = out_nkhw.mDesc.GetLengths()[3]; std::size_t WO = out_nkhw.mDesc.GetLengths()[3];
index_t h_pad_low = LowerPads{}.Get(Number<0>{}); index_t h_pad_low = InLeftPads{}.Get(Number<0>{});
index_t w_pad_low = LowerPads{}.Get(Number<1>{}); index_t w_pad_low = InLeftPads{}.Get(Number<1>{});
std::size_t HiPerTile = HoPerTile + Y - 1; std::size_t HiPerTile = HoPerTile + Y - 1;
std::size_t WiPerTile = WoPerTile + X - 1; std::size_t WiPerTile = WoPerTile + X - 1;
......
...@@ -271,19 +271,20 @@ struct Tensor ...@@ -271,19 +271,20 @@ struct Tensor
std::vector<T> mData; std::vector<T> mData;
}; };
void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout) template <typename X>
HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens) : mLens(lens)
{ {
os << "dim " << desc.GetNumOfDimension() << ", "; this->CalculateStrides();
}
os << "lengths {";
LogRange(os, desc.GetLengths(), ", ");
os << "}, ";
os << "strides {"; template <typename X, typename Y>
LogRange(os, desc.GetStrides(), ", "); HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens, std::vector<Y> strides)
os << "}" << std::endl; : mLens(lens), mStrides(strides)
{
} }
void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout);
template <class T> template <class T>
void check_error(const Tensor<T>& ref, const Tensor<T>& result) void check_error(const Tensor<T>& ref, const Tensor<T>& result)
{ {
......
...@@ -44,7 +44,7 @@ struct GeneratorTensor_Checkboard ...@@ -44,7 +44,7 @@ struct GeneratorTensor_Checkboard
template <class... Ts> template <class... Ts>
double operator()(Ts... Xs) const double operator()(Ts... Xs) const
{ {
std::array<ck::index_t, sizeof...(Ts)> dims = {{Xs...}}; std::array<ck::index_t, sizeof...(Ts)> dims = {{static_cast<ck::index_t>(Xs)...}};
return std::accumulate(dims.begin(), return std::accumulate(dims.begin(),
dims.end(), dims.end(),
true, true,
......
This diff is collapsed.
This diff is collapsed.
...@@ -3,18 +3,6 @@ ...@@ -3,18 +3,6 @@
#include "host_tensor.hpp" #include "host_tensor.hpp"
template <typename X>
HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens) : mLens(lens)
{
this->CalculateStrides();
}
template <typename X, typename Y>
HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens, std::vector<Y> strides)
: mLens(lens), mStrides(strides)
{
}
void HostTensorDescriptor::CalculateStrides() void HostTensorDescriptor::CalculateStrides()
{ {
mStrides.clear(); mStrides.clear();
...@@ -45,3 +33,16 @@ std::size_t HostTensorDescriptor::GetElementSpace() const ...@@ -45,3 +33,16 @@ std::size_t HostTensorDescriptor::GetElementSpace() const
const std::vector<std::size_t>& HostTensorDescriptor::GetLengths() const { return mLens; } const std::vector<std::size_t>& HostTensorDescriptor::GetLengths() const { return mLens; }
const std::vector<std::size_t>& HostTensorDescriptor::GetStrides() const { return mStrides; } const std::vector<std::size_t>& HostTensorDescriptor::GetStrides() const { return mStrides; }
// Print a human-readable summary of a HostTensorDescriptor to `os` in the form
// "dim <n>, lengths {l0, l1, ...}, strides {s0, s1, ...}" followed by a newline.
void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os)
{
    os << "dim " << desc.GetNumOfDimension() << ", ";

    os << "lengths {";
    LogRange(os, desc.GetLengths(), ", ");
    os << "}, ";

    os << "strides {";
    LogRange(os, desc.GetStrides(), ", ");
    os << "}" << std::endl;
}
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment