"vscode:/vscode.git/clone" did not exist on "02e7eec91131cec2050a0bfee3eb07f37298491f"
Commit 0a35a4e1 authored by Chao Liu's avatar Chao Liu
Browse files

refactor host code

parent a34977df
...@@ -93,8 +93,9 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw ...@@ -93,8 +93,9 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor( constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor(
Sequence<KPerThread, CPerThread, Y, X>{}, wei_kcyx_block_desc.GetStrides()); Sequence<KPerThread, CPerThread, Y, X>{}, wei_kcyx_block_desc.GetStrides());
constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor( constexpr auto out_nkhw_thread_desc =
in_nchw_thread_block_desc, wei_kcyx_thread_block_desc); get_convolution_output_default_4d_tensor_descriptor_deprecated(
in_nchw_thread_block_desc, wei_kcyx_thread_block_desc);
// register // register
Float p_out_thread[out_nkhw_thread_desc.GetElementSpace()]; Float p_out_thread[out_nkhw_thread_desc.GetElementSpace()];
......
...@@ -60,7 +60,7 @@ __host__ __device__ constexpr auto ...@@ -60,7 +60,7 @@ __host__ __device__ constexpr auto
template <typename... Ts> template <typename... Ts>
__host__ __device__ constexpr auto __host__ __device__ constexpr auto
make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated<Ts...>) make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated<Ts...>)
{ {
using TDesc = ConstantTensorDescriptor_deprecated<Ts...>; using TDesc = ConstantTensorDescriptor_deprecated<Ts...>;
static_assert(TDesc::GetNumOfDimension() == 2, "wrong"); static_assert(TDesc::GetNumOfDimension() == 2, "wrong");
......
...@@ -228,7 +228,7 @@ struct TensorCoordinate ...@@ -228,7 +228,7 @@ struct TensorCoordinate
private: private:
template <typename... Ts> template <typename... Ts>
__host__ __device__ static constexpr auto __host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>) MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
{ {
return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>( return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
make_zero_array<index_t, TensorDesc::GetNumOfDimension()>()); make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
...@@ -236,7 +236,7 @@ struct TensorCoordinate ...@@ -236,7 +236,7 @@ struct TensorCoordinate
template <typename... Ts> template <typename... Ts>
__host__ __device__ static constexpr auto __host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>) MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
{ {
return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>( return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
make_zero_array<index_t, TensorDesc::GetNumOfDimension()>()); make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
......
...@@ -327,14 +327,14 @@ struct TensorCoordinate_deprecated ...@@ -327,14 +327,14 @@ struct TensorCoordinate_deprecated
private: private:
template <class... Ts> template <class... Ts>
__host__ __device__ static constexpr auto __host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated<Ts...>) MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated<Ts...>)
{ {
return NormalTensorCoordinate_deprecated<ConstantTensorDescriptor_deprecated<Ts...>>(); return NormalTensorCoordinate_deprecated<ConstantTensorDescriptor_deprecated<Ts...>>();
} }
template <class... Ts> template <class... Ts>
__host__ __device__ static constexpr auto __host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated<Ts...>) MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated<Ts...>)
{ {
return MergedTensorCoordinate_deprecated< return MergedTensorCoordinate_deprecated<
ConstantMergedTensorDescriptor_deprecated<Ts...>>(); ConstantMergedTensorDescriptor_deprecated<Ts...>>();
......
...@@ -64,10 +64,10 @@ template <typename LowerTensorDescriptor, ...@@ -64,10 +64,10 @@ template <typename LowerTensorDescriptor,
index_t... LowerDimensionIds, index_t... LowerDimensionIds,
index_t... UpperDimensionIds> index_t... UpperDimensionIds>
__host__ __device__ constexpr auto __host__ __device__ constexpr auto
reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor, reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor,
Sequence<LowerLengths...>, Sequence<LowerLengths...>,
Sequence<LowerDimensionIds...>, Sequence<LowerDimensionIds...>,
Sequence<UpperDimensionIds...>) Sequence<UpperDimensionIds...>)
{ {
return TransformedTensorDescriptor<LowerTensorDescriptor, return TransformedTensorDescriptor<LowerTensorDescriptor,
Tuple<PassThrough<LowerLengths>...>, Tuple<PassThrough<LowerLengths>...>,
...@@ -78,7 +78,7 @@ __host__ __device__ constexpr auto ...@@ -78,7 +78,7 @@ __host__ __device__ constexpr auto
// reorder a NativeTensorDescriptor // reorder a NativeTensorDescriptor
template <typename... Ts, typename MapLower2Upper> template <typename... Ts, typename MapLower2Upper>
__host__ __device__ constexpr auto __host__ __device__ constexpr auto
reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor<Ts...>, MapLower2Upper) reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor<Ts...>, MapLower2Upper)
{ {
static_assert(is_valid_sequence_map<MapLower2Upper>{}, static_assert(is_valid_sequence_map<MapLower2Upper>{},
"wrong! MapLower2Upper is not a valid map"); "wrong! MapLower2Upper is not a valid map");
...@@ -96,7 +96,7 @@ __host__ __device__ constexpr auto ...@@ -96,7 +96,7 @@ __host__ __device__ constexpr auto
// reorder a TransformedTensorDescriptor // reorder a TransformedTensorDescriptor
template <typename... Ts, typename MapLower2Upper> template <typename... Ts, typename MapLower2Upper>
__host__ __device__ constexpr auto __host__ __device__ constexpr auto
reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor<Ts...>, MapLower2Upper) reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor<Ts...>, MapLower2Upper)
{ {
static_assert(is_valid_sequence_map<MapLower2Upper>{}, static_assert(is_valid_sequence_map<MapLower2Upper>{},
"wrong! MapLower2Upper is not a valid map"); "wrong! MapLower2Upper is not a valid map");
......
...@@ -2,10 +2,16 @@ ...@@ -2,10 +2,16 @@
#define CONV_COMMON_HPP #define CONV_COMMON_HPP
#include "ConstantTensorDescriptor_deprecated.hpp" #include "ConstantTensorDescriptor_deprecated.hpp"
#include "tensor_descriptor.hpp"
// this is ugly, only for 4d template <class InDesc,
template <class InDesc, class WeiDesc> class WeiDesc,
constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDesc) class ConvStrides,
class ConvDilations,
class LowerPads,
class UpperPads>
constexpr auto get_convolution_output_default_4d_tensor_descriptor_deprecated(
InDesc, WeiDesc, ConvStrides, ConvDilations, LowerPads, UpperPads)
{ {
using namespace ck; using namespace ck;
...@@ -22,18 +28,27 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDe ...@@ -22,18 +28,27 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDe
static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1), static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1),
"input & weight dimension not consistent"); "input & weight dimension not consistent");
constexpr auto N = in_desc.GetLength(I0); constexpr index_t N = in_desc.GetLength(I0);
constexpr auto HI = in_desc.GetLength(I2); constexpr index_t Hi = in_desc.GetLength(I2);
constexpr auto WI = in_desc.GetLength(I3); constexpr index_t Wi = in_desc.GetLength(I3);
constexpr index_t K = wei_desc.GetLength(I0);
constexpr index_t Y = wei_desc.GetLength(I2);
constexpr index_t X = wei_desc.GetLength(I3);
constexpr index_t HPadLow = LowerPads{}.Get(I0);
constexpr index_t WPadLow = LowerPads{}.Get(I1);
constexpr auto K = wei_desc.GetLength(I0); constexpr index_t HPadUp = UpperPads{}.Get(I0);
constexpr auto Y = wei_desc.GetLength(I2); constexpr index_t WPadUp = UpperPads{}.Get(I1);
constexpr auto X = wei_desc.GetLength(I3);
constexpr auto HO = HI + 1 - Y; constexpr index_t YEff = (Y - 1) * ConvDilations{}[0] + 1;
constexpr auto WO = WI + 1 - X; constexpr index_t XEff = (X - 1) * ConvDilations{}[1] + 1;
constexpr index_t Ho = (Hi + HPadLow + HPadUp - YEff) / ConvStrides{}[0] + 1;
constexpr index_t Wo = (Wi + WPadLow + WPadUp - XEff) / ConvStrides{}[1] + 1;
return make_ConstantTensorDescriptor_packed(Sequence<N, K, HO, WO>{}); return make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});
} }
template <class InDesc, template <class InDesc,
...@@ -42,7 +57,7 @@ template <class InDesc, ...@@ -42,7 +57,7 @@ template <class InDesc,
class ConvDilations, class ConvDilations,
class LowerPads, class LowerPads,
class UpperPads> class UpperPads>
constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor( constexpr auto get_convolution_output_default_4d_tensor_descriptor(
InDesc, WeiDesc, ConvStrides, ConvDilations, LowerPads, UpperPads) InDesc, WeiDesc, ConvStrides, ConvDilations, LowerPads, UpperPads)
{ {
using namespace ck; using namespace ck;
...@@ -80,7 +95,7 @@ constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor( ...@@ -80,7 +95,7 @@ constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor(
constexpr index_t Ho = (Hi + HPadLow + HPadUp - YEff) / ConvStrides{}[0] + 1; constexpr index_t Ho = (Hi + HPadLow + HPadUp - YEff) / ConvStrides{}[0] + 1;
constexpr index_t Wo = (Wi + WPadLow + WPadUp - XEff) / ConvStrides{}[1] + 1; constexpr index_t Wo = (Wi + WPadLow + WPadUp - XEff) / ConvStrides{}[1] + 1;
return make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{}); return make_native_tensor_descriptor_packed(Sequence<N, K, Ho, Wo>{});
} }
template <class InDesc, class WeiDesc, class OutDesc> template <class InDesc, class WeiDesc, class OutDesc>
......
#pragma once
#include "tensor.hpp"
#include "common_header.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
#include "tensor_descriptor.hpp"
template <typename ConstTensorDesc, std::size_t... Is>
auto make_TensorDescriptor_impl(ConstTensorDesc, std::integer_sequence<std::size_t, Is...>)
{
std::initializer_list<std::size_t> lengths = {ConstTensorDesc::GetLength(Is)...};
std::initializer_list<std::size_t> strides = {ConstTensorDesc::GetStride(Is)...};
return TensorDescriptor(lengths, strides);
}
template <typename ConstTensorDesc>
auto make_TensorDescriptor(ConstTensorDesc)
{
return make_TensorDescriptor_impl(
ConstTensorDesc{},
std::make_integer_sequence<std::size_t, ConstTensorDesc::GetNumOfDimension()>{});
}
template <typename ConstTensorDesc>
void ostream_ConstantTensorDescriptor(ConstTensorDesc, std::ostream& os = std::cout)
{
ostream_TensorDescriptor(make_TensorDescriptor(ConstTensorDesc{}), os);
}
#pragma once #pragma once
#include "tensor.hpp" #include "tensor.hpp"
#include "common_header.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
// this is ugly, only for 4d
template <class TConstTensorDesc>
void ostream_ConstantTensorDescriptor(TConstTensorDesc, std::ostream& os = std::cout)
{
using namespace ck;
static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4");
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto desc = TConstTensorDesc{};
os << "Lengths: {" << desc.GetLength(I0) << ", " << desc.GetLength(I1) << ", "
<< desc.GetLength(I2) << ", " << desc.GetLength(I3) << "}, "
<< "Strides: {" << desc.GetStride(I0) << ", " << desc.GetStride(I1) << ", "
<< desc.GetStride(I2) << ", " << desc.GetStride(I3) << "}" << std::endl;
}
// this is ugly, only for 4d
template <class TConstTensorDesc>
auto make_TensorDescriptor(TConstTensorDesc)
{
using namespace ck;
static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4");
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto desc = TConstTensorDesc{};
std::initializer_list<index_t> lengths = {
desc.GetLength(I0), desc.GetLength(I1), desc.GetLength(I2), desc.GetLength(I3)};
std::initializer_list<index_t> strides = {
desc.GetStride(I0), desc.GetStride(I1), desc.GetStride(I2), desc.GetStride(I3)};
return TensorDescriptor(lengths, strides);
}
template <class TIn, template <class TIn,
class TWei, class TWei,
...@@ -331,25 +287,3 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw, ...@@ -331,25 +287,3 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
make_ParallelTensorFunctor(f_out_hold, N, K, HTile, WTile)(num_thread); make_ParallelTensorFunctor(f_out_hold, N, K, HTile, WTile)(num_thread);
make_ParallelTensorFunctor(f_out, N, K, HTile, WTile)(num_thread); make_ParallelTensorFunctor(f_out, N, K, HTile, WTile)(num_thread);
} }
template <class T>
void check_error(const Tensor<T>& ref, const Tensor<T>& result)
{
float error = 0;
float max_diff = -1;
float ref_value = 0, result_value = 0;
for(int i = 0; i < ref.mData.size(); ++i)
{
error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
if(max_diff < diff)
{
max_diff = diff;
ref_value = ref.mData[i];
result_value = result.mData[i];
}
}
std::cout << "error: " << error << std::endl;
std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
}
...@@ -68,10 +68,12 @@ auto construct_f_unpack_args(F, T args) ...@@ -68,10 +68,12 @@ auto construct_f_unpack_args(F, T args)
struct TensorDescriptor struct TensorDescriptor
{ {
TensorDescriptor() = delete; TensorDescriptor() = delete;
TensorDescriptor(std::initializer_list<std::size_t> lens);
TensorDescriptor(std::initializer_list<std::size_t> lens, template <typename X>
std::initializer_list<std::size_t> strides); TensorDescriptor(std::vector<X> lens);
TensorDescriptor(std::vector<std::size_t> lens, std::vector<std::size_t> strides);
template <typename X, typename Y>
TensorDescriptor(std::vector<X> lens, std::vector<Y> strides);
void CalculateStrides(); void CalculateStrides();
...@@ -269,4 +271,39 @@ struct Tensor ...@@ -269,4 +271,39 @@ struct Tensor
std::vector<T> mData; std::vector<T> mData;
}; };
void ostream_TensorDescriptor(const TensorDescriptor& desc, std::ostream& os = std::cout)
{
os << "dim " << desc.GetNumOfDimension() << ", ";
os << "lengths {";
LogRange(os, desc.GetLengths(), ", ");
os << "}, ";
os << "strides {";
LogRange(os, desc.GetStrides(), ", ");
os << "}" << std::endl;
}
template <class T>
void check_error(const Tensor<T>& ref, const Tensor<T>& result)
{
float error = 0;
float max_diff = -1;
float ref_value = 0, result_value = 0;
for(int i = 0; i < ref.mData.size(); ++i)
{
error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
if(max_diff < diff)
{
max_diff = diff;
ref_value = ref.mData[i];
result_value = result.mData[i];
}
}
std::cout << "error: " << error << std::endl;
std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
}
#endif #endif
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
#include "tensor_generator.hpp" #include "tensor_generator.hpp"
#include "conv_common.hpp" #include "conv_common.hpp"
#include "host_conv.hpp" #include "host_conv.hpp"
#include "device_tensor.hpp"
//#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp" //#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp" //#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
//#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp" //#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp"
...@@ -24,7 +25,6 @@ ...@@ -24,7 +25,6 @@
#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated.hpp" #include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated.hpp"
#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp" #include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
using namespace ck; using namespace ck;
...@@ -315,7 +315,7 @@ int main(int argc, char* argv[]) ...@@ -315,7 +315,7 @@ int main(int argc, char* argv[])
auto in_nchw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, C, HI, WI>{}); auto in_nchw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, C, HI, WI>{});
auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{}); auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor( auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor_deprecated(
in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{}); in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});
ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: "); ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
......
...@@ -3,12 +3,14 @@ ...@@ -3,12 +3,14 @@
#include "tensor.hpp" #include "tensor.hpp"
TensorDescriptor::TensorDescriptor(std::initializer_list<std::size_t> lens) : mLens(lens) template <typename X>
TensorDescriptor::TensorDescriptor(std::vector<X> lens) : mLens(lens)
{ {
this->CalculateStrides(); this->CalculateStrides();
} }
TensorDescriptor::TensorDescriptor(std::vector<std::size_t> lens, std::vector<std::size_t> strides) template <typename X, typename Y>
TensorDescriptor::TensorDescriptor(std::vector<X> lens, std::vector<Y> strides)
: mLens(lens), mStrides(strides) : mLens(lens), mStrides(strides)
{ {
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment