Commit 32d485dd authored by Chao Liu

refactor DynamicBuffer

parent b6e43b25
@@ -1487,20 +1487,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4
         const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
             src_desc, src_data_coord);
-#if 0
-        // TODO: this is slooooooooow due to VGPR over-allocation
         src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
-            is_src_valid ? src_buf.template AsType<src_vector_t>()[src_data_coord.GetOffset() /
-                                                                   SrcScalarPerVector]
+            is_src_valid ? src_buf.template Get<src_vector_t>(src_data_coord.GetOffset())
                          : src_vector_t{0};
-#else
-        // TODO: this is workaround. this has normal performance but it's hacky
-        src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
-            is_src_valid
-                ? *reinterpret_cast<const src_vector_t*>(&(reinterpret_cast<const SrcData*>(
-                      src_buf.p_data_)[src_data_coord.GetOffset()]))
-                : src_vector_t{0};
-#endif
         // copy data from src_tmp_vector to dst_tmp_vector (data cast data from SrcData to
         // DstData)
......
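The hunk above removes both the slow PointerWrapper-based load and the hand-rolled #else workaround, routing the access through the buffer's new Get accessor instead. The correspondence, written out as a sketch (not repository code; it assumes src_buf is a DynamicBuffer over SrcData, which satisfies the enable_if constraint shown in the next hunk):

    // Sketch only: what the new call expands to, per the Get definition below.
    //     src_buf.template Get<src_vector_t>(src_data_coord.GetOffset())
    // is equivalent to
    //     *reinterpret_cast<const src_vector_t*>(&src_buf.p_data_[src_data_coord.GetOffset()]);
    // i.e. the same address computation the removed #else branch spelled out by hand,
    // but without going through PointerWrapper, whose VGPR over-allocation the removed
    // TODO comment complains about.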
@@ -29,28 +29,22 @@ struct DynamicBuffer
 {
     using type = T;
-    template <typename X>
-    struct PointerWrapper
-    {
-        X* p_;
-        __host__ __device__ constexpr const X& operator[](index_t i) const { return p_[i]; }
-        __host__ __device__ constexpr X& operator()(index_t i) { return p_[i]; }
-    };
     T* p_data_;
     __host__ __device__ constexpr DynamicBuffer(T* p_data) : p_data_{p_data} {}
     __host__ __device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; }
     __host__ __device__ constexpr T& operator()(index_t i) { return p_data_[i]; }
     template <typename X,
               typename std::enable_if<
                   is_same<typename scalar_type<remove_cv_t<remove_reference_t<X>>>::type,
                           typename scalar_type<remove_cv_t<remove_reference_t<T>>>::type>::value,
                   bool>::type = false>
-    __host__ __device__ constexpr const auto AsType() const
+    __host__ __device__ constexpr const auto Get(index_t i) const
     {
-        return PointerWrapper<X>{reinterpret_cast<X*>(p_data_)};
+        return *reinterpret_cast<const X*>(&p_data_[i]);
     }
     template <typename X,
@@ -58,9 +52,9 @@ struct DynamicBuffer
                   is_same<typename scalar_type<remove_cv_t<remove_reference_t<X>>>::type,
                           typename scalar_type<remove_cv_t<remove_reference_t<T>>>::type>::value,
                   bool>::type = false>
-    __host__ __device__ constexpr auto AsType()
+    __host__ __device__ void Set(index_t i, const X& x)
     {
-        return PointerWrapper<X>{reinterpret_cast<X*>(p_data_)};
+        *reinterpret_cast<X*>(&p_data_[i]) = x;
     }
     __host__ __device__ static constexpr bool IsStaticBuffer() { return false; }
......
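Taken together, the two DynamicBuffer hunks replace the PointerWrapper-returning AsType<X>() accessors with direct Get<X>(i) / Set<X>(i, x) members. One detail worth noting, visible in the kernel hunk further up: AsType<X>() was indexed in units of X (hence the GetOffset() / SrcScalarPerVector in the removed code), while Get and Set take the offset in units of the underlying element type T and reinterpret &p_data_[i] as X. A minimal usage sketch follows; it is not part of this commit, it assumes DynamicBuffer is parameterized by its element type (the template header is outside the excerpt), and the names are invented. X = float is used so the scalar_type constraint is trivially satisfied; in the kernel X is a vector type such as src_vector_t.

    // Hypothetical usage of the refactored DynamicBuffer (illustrative only).
    float data[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    DynamicBuffer<float> buf(data);

    // New interface: offset is in elements of T; the address &data[4] is reinterpreted as X.
    float v = buf.Get<float>(4);   // reads *reinterpret_cast<const float*>(&data[4])
    buf.Set<float>(4, v + 1.0f);   // *reinterpret_cast<float*>(&data[4]) = v + 1.0f

    // Old interface (removed): AsType<X>() returned a PointerWrapper indexed in units of X.
    //     float u = buf.AsType<float>()[4];
    //     buf.AsType<float>()(4) = u + 1.0f;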
@@ -14,7 +14,7 @@
 #define CK_DEVICE_BACKEND_AMD 1
 // GPU ID
-#if 0
+#if 1
 #define CK_AMD_GPU_GFX906 1
 #elif 0
 #define CK_AMD_GPU_GFX908 1
......
@@ -64,7 +64,7 @@ int main(int argc, char* argv[])
     using LeftPads = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
-#elif 1
+#elif 0
     constexpr index_t N = 1;
     constexpr index_t C = 16;
     constexpr index_t HI = 1080;
@@ -630,7 +630,7 @@ int main(int argc, char* argv[])
     print_array("ConvStrides", to_multi_index(ConvStrides{}));
     print_array("ConvDilations", to_multi_index(ConvDilations{}));
-#if 0
+#if 1
     using in_data_t = float;
     constexpr index_t in_vector_size = 1;
    using acc_data_t = float;
@@ -724,7 +724,7 @@ int main(int argc, char* argv[])
         LeftPads{},
         RightPads{},
         nrepeat);
-#elif 0
+#elif 1
     device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw<in_data_t,
         in_vector_size,
         acc_data_t,
......