"scripts/git@developer.sourcefind.cn:change/sglang.git" did not exist on "9e2f7252db8f2e1b903dba31484f7efb0b772c41"
Commit 14315b72 authored by Chao Liu

tweaking

parent ebe38f3d
@@ -426,7 +426,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
0,
b_thread_data_on_global,
0})
-#if 1
+#if 0
.template Run<Float, Float, address_space_t::generic, address_space_t::global>
#else // tweaking
.template Run_optimized_dst_address_calculation<Float,
......
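The hunk above flips the threadwise copy from the generic `Run` to `Run_optimized_dst_address_calculation`, with the address spaces supplied as template arguments. A minimal sketch of this preprocessor-toggle pattern, with a hypothetical `Copy` class standing in for the real copy template:

```cpp
// Minimal sketch of the #if toggle above (hypothetical Copy type; the real
// class takes two Float parameters and more template arguments). Flipping the
// macro changes which member template is compiled into the kernel; nothing is
// decided at run time.
#define USE_OPTIMIZED_DST 1

enum class address_space_t { generic, global };

template <typename Float>
struct Copy
{
    template <typename T, address_space_t SrcSpace, address_space_t DstSpace>
    void Run(const T*, T*) const {} // generic per-access offset math

    template <typename T, address_space_t SrcSpace, address_space_t DstSpace>
    void Run_optimized_dst_address_calculation(const T*, T*) const {} // dst offsets precomputed
};

template <typename Float>
void do_copy(const Copy<Float>& copy, const Float* src, Float* dst)
{
#if USE_OPTIMIZED_DST
    copy.template Run_optimized_dst_address_calculation<Float, address_space_t::generic,
                                                        address_space_t::global>(src, dst);
#else
    copy.template Run<Float, address_space_t::generic, address_space_t::global>(src, dst);
#endif
}
```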
@@ -78,6 +78,11 @@ struct NativeTensorCoordinate
return coord;
}
+__host__ __device__ static constexpr index_t CalculateOffsetDiff(const Index& idx_diff)
+{
+    return tensor_desc_type::CalculateOffsetDiff(idx_diff);
+}
__host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; }
private:
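The new `CalculateOffsetDiff` on the native coordinate can forward straight to the descriptor because a native (plain strided) descriptor's offset map is linear in the index: an offset difference depends only on the index difference, never on the coordinate's current position. A simplified standalone sketch, not the library's actual types:

```cpp
#include <array>
#include <cstddef>

using index_t = std::ptrdiff_t;

// For a plain strided descriptor, offset(idx) = sum_i idx[i] * stride[i] is
// linear in idx, so the offset difference is the same stride-weighted sum of
// the index difference and needs no knowledge of the current index.
template <std::size_t NDim>
constexpr index_t calculate_offset_diff(const std::array<index_t, NDim>& idx_diff,
                                        const std::array<index_t, NDim>& strides)
{
    index_t diff = 0;
    for(std::size_t i = 0; i < NDim; ++i)
        diff += idx_diff[i] * strides[i];
    return diff;
}

// Moving by (+1, -2) in a 2-D view with strides (16, 1) shifts the offset by 14.
static_assert(calculate_offset_diff<2>({{1, -2}}, {{16, 1}}) == 14, "");
```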
@@ -170,7 +175,18 @@ struct TransformedTensorCoordinate
return coord_up;
}
// this function should be inexpensive, because there is no upper-to-lower index transformation
+// Calculate offset diff without updating the tensor coordinate.
+// If idx_up_diff is known at compile time and has non-zero entries only in
+// linear dimensions, the entire calculation can be done at compile time.
+__host__ __device__ constexpr index_t CalculateOffsetDiff(const UpperIndex& idx_up_diff) const
+{
+    // When transforming a multi-index difference, not every transformation
+    // needs the old lower-index or the old upper-index. We pass both to the
+    // transformation function, which decides whether to use them.
+    return GetLowerCoordinate().CalculateOffsetDiff(tensor_desc_type::CalculateLowerIndexDiff(
+        idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex()));
+}
__host__ __device__ constexpr bool IsUpperIndexMappedToValidOffset() const
{
return tensor_desc_type::IsUpperIndexMappedToValidLowerIndex(GetIndex()) &&
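For the transformed coordinate the computation recurses down the transform chain: each layer turns an upper-index difference into a lower-index difference (the current upper and lower indices are passed along for the transforms that need them), until the native descriptor at the bottom converts the final difference into an offset difference. A simplified two-layer sketch with a pad-like transform, hypothetical types only:

```cpp
#include <array>
#include <cstddef>

using index_t = std::ptrdiff_t;

// Bottom layer: plain strided descriptor, offset diff is a stride-weighted sum.
template <std::size_t NDim>
struct Strided
{
    std::array<index_t, NDim> strides;

    constexpr index_t CalculateOffsetDiff(const std::array<index_t, NDim>& idx_diff) const
    {
        index_t diff = 0;
        for(std::size_t i = 0; i < NDim; ++i)
            diff += idx_diff[i] * strides[i];
        return diff;
    }
};

// Upper layer: pad maps lower = upper - left_pad, so an index *difference*
// passes through unchanged and the old indices are not needed at all. When the
// whole chain is like this and idx_up_diff is a compile-time constant, the
// result folds to a constant too.
template <std::size_t NDim>
struct Padded
{
    Strided<NDim> lower;

    constexpr index_t CalculateOffsetDiff(const std::array<index_t, NDim>& idx_up_diff) const
    {
        return lower.CalculateOffsetDiff(idx_up_diff);
    }
};

constexpr Padded<2> desc{{{{17, 1}}}};
static_assert(desc.CalculateOffsetDiff({{1, 0}}) == 17, "");
```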
@@ -193,7 +209,7 @@ struct TensorCoordinate
private:
template <typename... Ts>
__host__ __device__ static constexpr auto
-MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
+MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
{
return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
@@ -201,7 +217,7 @@ struct TensorCoordinate
template <typename... Ts>
__host__ __device__ static constexpr auto
-MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
+MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
{
return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
......
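The doubled `MakeDummyTensorCoordinate` lines above are a whitespace-only reformat of a tag-dispatch idiom: overload resolution on the descriptor type picks which coordinate type to construct, and `decltype` of a dummy call recovers that type without any class-template specialization. A stripped-down sketch of the idiom with hypothetical types:

```cpp
#include <type_traits>

struct NativeDesc {};
struct TransformedDesc {};

struct NativeCoord {};
struct TransformedCoord {};

// Overload resolution on the descriptor's type selects the coordinate type.
constexpr NativeCoord MakeDummyCoordinate(NativeDesc) { return {}; }
constexpr TransformedCoord MakeDummyCoordinate(TransformedDesc) { return {}; }

// decltype of a dummy call recovers the chosen type at compile time.
template <typename Desc>
using coordinate_t = decltype(MakeDummyCoordinate(Desc{}));

static_assert(std::is_same<coordinate_t<NativeDesc>, NativeCoord>::value, "");
static_assert(std::is_same<coordinate_t<TransformedDesc>, TransformedCoord>::value, "");
```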
@@ -326,14 +326,14 @@ struct TensorCoordinate_deprecated
private:
template <class... Ts>
__host__ __device__ static constexpr auto
-MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
+MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
{
return NormalTensorCoordinate_deprecated<ConstantTensorDescriptor<Ts...>>();
}
template <class... Ts>
__host__ __device__ static constexpr auto
-MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
+MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
{
return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
}
......
@@ -226,6 +226,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
constexpr auto src_linear_dim_mask = SrcDesc::GetLinearDimensionMask();
constexpr auto src_nonlinear_dim_mask = SrcDesc::GetNonLinearDimensionMask();
+#if 0 // debug
+if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
+{
+    print_sequence("src_linear_dim_mask", src_linear_dim_mask);
+    print_sequence("src_nonlinear_dim_mask", src_nonlinear_dim_mask);
+}
+#endif
static_assert(src_linear_dim_mask.At(VectorAccessDim) ||
long_vector_size == SrcDataPerAccess,
"Warning! VectorAccessDim is not SrcDesc's linear dimension, performance "
@@ -292,9 +300,13 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// TODO: is this good implementation?
const index_t src_linear_offset =
src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
-#else
+#elif 0
const index_t src_linear_offset =
-SrcDesc::CalculateOffset(linear_dim_data_steps + scalar_id);
+SrcDesc::CalculateOffset(linear_dim_data_steps + scalar_id) -
+SrcDesc::CalculateOffset(make_zero_array<index_t, nDim>());
+#elif 1
+const index_t src_linear_offset =
+src_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
#endif
// Check src vector's padding situation, only check the first data in
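The three `#elif` branches are alternative evaluations of the purely linear part of the source offset: (a) advance a full tensor coordinate and subtract the two offsets, (b) run the descriptor's absolute offset map on the step and subtract its value at the zero index, or (c) the new `CalculateOffsetDiff`, which transforms the index difference directly and never materializes a second coordinate. A 1-D illustration of why (b) needs the subtraction while (c) does not, using a hypothetical left-padded strided view:

```cpp
#include <cstddef>

using index_t = std::ptrdiff_t;

// A left-padded view: the upper->lower map is lower = upper - left_pad, so the
// absolute offset map offset(u) = (u - left_pad) * stride does not vanish at
// u = 0. Branch (b) must subtract offset(0); branch (c) maps the *difference*
// directly, which is exact for any such linear transform and cheaper.
constexpr index_t left_pad = 3;
constexpr index_t stride   = 4;

constexpr index_t calculate_offset(index_t u) { return (u - left_pad) * stride; }
constexpr index_t calculate_offset_diff(index_t u_diff) { return u_diff * stride; }

static_assert(calculate_offset(5) == 8, "absolute offset alone is off by offset(0)");
static_assert(calculate_offset(5) - calculate_offset(0) == 20, "branch (b)");
static_assert(calculate_offset_diff(5) == 20, "branch (c) agrees, no subtraction");
```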
@@ -384,6 +396,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
constexpr auto dst_linear_dim_mask = DstDesc::GetLinearDimensionMask();
constexpr auto dst_nonlinear_dim_mask = DstDesc::GetNonLinearDimensionMask();
+#if 0 // debug
+if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
+{
+    print_sequence("dst_linear_dim_mask", dst_linear_dim_mask);
+    print_sequence("dst_nonlinear_dim_mask", dst_nonlinear_dim_mask);
+}
+#endif
static_assert(dst_linear_dim_mask.At(VectorAccessDim) ||
long_vector_size == DstDataPerAccess,
"Warning! VectorAccessDim is not DstDesc's linear dimension, performance "
@@ -477,13 +497,17 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
dst_nonlinear_coord + (linear_dim_data_steps + scalar_id);
// this is dst compile-time offset
-#if 1
+#if 0
// TODO: is this good implementation?
const index_t dst_linear_offset =
dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset();
-#else
+#elif 0
const index_t dst_linear_offset =
+DstDesc::CalculateOffset(linear_dim_data_steps + scalar_id) -
+DstDesc::CalculateOffset(make_zero_array<index_t, nDim>());
+#elif 1
+const index_t dst_linear_offset =
-DstDesc::CalculateOffset(linear_dim_data_steps + scalar_id);
+dst_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
#endif
// Check dst vector's padding situation, only check the first data in
......
@@ -74,20 +74,20 @@ int main(int argc, char* argv[])
{
using namespace ck;
-#if 0
-constexpr index_t N = 64;
-constexpr index_t C = 256;
-constexpr index_t HI = 56;
-constexpr index_t WI = 56;
-constexpr index_t K = 256;
+#if 1
+constexpr index_t N = 128;
+constexpr index_t C = 128;
+constexpr index_t HI = 17;
+constexpr index_t WI = 17;
+constexpr index_t K = 128;
constexpr index_t Y = 1;
-constexpr index_t X = 1;
+constexpr index_t X = 7;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
-using LeftPads = Sequence<0, 0>;
-using RightPads = Sequence<0, 0>;
+using LeftPads = Sequence<0, 3>;
+using RightPads = Sequence<0, 3>;
#elif 0
// 3x3, 34x34
constexpr index_t N = 64;
......
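Sanity check on the enabled configuration above: a 17x17 input, a 1x7 filter (Y=1, X=7), unit stride and dilation, and (0,3) padding on each side keep the output at 17x17, an Inception-style 1x7 convolution. The standard output-length formula as a constexpr check:

```cpp
#include <cstddef>

// out = (in + left_pad + right_pad - dilation * (filter - 1) - 1) / stride + 1
constexpr std::size_t out_len(std::size_t in, std::size_t filter, std::size_t left,
                              std::size_t right, std::size_t stride = 1,
                              std::size_t dilation = 1)
{
    return (in + left + right - dilation * (filter - 1) - 1) / stride + 1;
}

static_assert(out_len(17, 1, 0, 0) == 17, "HO: Y=1, pads (0,0)");
static_assert(out_len(17, 7, 3, 3) == 17, "WO: X=7, pads (3,3)");
```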