"test/git@developer.sourcefind.cn:change/sglang.git" did not exist on "7d0edf3caed4e10b9e2b4217f34a1a6700d32b74"
Commit 012d3a07 authored by Chao Liu

tweaking

parent 14315b72
@@ -426,7 +426,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
                      0,
                      b_thread_data_on_global,
                      0})
-#if 0
+#if 1
             .template Run<Float, Float, address_space_t::generic, address_space_t::global>
 #else // tweaking
             .template Run_optimized_dst_address_calculation<Float,
@@ -78,10 +78,12 @@ struct NativeTensorCoordinate
         return coord;
     }

+#if 0 // tweaking
     __host__ __device__ static constexpr index_t CalculateOffsetDiff(const Index& idx_diff)
     {
         return tensor_desc_type::CalculateOffsetDiff(idx_diff);
     }
+#endif

     __host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; }
@@ -175,6 +177,7 @@ struct TransformedTensorCoordinate
         return coord_up;
     }

+#if 0 // tweaking
     // Calculate offset diff without updating tensor-coordinate
     // If idx_up_diff is known at compile time, and has only non-zero entries on linear dimensions,
     // then all calculation can be done at compile-time.
@@ -183,9 +186,12 @@ struct TransformedTensorCoordinate
         // For transformation of multi-index difference, not all transformation functions need to
         // know the old lower-index or the old upper-index. We pass both of them to the
         // transformation function. The transformation function itself decides to use them or not.
-        return GetLowerCoordinate().CalculateOffsetDiff(tensor_desc_type::CalculateLowerIndexDiff(
-            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex()));
+        const auto idx_low_diff = tensor_desc_type::CalculateLowerIndexDiff(
+            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());
+
+        return GetLowerCoordinate().CalculateOffsetDiff(idx_low_diff);
     }
+#endif

     __host__ __device__ constexpr bool IsUpperIndexMappedToValidOffset() const
     {
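The comment block disabled above explains the intent of `CalculateOffsetDiff`: for a linear (plain strided) layout the offset is an inner product of index and strides, so the offset difference of a known index step does not depend on the current index and can fold to a compile-time constant. A minimal standalone sketch of that property, assuming a fixed 3-d strided layout and hypothetical `Offset`/`OffsetDiff` helpers rather than the library's API:

#include <array>
#include <cstddef>

// Hypothetical 3-d strided layout: offset(idx) = sum_i idx[i] * stride[i].
constexpr std::array<std::ptrdiff_t, 3> strides{20, 4, 1};

constexpr std::ptrdiff_t Offset(const std::array<std::ptrdiff_t, 3>& idx)
{
    return idx[0] * strides[0] + idx[1] * strides[1] + idx[2] * strides[2];
}

// For a linear layout, the offset difference of a step is just the offset of
// the step itself, so a compile-time step yields a compile-time constant.
constexpr std::ptrdiff_t OffsetDiff(const std::array<std::ptrdiff_t, 3>& idx_diff)
{
    return Offset(idx_diff);
}

static_assert(Offset({1, 2, 3}) - Offset({1, 1, 1}) == OffsetDiff({0, 1, 2}),
              "diff of offsets equals offset of the diff for a linear layout");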
@@ -209,7 +215,7 @@ struct TensorCoordinate
 private:
     template <typename... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
     {
         return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
             make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
@@ -217,7 +223,7 @@ struct TensorCoordinate
     template <typename... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
     {
         return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
             make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
@@ -326,14 +326,14 @@ struct TensorCoordinate_deprecated
 private:
     template <class... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
     {
         return NormalTensorCoordinate_deprecated<ConstantTensorDescriptor<Ts...>>();
     }

     template <class... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
     {
         return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
     }
@@ -319,7 +319,7 @@ struct TransformedTensorDescriptor
         return idx_low;
     }

-    // TODO: right now return value is constexpr because use of non-constepxr lambda
+    // TODO: right now return value is not constexpr because use of non-constexpr lambda
     __host__ __device__ static constexpr LowerIndex CalculateLowerIndexDiff(
         const UpperIndex& idx_up_diff, const UpperIndex& idx_up_old, const LowerIndex& idx_low_old)
     {
@@ -73,7 +73,7 @@ struct BlockwiseGenericTensorSliceCopy_v4
     __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
                                         ThreadBufferData* p_thread_buffer) const
     {
-#if 0
+#if 1
         mThreadwiseLoad.template Run<BlockSrcData,
                                      ThreadBufferData,
                                      BlockSrcAddressSpace,
@@ -94,11 +94,11 @@ struct BlockwiseGenericTensorSliceCopy_v4
     __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
                                          BlockDstData* p_block_dst) const
     {
-#if 0
+#if 1
         mThreadwiseStore.template Run<ThreadBufferData,
                                       BlockDstData,
                                       ThreadBufferAddressSpace,
                                       BlockDstAddressSpace>(p_thread_buffer, p_block_dst);
 #else // tweaking
         mThreadwiseStore.template Run_optimized_dst_address_calculation<ThreadBufferData,
                                                                         BlockDstData,
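Both copy hunks flip the same switch: with `#if 1` the generic `Run` path is compiled and the `Run_optimized_dst_address_calculation` variant in the `#else // tweaking` branch is discarded by the preprocessor. A minimal sketch of this whole-path selection pattern, with a hypothetical `Copier` standing in for `mThreadwiseLoad`/`mThreadwiseStore`:

// Hypothetical copier exposing a generic and an "optimized" member template.
struct Copier
{
    template <typename T>
    void Run(const T* src, T* dst) const { *dst = *src; }

    template <typename T>
    void Run_optimized_dst_address_calculation(const T* src, T* dst) const { *dst = *src; }
};

template <typename T>
void RunStoreThreadBuffer(const Copier& copier, const T* src, T* dst)
{
#if 1 // same shape as the commit's switch: the generic path is kept
    copier.template Run<T>(src, dst);
#else
    copier.template Run_optimized_dst_address_calculation<T>(src, dst);
#endif
}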
@@ -226,14 +226,6 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
         constexpr auto src_linear_dim_mask    = SrcDesc::GetLinearDimensionMask();
         constexpr auto src_nonlinear_dim_mask = SrcDesc::GetNonLinearDimensionMask();

-#if 0 // debug
-        if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
-        {
-            print_sequence("src_linear_dim_mask", src_linear_dim_mask);
-            print_sequence("src_nonlinear_dim_mask", src_nonlinear_dim_mask);
-        }
-#endif
-
         static_assert(src_linear_dim_mask.At(VectorAccessDim) ||
                           long_vector_size == SrcDataPerAccess,
                       "Warning! VectorAccessDim is not SrcDesc's linear dimension, performance "
@@ -295,18 +287,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                     const auto src_coord =
                         src_nonlinear_coord + (linear_dim_data_steps + scalar_id);

-                    // this is src compile-time offset
-#if 0
-                    // TODO: is this good implementation?
+#if 1 // tweaking
                     const index_t src_linear_offset =
                         src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
-#elif 0
-                    const index_t src_linear_offset =
-                        SrcDesc::CalculateOffset(linear_dim_data_steps + scalar_id) -
-                        SrcDesc::CalculateOffset(make_zero_array<index_t, nDim>());
-#elif 1
+#else
+                    // this is src compile-time offset
                     const index_t src_linear_offset =
-                        src_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
+                        src_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
 #endif

                     // Check src vector's padding situation, only check the first data in
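When the stepped dimensions are linear, the two branches of the new `#if` agree: subtracting two fully evaluated offsets, as `src_coord.GetOffset() - src_nonlinear_coord.GetOffset()` does, gives the same value as evaluating the offset of the step alone, and only the latter is guaranteed to fold at compile time. A tiny numeric check under an assumed strided layout with strides {20, 4, 1} (the dst side below repeats the same change):

#include <cstddef>

// Assumed layout: offset(i, j, k) = 20*i + 4*j + k.
constexpr std::ptrdiff_t Offset(std::ptrdiff_t i, std::ptrdiff_t j, std::ptrdiff_t k)
{
    return 20 * i + 4 * j + k;
}

// Base coordinate (2, 0, 0) stepped by (0, 1, 2) along linear dimensions:
// subtracting two full offsets, as in the "#if 1" branch ...
static_assert(Offset(2, 1, 2) - Offset(2, 0, 0) == 6, "offset subtraction");
// ... equals the offset of the step itself, as CalculateOffsetDiff computes.
static_assert(Offset(0, 1, 2) == 6, "offset of the step");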
@@ -396,14 +384,6 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
         constexpr auto dst_linear_dim_mask    = DstDesc::GetLinearDimensionMask();
         constexpr auto dst_nonlinear_dim_mask = DstDesc::GetNonLinearDimensionMask();

-#if 0 // debug
-        if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
-        {
-            print_sequence("dst_linear_dim_mask", dst_linear_dim_mask);
-            print_sequence("dst_nonlinear_dim_mask", dst_nonlinear_dim_mask);
-        }
-#endif
-
         static_assert(dst_linear_dim_mask.At(VectorAccessDim) ||
                           long_vector_size == DstDataPerAccess,
                       "Warning! VectorAccessDim is not DstDesc's linear dimension, performance "
@@ -496,18 +476,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                     const auto dst_coord =
                         dst_nonlinear_coord + (linear_dim_data_steps + scalar_id);

-                    // this is dst compile-time offset
-#if 0
-                    // TODO: is this good implementation?
+#if 1 // tweaking
                     const index_t dst_linear_offset =
                         dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset();
-#elif 0
-                    const index_t dst_linear_offset =
-                        DstDesc::CalculateOffset(linear_dim_data_steps + scalar_id) -
-                        DstDesc::CalculateOffset(make_zero_array<index_t, nDim>());
-#elif 1
+#else
+                    // this is dst compile-time offset
                     const index_t dst_linear_offset =
-                        dst_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
+                        dst_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
 #endif

                     // Check dst vector's padding situation, only check the first data in
@@ -6,7 +6,7 @@
 #define CK_UNSIGNED_INDEX_TYPE 0
 #define CK_DEVICE_BACKEND_AMD 1
-#define CK_USE_AMD_INTRINSIC 0
+#define CK_USE_AMD_INTRINSIC 1
 #define CK_USE_AMD_INLINE_ASM 1
 #define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
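These `CK_*` macros are compile-time feature switches; setting `CK_USE_AMD_INTRINSIC` to 1 makes the preprocessor keep the intrinsic-backed branches elsewhere in the library. A generic sketch of the gating pattern, with a hypothetical `load_element` rather than the library's actual wrappers:

#define CK_USE_AMD_INTRINSIC 1

inline float load_element(const float* p)
{
#if CK_USE_AMD_INTRINSIC
    // A clang builtin stands in here for a device intrinsic path.
    return __builtin_nontemporal_load(p);
#else
    // Portable fallback path.
    return *p;
#endif
}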
@@ -74,7 +74,7 @@ int main(int argc, char* argv[])
 {
     using namespace ck;

-#if 1
+#if 0
     constexpr index_t N  = 128;
     constexpr index_t C  = 128;
     constexpr index_t HI = 17;