Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
012d3a07
"test/git@developer.sourcefind.cn:change/sglang.git" did not exist on "7d0edf3caed4e10b9e2b4217f34a1a6700d32b74"
Commit
012d3a07
authored
Sep 27, 2019
by
Chao Liu
Browse files
tweaking
parent
14315b72
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
31 additions
and
49 deletions
+31
-49
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
...cit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
+1
-1
composable_kernel/include/tensor_description/tensor_coordinate.hpp
...e_kernel/include/tensor_description/tensor_coordinate.hpp
+10
-4
composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp
...clude/tensor_description/tensor_coordinate_deprecated.hpp
+2
-2
composable_kernel/include/tensor_description/tensor_descriptor.hpp
...e_kernel/include/tensor_description/tensor_descriptor.hpp
+1
-1
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
.../tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+5
-5
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
...tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+10
-34
composable_kernel/include/utility/config_amd.hpp.in
composable_kernel/include/utility/config_amd.hpp.in
+1
-1
driver/src/driver.cpp
driver/src/driver.cpp
+1
-1
No files found.
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
View file @
012d3a07
...
@@ -426,7 +426,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
...
@@ -426,7 +426,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
0
,
0
,
b_thread_data_on_global
,
b_thread_data_on_global
,
0
})
0
})
#if
0
#if
1
.
template
Run
<
Float
,
Float
,
address_space_t
::
generic
,
address_space_t
::
global
>
.
template
Run
<
Float
,
Float
,
address_space_t
::
generic
,
address_space_t
::
global
>
#else // tweaking
#else // tweaking
.
template
Run_optimized_dst_address_calculation
<
Float
,
.
template
Run_optimized_dst_address_calculation
<
Float
,
...
...
composable_kernel/include/tensor_description/tensor_coordinate.hpp
View file @
012d3a07
...
@@ -78,10 +78,12 @@ struct NativeTensorCoordinate
...
@@ -78,10 +78,12 @@ struct NativeTensorCoordinate
return
coord
;
return
coord
;
}
}
#if 0 // tweaking
__host__ __device__ static constexpr index_t CalculateOffsetDiff(const Index& idx_diff)
__host__ __device__ static constexpr index_t CalculateOffsetDiff(const Index& idx_diff)
{
{
return tensor_desc_type::CalculateOffsetDiff(idx_diff);
return tensor_desc_type::CalculateOffsetDiff(idx_diff);
}
}
#endif
__host__
__device__
static
constexpr
bool
IsUpperIndexMappedToValidOffset
()
{
return
true
;
}
__host__
__device__
static
constexpr
bool
IsUpperIndexMappedToValidOffset
()
{
return
true
;
}
...
@@ -175,6 +177,7 @@ struct TransformedTensorCoordinate
...
@@ -175,6 +177,7 @@ struct TransformedTensorCoordinate
return
coord_up
;
return
coord_up
;
}
}
#if 0 // tweaking
// Calculate offset diff without updating tensor-coordinate
// Calculate offset diff without updating tensor-coordinate
// If idx_up_diff is know at compile time, and has only non-zero entries on linear dimensions,
// If idx_up_diff is know at compile time, and has only non-zero entries on linear dimensions,
// then all calculation can be done at compile-time.
// then all calculation can be done at compile-time.
...
@@ -183,9 +186,12 @@ struct TransformedTensorCoordinate
...
@@ -183,9 +186,12 @@ struct TransformedTensorCoordinate
// For transformation of multi-index difference, not all transformation functions need to
// For transformation of multi-index difference, not all transformation functions need to
// know the old lower-index or the old upper-index. We pass both of them to the
// know the old lower-index or the old upper-index. We pass both of them to the
// transformation function. The transformation function itself decides to use them or not.
// transformation function. The transformation function itself decides to use them or not.
return
GetLowerCoordinate
().
CalculateOffsetDiff
(
tensor_desc_type
::
CalculateLowerIndexDiff
(
const auto idx_low_diff = tensor_desc_type::CalculateLowerIndexDiff(
idx_up_diff
,
GetIndex
(),
GetLowerCoordinate
().
GetIndex
()));
idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());
return GetLowerCoordinate().CalculateOffsetDiff(idx_low_diff);
}
}
#endif
__host__
__device__
constexpr
bool
IsUpperIndexMappedToValidOffset
()
const
__host__
__device__
constexpr
bool
IsUpperIndexMappedToValidOffset
()
const
{
{
...
@@ -209,7 +215,7 @@ struct TensorCoordinate
...
@@ -209,7 +215,7 @@ struct TensorCoordinate
private:
private:
template
<
typename
...
Ts
>
template
<
typename
...
Ts
>
__host__
__device__
static
constexpr
auto
__host__
__device__
static
constexpr
auto
MakeDummyTensorCoordinate
(
NativeTensorDescriptor
<
Ts
...
>
)
MakeDummyTensorCoordinate
(
NativeTensorDescriptor
<
Ts
...
>
)
{
{
return
NativeTensorCoordinate
<
NativeTensorDescriptor
<
Ts
...
>>
(
return
NativeTensorCoordinate
<
NativeTensorDescriptor
<
Ts
...
>>
(
make_zero_array
<
index_t
,
TensorDesc
::
GetNumOfDimension
()
>
());
make_zero_array
<
index_t
,
TensorDesc
::
GetNumOfDimension
()
>
());
...
@@ -217,7 +223,7 @@ struct TensorCoordinate
...
@@ -217,7 +223,7 @@ struct TensorCoordinate
template
<
typename
...
Ts
>
template
<
typename
...
Ts
>
__host__
__device__
static
constexpr
auto
__host__
__device__
static
constexpr
auto
MakeDummyTensorCoordinate
(
TransformedTensorDescriptor
<
Ts
...
>
)
MakeDummyTensorCoordinate
(
TransformedTensorDescriptor
<
Ts
...
>
)
{
{
return
TransformedTensorCoordinate
<
TransformedTensorDescriptor
<
Ts
...
>>
(
return
TransformedTensorCoordinate
<
TransformedTensorDescriptor
<
Ts
...
>>
(
make_zero_array
<
index_t
,
TensorDesc
::
GetNumOfDimension
()
>
());
make_zero_array
<
index_t
,
TensorDesc
::
GetNumOfDimension
()
>
());
...
...
composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp
View file @
012d3a07
...
@@ -326,14 +326,14 @@ struct TensorCoordinate_deprecated
...
@@ -326,14 +326,14 @@ struct TensorCoordinate_deprecated
private:
private:
template
<
class
...
Ts
>
template
<
class
...
Ts
>
__host__
__device__
static
constexpr
auto
__host__
__device__
static
constexpr
auto
MakeDummyTensorCoordinate
(
ConstantTensorDescriptor
<
Ts
...
>
)
MakeDummyTensorCoordinate
(
ConstantTensorDescriptor
<
Ts
...
>
)
{
{
return
NormalTensorCoordinate_deprecated
<
ConstantTensorDescriptor
<
Ts
...
>>
();
return
NormalTensorCoordinate_deprecated
<
ConstantTensorDescriptor
<
Ts
...
>>
();
}
}
template
<
class
...
Ts
>
template
<
class
...
Ts
>
__host__
__device__
static
constexpr
auto
__host__
__device__
static
constexpr
auto
MakeDummyTensorCoordinate
(
ConstantMergedTensorDescriptor
<
Ts
...
>
)
MakeDummyTensorCoordinate
(
ConstantMergedTensorDescriptor
<
Ts
...
>
)
{
{
return
MergedTensorCoordinate
<
ConstantMergedTensorDescriptor
<
Ts
...
>>
();
return
MergedTensorCoordinate
<
ConstantMergedTensorDescriptor
<
Ts
...
>>
();
}
}
...
...
composable_kernel/include/tensor_description/tensor_descriptor.hpp
View file @
012d3a07
...
@@ -319,7 +319,7 @@ struct TransformedTensorDescriptor
...
@@ -319,7 +319,7 @@ struct TransformedTensorDescriptor
return
idx_low
;
return
idx_low
;
}
}
// TODO: right now return value is constexpr because use of non-constepxr lambda
// TODO: right now return value is
not
constexpr because use of non-constepxr lambda
__host__
__device__
static
constexpr
LowerIndex
CalculateLowerIndexDiff
(
__host__
__device__
static
constexpr
LowerIndex
CalculateLowerIndexDiff
(
const
UpperIndex
&
idx_up_diff
,
const
UpperIndex
&
idx_up_old
,
const
LowerIndex
&
idx_low_old
)
const
UpperIndex
&
idx_up_diff
,
const
UpperIndex
&
idx_up_old
,
const
LowerIndex
&
idx_low_old
)
{
{
...
...
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
View file @
012d3a07
...
@@ -73,7 +73,7 @@ struct BlockwiseGenericTensorSliceCopy_v4
...
@@ -73,7 +73,7 @@ struct BlockwiseGenericTensorSliceCopy_v4
__device__
void
RunLoadThreadBuffer
(
const
BlockSrcData
*
p_block_src
,
__device__
void
RunLoadThreadBuffer
(
const
BlockSrcData
*
p_block_src
,
ThreadBufferData
*
p_thread_buffer
)
const
ThreadBufferData
*
p_thread_buffer
)
const
{
{
#if
0
#if
1
mThreadwiseLoad
.
template
Run
<
BlockSrcData
,
mThreadwiseLoad
.
template
Run
<
BlockSrcData
,
ThreadBufferData
,
ThreadBufferData
,
BlockSrcAddressSpace
,
BlockSrcAddressSpace
,
...
@@ -94,11 +94,11 @@ struct BlockwiseGenericTensorSliceCopy_v4
...
@@ -94,11 +94,11 @@ struct BlockwiseGenericTensorSliceCopy_v4
__device__
void
RunStoreThreadBuffer
(
const
ThreadBufferData
*
p_thread_buffer
,
__device__
void
RunStoreThreadBuffer
(
const
ThreadBufferData
*
p_thread_buffer
,
BlockDstData
*
p_block_dst
)
const
BlockDstData
*
p_block_dst
)
const
{
{
#if
0
#if
1
mThreadwiseStore
.
template
Run
<
ThreadBufferData
,
mThreadwiseStore
.
template
Run
<
ThreadBufferData
,
BlockDstData,
BlockDstData
,
ThreadBufferAddressSpace,
ThreadBufferAddressSpace
,
BlockDstAddressSpace>(p_thread_buffer, p_block_dst);
BlockDstAddressSpace
>(
p_thread_buffer
,
p_block_dst
);
#else // tweaking
#else // tweaking
mThreadwiseStore
.
template
Run_optimized_dst_address_calculation
<
ThreadBufferData
,
mThreadwiseStore
.
template
Run_optimized_dst_address_calculation
<
ThreadBufferData
,
BlockDstData
,
BlockDstData
,
...
...
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
View file @
012d3a07
...
@@ -226,14 +226,6 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
...
@@ -226,14 +226,6 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
constexpr
auto
src_linear_dim_mask
=
SrcDesc
::
GetLinearDimensionMask
();
constexpr
auto
src_linear_dim_mask
=
SrcDesc
::
GetLinearDimensionMask
();
constexpr
auto
src_nonlinear_dim_mask
=
SrcDesc
::
GetNonLinearDimensionMask
();
constexpr
auto
src_nonlinear_dim_mask
=
SrcDesc
::
GetNonLinearDimensionMask
();
#if 0 // debug
if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
{
print_sequence("src_linear_dim_mask", src_linear_dim_mask);
print_sequence("src_nonlinear_dim_mask", src_nonlinear_dim_mask);
}
#endif
static_assert
(
src_linear_dim_mask
.
At
(
VectorAccessDim
)
||
static_assert
(
src_linear_dim_mask
.
At
(
VectorAccessDim
)
||
long_vector_size
==
SrcDataPerAccess
,
long_vector_size
==
SrcDataPerAccess
,
"Warning! VectorAccessDim is not SrcDesc's linear dimension, performance "
"Warning! VectorAccessDim is not SrcDesc's linear dimension, performance "
...
@@ -295,18 +287,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
...
@@ -295,18 +287,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
const
auto
src_coord
=
const
auto
src_coord
=
src_nonlinear_coord
+
(
linear_dim_data_steps
+
scalar_id
);
src_nonlinear_coord
+
(
linear_dim_data_steps
+
scalar_id
);
// this is src compile-time offset
#if 1 // tweaking
#if 0
// this is src compile-time offset
// TODO: is this good implementation?
const
index_t
src_linear_offset
=
const
index_t
src_linear_offset
=
src_coord
.
GetOffset
()
-
src_nonlinear_coord
.
GetOffset
();
src_coord
.
GetOffset
()
-
src_nonlinear_coord
.
GetOffset
();
#elif
0
#else
const
index_t
src_linear_offset
=
// this is src compile-time offset
SrcDesc
::
CalculateOffset
(
linear_dim_data_steps
+
scalar_id
)
-
SrcDesc
::
CalculateOffset
(
make_zero_array
<
index_t
,
nDim
>
());
#elif 1
const
index_t
src_linear_offset
=
const
index_t
src_linear_offset
=
src_coord
.
CalculateOffsetDiff
(
linear_dim_data_steps
+
scalar_id
);
src_
nonlinear_
coord
.
CalculateOffsetDiff
(
linear_dim_data_steps
+
scalar_id
);
#endif
#endif
// Check src vector's padding situation, only check the first data in
// Check src vector's padding situation, only check the first data in
...
@@ -396,14 +384,6 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
...
@@ -396,14 +384,6 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
constexpr
auto
dst_linear_dim_mask
=
DstDesc
::
GetLinearDimensionMask
();
constexpr
auto
dst_linear_dim_mask
=
DstDesc
::
GetLinearDimensionMask
();
constexpr
auto
dst_nonlinear_dim_mask
=
DstDesc
::
GetNonLinearDimensionMask
();
constexpr
auto
dst_nonlinear_dim_mask
=
DstDesc
::
GetNonLinearDimensionMask
();
#if 0 // debug
if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
{
print_sequence("dst_linear_dim_mask", dst_linear_dim_mask);
print_sequence("dst_nonlinear_dim_mask", dst_nonlinear_dim_mask);
}
#endif
static_assert
(
dst_linear_dim_mask
.
At
(
VectorAccessDim
)
||
static_assert
(
dst_linear_dim_mask
.
At
(
VectorAccessDim
)
||
long_vector_size
==
DstDataPerAccess
,
long_vector_size
==
DstDataPerAccess
,
"Warning! VectorAccessDim is not DstDesc's linear dimension, performance "
"Warning! VectorAccessDim is not DstDesc's linear dimension, performance "
...
@@ -496,18 +476,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
...
@@ -496,18 +476,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
const
auto
dst_coord
=
const
auto
dst_coord
=
dst_nonlinear_coord
+
(
linear_dim_data_steps
+
scalar_id
);
dst_nonlinear_coord
+
(
linear_dim_data_steps
+
scalar_id
);
// this is dst compile-time offset
#if 1 // tweaking
#if 0
// this is dst compile-time offset
// TODO: is this good implementation?
const
index_t
dst_linear_offset
=
const
index_t
dst_linear_offset
=
dst_coord
.
GetOffset
()
-
dst_nonlinear_coord
.
GetOffset
();
dst_coord
.
GetOffset
()
-
dst_nonlinear_coord
.
GetOffset
();
#elif
0
#else
const
index_t
dst_linear_offset
=
// this is dst compile-time offset
DstDesc
::
CalculateOffset
(
linear_dim_data_steps
+
scalar_id
)
-
DstDesc
::
CalculateOffset
(
make_zero_array
<
index_t
,
nDim
>
());
#elif 1
const
index_t
dst_linear_offset
=
const
index_t
dst_linear_offset
=
dst_coord
.
CalculateOffsetDiff
(
linear_dim_data_steps
+
scalar_id
);
dst_
nonlinear_
coord
.
CalculateOffsetDiff
(
linear_dim_data_steps
+
scalar_id
);
#endif
#endif
// Check dst vector's padding situation, only check the first data in
// Check dst vector's padding situation, only check the first data in
...
...
composable_kernel/include/utility/config_amd.hpp.in
View file @
012d3a07
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
#define CK_UNSIGNED_INDEX_TYPE 0
#define CK_UNSIGNED_INDEX_TYPE 0
#define CK_DEVICE_BACKEND_AMD 1
#define CK_DEVICE_BACKEND_AMD 1
#define CK_USE_AMD_INTRINSIC
0
#define CK_USE_AMD_INTRINSIC
1
#define CK_USE_AMD_INLINE_ASM 1
#define CK_USE_AMD_INLINE_ASM 1
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
...
...
driver/src/driver.cpp
View file @
012d3a07
...
@@ -74,7 +74,7 @@ int main(int argc, char* argv[])
...
@@ -74,7 +74,7 @@ int main(int argc, char* argv[])
{
{
using
namespace
ck
;
using
namespace
ck
;
#if
1
#if
0
constexpr index_t N = 128;
constexpr index_t N = 128;
constexpr index_t C = 128;
constexpr index_t C = 128;
constexpr index_t HI = 17;
constexpr index_t HI = 17;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment