yangql / composable_kernel-1 · Commits

Commit ebe38f3d, authored Sep 27, 2019 by Chao Liu

debugging

parent 9b280cc5
Showing 8 changed files, with 30 additions and 28 deletions (+30 -28):
    composable_kernel/include/tensor_description/dimension.hpp                                     +0  -7
    composable_kernel/include/tensor_description/tensor_coordinate.hpp                             +2  -2
    composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp                  +2  -2
    composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp             +3  -4
    composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp  +4  -4
    composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp            +15 -5
    composable_kernel/include/utility/config_amd.hpp.in                                            +1  -1
    driver/src/driver.cpp                                                                          +3  -3
composable_kernel/include/tensor_description/dimension.hpp (+0 -7)

@@ -11,13 +11,6 @@ struct NativeDimension
     __host__ __device__ static constexpr auto GetLength() { return Number<Length>{}; }
 
     __host__ __device__ static constexpr auto GetStride() { return Number<Stride>{}; }
 
-    __host__ __device__ static constexpr index_t CalculateOffset(index_t i) { return i * Stride; }
-
-    __host__ __device__ static constexpr index_t CalculateOffsetDiff(index_t i_diff)
-    {
-        return i_diff * Stride;
-    }
-
 };
 } // namespace ck
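Note: the two deleted members are the per-dimension piece of the usual strided-offset formula, offset = sum_k i_k * stride_k. A minimal compilable sketch of what they computed; the struct below is a simplified stand-in for NativeDimension with illustrative template arguments, not the repo's definition:

    using index_t = int; // stand-in for ck's index type

    // Simplified stand-in for NativeDimension: a compile-time (Length, Stride) pair.
    template <index_t Length, index_t Stride>
    struct DimensionSketch
    {
        // The deleted helpers mapped an index (or index step) along this
        // dimension to its contribution to the flat memory offset.
        static constexpr index_t CalculateOffset(index_t i) { return i * Stride; }
        static constexpr index_t CalculateOffsetDiff(index_t i_diff) { return i_diff * Stride; }
    };

    // E.g. stepping 3 elements along a dimension of stride 4 moves 12 elements in memory.
    static_assert(DimensionSketch<8, 4>::CalculateOffset(3) == 12, "");
    static_assert(DimensionSketch<8, 4>::CalculateOffsetDiff(-2) == -8, "");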
composable_kernel/include/tensor_description/tensor_coordinate.hpp (+2 -2)

@@ -193,7 +193,7 @@ struct TensorCoordinate
     private:
     template <typename... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
     {
         return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
             make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());

@@ -201,7 +201,7 @@ struct TensorCoordinate
     template <typename... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
     {
         return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
             make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp (+2 -2)

@@ -326,14 +326,14 @@ struct TensorCoordinate_deprecated
     private:
     template <class... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
     {
         return NormalTensorCoordinate_deprecated<ConstantTensorDescriptor<Ts...>>();
     }

     template <class... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
     {
         return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
     }
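Note: both coordinate headers show the same idiom, a private MakeDummyTensorCoordinate overload set that maps each descriptor type to its matching coordinate type, presumably so decltype can name the coordinate type without spelling it out (an assumption; only the overloads are visible in this diff). A self-contained illustration of the idiom with hypothetical types:

    #include <type_traits>

    // Hypothetical descriptor/coordinate pairs, standing in for
    // NativeTensorDescriptor/NativeTensorCoordinate and friends.
    struct DescA {};
    struct DescB {};
    struct CoordA {};
    struct CoordB {};

    static constexpr CoordA MakeDummyCoordinate(DescA) { return CoordA{}; }
    static constexpr CoordB MakeDummyCoordinate(DescB) { return CoordB{}; }

    // The overload picked by the descriptor's type determines the coordinate type.
    template <typename Desc>
    using CoordinateTypeFor = decltype(MakeDummyCoordinate(Desc{}));

    static_assert(std::is_same<CoordinateTypeFor<DescA>, CoordA>::value, "");
    static_assert(std::is_same<CoordinateTypeFor<DescB>, CoordB>::value, "");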
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp (+3 -4)

@@ -75,10 +75,9 @@ struct BlockwiseGenericTensorSliceCopy_v4
     {
 #if 0
         mThreadwiseLoad.template Run<BlockSrcData,
                                      ThreadBufferData,
                                      BlockSrcAddressSpace,
-                                     ThreadBufferAddressSpace>(p_block_src,
-                                                               p_thread_buffer);
+                                     ThreadBufferAddressSpace>(p_block_src, p_thread_buffer);
 #else
         // tweaking
         mThreadwiseLoad.template Run_optimized_src_address_calculation<BlockSrcData,
                                                                        ThreadBufferData,
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp (+4 -4)

@@ -483,8 +483,8 @@ struct BlockwiseGenericTensorSliceCopy_v2
               address_space_t ThreadBufferAddressSpace = address_space_t::generic>
     __device__ void RunLoadThreadBuffer(const TData* p_block_src, TData* p_thread_buffer) const
     {
-        mThreadwiseLoad.Run<TData, BlockSrcAddressSpace, ThreadBufferAddressSpace>(p_block_src,
-                                                                                   p_thread_buffer);
+        mThreadwiseLoad.template Run<TData, BlockSrcAddressSpace, ThreadBufferAddressSpace>(
+            p_block_src, p_thread_buffer);
     }

     template <typename TData,

@@ -492,8 +492,8 @@ struct BlockwiseGenericTensorSliceCopy_v2
               address_space_t BlockDstAddressSpace = address_space_t::generic>
     __device__ void RunStoreThreadBuffer(const TData* p_thread_buffer, TData* p_block_dst) const
     {
-        mThreadwiseStore.Run<TData, ThreadBufferAddressSpace, BlockDstAddressSpace>(p_thread_buffer,
-                                                                                    p_block_dst);
+        mThreadwiseStore.template Run<TData, ThreadBufferAddressSpace, BlockDstAddressSpace>(
+            p_thread_buffer, p_block_dst);
     }

     template <typename TData,
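Note: the only visible change in both hunks is inserting the template keyword before Run. Because mThreadwiseLoad and mThreadwiseStore have types that depend on the class's template parameters, C++ requires .template before an explicit member-template argument list; without it the < parses as less-than. A standalone illustration with hypothetical types:

    struct Copier
    {
        template <int N>
        void Run() const {}
    };

    template <typename TCopier>
    struct Blockwise
    {
        TCopier mCopier;

        void Go() const
        {
            // mCopier's type depends on TCopier, so the disambiguator is
            // required: "mCopier.Run<3>()" would parse "<" as a comparison.
            mCopier.template Run<3>();
        }
    };

    int main()
    {
        Blockwise<Copier>{}.Go();
        return 0;
    }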
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp (+15 -5)

@@ -130,7 +130,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
 #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
         *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
             __buffer_load<SrcData, SrcDataPerAccess>(
-                p_src, 0, src_coord.GetOffset());
+                p_src, src_coord.GetOffset(), 0);
 #else
         *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
             *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);

@@ -172,8 +172,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
             __buffer_store<DstData, DstDataPerAccess>(
                 *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
                 p_dst,
-                0,
-                dst_coord.GetOffset());
+                dst_coord.GetOffset(),
+                0);
 #else
         *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
             *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
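Note: the two hunks above make the same swap: the coordinate offset moves from the trailing argument into the slot right after the pointer operand, and the literal 0 takes the trailing slot. Only the call sites appear in this diff, so the declarations below are a guess at the shape they imply; the parameter names and the vector_type stand-in are assumptions, not the repo's actual API:

    using index_t = int;

    // Stand-in for a (T, N) -> vector-type helper; a plain struct is enough here.
    template <typename T, index_t N>
    struct vector_type { struct type { T data[N]; }; };

    // Hypothetical declarations consistent with the call sites above; the real
    // ones live elsewhere in the repo and may differ. The commit moves
    // src_coord.GetOffset()/dst_coord.GetOffset() into the per-thread offset
    // slot, leaving 0 in the constant-offset slot.
    template <typename T, index_t DataPerAccess>
    typename vector_type<T, DataPerAccess>::type
    __buffer_load(const T* p_src_block,
                  index_t src_thread_data_offset,  // now carries src_coord.GetOffset()
                  index_t src_const_data_offset);  // now the literal 0

    template <typename T, index_t DataPerAccess>
    void __buffer_store(const typename vector_type<T, DataPerAccess>::type& src,
                        T* p_dst_block,
                        index_t dst_thread_data_offset,  // now carries dst_coord.GetOffset()
                        index_t dst_const_data_offset);  // now the literal 0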
@@ -287,10 +287,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
             const auto src_coord =
                 src_nonlinear_coord + (linear_dim_data_steps + scalar_id);

             // this is src compile-time offset
+#if 0
             // TODO: is this good implementation?
             const index_t src_linear_offset =
                 src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
+#else
+            const index_t src_linear_offset =
+                SrcDesc::CalculateOffset(linear_dim_data_steps + scalar_id);
+#endif

             // Check src vector's padding situation, only check the first data in
             // this src vector. It's user's responsiblity to make sure all data in

@@ -471,10 +476,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
             const auto dst_coord =
                 dst_nonlinear_coord + (linear_dim_data_steps + scalar_id);

             // this is dst compile-time offset
+#if 1
             // TODO: is this good implementation?
             const index_t dst_linear_offset =
                 dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset();
+#else
+            const index_t dst_linear_offset =
+                DstDesc::CalculateOffset(linear_dim_data_steps + scalar_id);
+#endif

             // Check dst vector's padding situation, only check the first data in
             // this dst vector. It's user's responsiblity to make sure all data in
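Note: the new #if/#else pairs compare two ways of getting the linear-dimension offset: taking the difference of the composed and base coordinates' offsets, versus recomputing the step's offset directly from the descriptor. As toggled here, src uses the recomputed form (#if 0) while dst keeps the difference form (#if 1), which fits the commit's "debugging" intent. For purely linear, non-padded dimensions the two should agree, since a strided offset map is linear in the index. A small self-contained check of that equivalence on a hypothetical 2D strided descriptor (not the repo's class):

    using index_t = int;

    // Hypothetical 2D descriptor with compile-time strides.
    struct Desc2D
    {
        static constexpr index_t stride0 = 8;
        static constexpr index_t stride1 = 1;
        static constexpr index_t CalculateOffset(index_t i0, index_t i1)
        {
            return i0 * stride0 + i1 * stride1;
        }
    };

    // base: the nonlinear coordinate; step: linear_dim_data_steps + scalar_id.
    constexpr index_t base0 = 2, base1 = 3;
    constexpr index_t step0 = 1, step1 = 2;

    // Difference form: offset of the composed coordinate minus the base offset.
    constexpr index_t by_difference =
        Desc2D::CalculateOffset(base0 + step0, base1 + step1) -
        Desc2D::CalculateOffset(base0, base1);

    // Recomputed form: the step's own offset from the descriptor.
    constexpr index_t by_recompute = Desc2D::CalculateOffset(step0, step1);

    static_assert(by_difference == by_recompute, "linear offset maps commute with steps");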
composable_kernel/include/utility/config_amd.hpp.in (+1 -1)

@@ -6,7 +6,7 @@
 #define CK_UNSIGNED_INDEX_TYPE 0
 #define CK_DEVICE_BACKEND_AMD 1
-#define CK_USE_AMD_INTRINSIC 1
+#define CK_USE_AMD_INTRINSIC 0
 #define CK_USE_AMD_INLINE_ASM 1
 #define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
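Note: setting CK_USE_AMD_INTRINSIC to 0 also compiles out the buffer-intrinsic path shown earlier in threadwise_generic_tensor_slice_copy.hpp, since that path is gated on both macros. A reduced sketch of the interaction, with the guard copied from the hunk above and the branch bodies elided to comments:

    #define CK_USE_AMD_INTRINSIC 0                   // this commit: 1 -> 0
    #define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1 // unchanged

    #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
    // __buffer_load / __buffer_store path (now compiled out)
    #else
    // plain reinterpret_cast load/store path (now selected)
    #endif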
driver/src/driver.cpp (+3 -3)

@@ -295,7 +295,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
-#elif 1
+#elif 0
     // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
     // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
     constexpr index_t N = 128;

@@ -341,7 +341,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<3, 0>;
     using RightPads = Sequence<3, 0>;
-#elif 0
+#elif 1
     // 1x7 filter, 0x3 pad, 17x17 input
     constexpr index_t N = 128;
     constexpr index_t C = 128;

@@ -448,7 +448,7 @@ int main(int argc, char* argv[])
                                                            ConvStrides{},
                                                            ConvDilations{},
                                                            nrepeat);
-#elif 0
+#elif 1
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(in_nchw_desc,
                                                                 in_nchw,
                                                                 wei_kcyx_desc,
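Note: all three driver hunks flip #elif selectors: the build moves from the 3x3-filter, 2x2-stride problem to the 1x7-filter problem, and switches the launched kernel to device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded. A minimal sketch of this compile-time problem-selection pattern; the dimension values come from the comments above, everything else is illustrative:

    using index_t = int;

    #if 0
    // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output (previously selected)
    constexpr index_t Y = 3, X = 3;
    constexpr index_t HI = 35, WI = 35;
    #elif 1
    // 1x7 filter, 0x3 pad, 17x17 input (selected by this commit)
    constexpr index_t Y = 1, X = 7;
    constexpr index_t HI = 17, WI = 17;
    #endif

    int main() { return static_cast<int>(Y * X + HI * WI); }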