yangql / composable_kernel-1

Commit 5636576f, authored Aug 07, 2019 by Chao Liu
Parent 9d99a580

    bug fix in ford, forgot to reorder lengths
Showing 7 changed files with 24 additions and 28 deletions:

  composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp  (+1 -1)
  composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp  (+7 -6)
  composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp  (+9 -18)
  composable_kernel/include/utility/config_amd.hpp.in  (+1 -0)
  composable_kernel/include/utility/config_nvidia.hpp.in  (+1 -0)
  composable_kernel/include/utility/functional3.hpp  (+4 -2)
  driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp  (+1 -1)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp

@@ -470,7 +470,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 7, 7, 1, 1,
                 1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
-                .Run_non_static(p_out_thread, p_out_thread_on_global);
+                .Run(p_out_thread, p_out_thread_on_global);
 #elif 0
             ThreadwiseGenericTensorSliceCopy_v2<decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
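The only change in this file is at the call site: the copy functor is now invoked through `Run` instead of `Run_non_static`. A minimal sketch of why this consolidation helps, using illustrative stand-in types and a stand-in macro rather than the library's actual classes: call sites no longer choose between a compile-time-loop and a runtime-loop entry point, because the functor selects the loop style internally.

```cpp
#include <cstdio>

#define USE_STATIC_LOOPS 0 // stand-in for the CK_EXPERIMENTAL_* config macro

struct ThreadwiseCopy // illustrative stand-in, not the library's class
{
    void Run(const float* src, float* dst) const
    {
#if USE_STATIC_LOOPS
        // a compile-time-unrolled path would go here
#else
        for(int i = 0; i < 4; ++i) // runtime-loop path (simplified)
            dst[i] = src[i];
#endif
    }
};

int main()
{
    float src[4] = {1, 2, 3, 4}, dst[4] = {};
    ThreadwiseCopy{}.Run(src, dst); // callers no longer pick a Run variant
    std::printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]);
}
```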
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp

@@ -276,7 +276,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                 SrcDataPerAccess,
                 1>(make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>())
-                .Run_non_static(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
+                .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
 #endif
         });
     }
@@ -318,10 +318,11 @@ struct BlockwiseGenericTensorSliceCopy_v1
             // By position the origin of the per-thread window at the point, where multi-index
             // of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy
             // is assuming each thread is copy a noraml (not merged) tensor.
-            // User need to guarantee this is true.
-            // By setting SubLengths = 1 at the merged dimension, this is always true;
-            // If in the future, you want to enable SubLengths > 1 at the merged dimension,
-            // special care in implementation is needed
+            // To satisfy this assumption, the user need to make sure that, on a merged dimension
+            // that constains multiple original dimensions, the length of the last original
+            // dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on
+            // the merged dimension need to be 1. These sanity checks are performed in constructor
+            // of BlockwiseGenericTensorSliceCopy_v1
 #if 0
             threadwise_generic_tensor_slice_copy_v1(thread_buffer_desc,
                                                     p_buffer + buffer_offset,
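The rewritten comment states a divisibility constraint that, per the new text, the constructor now checks. A standalone sketch of such a compile-time sanity check, with hypothetical names (`MergedDimCheck` and its parameters are illustrative, not the library's API):

```cpp
#include <cstdio>

// Constraint from the comment above: on a merged dimension, the length of the
// last original dimension must be evenly divisible by the sub-length, and the
// repeat-length on that merged dimension must be 1.
template <int LastOriginalDimLength, int SubLength, int RepeatLength>
struct MergedDimCheck
{
    static_assert(LastOriginalDimLength % SubLength == 0,
                  "last original dimension must be evenly divisible by its sub-length");
    static_assert(RepeatLength == 1,
                  "repeat-length on a merged dimension must be 1");
};

int main()
{
    MergedDimCheck<8, 2, 1> ok;    // 8 % 2 == 0 and repeat-length 1: compiles
    // MergedDimCheck<8, 3, 1> bad; // would fail the divisibility static_assert
    (void)ok;
    std::puts("sanity checks passed at compile time");
}
```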
@@ -354,7 +355,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                 1, 1,
                 DstDataPerAccess>(make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>())
-                .Run_non_static(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
+                .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
 #endif
         });
     }
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp

@@ -10,6 +10,10 @@
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
 #endif
 
+#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
+#endif
+
 namespace ck {
 
 // user need to make sure alignment requirement is satisfied when setting DataPerAccesss > 1
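The added `#ifndef`/`#define`/`#endif` triplet is the usual pattern for giving a feature macro a default that the build configuration (here, the generated config headers below) can override. A minimal sketch with a hypothetical macro name:

```cpp
// Default-with-override: the macro stays 0 unless a config header included
// earlier (or a -D compiler flag) already defined it.
#ifndef MY_FEATURE_ENABLED
#define MY_FEATURE_ENABLED 0
#endif

#include <cstdio>

int main()
{
#if MY_FEATURE_ENABLED
    std::puts("feature path");
#else
    std::puts("default path");
#endif
}
```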
@@ -369,8 +373,10 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
         constexpr auto long_vector_access_lengths = SliceLengths::Modify(
             vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
 
+#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
         static_ford<decltype(long_vector_access_lengths), DimAccessOrder>{}([&](
             auto long_vector_access_id) {
             // data id w.r.t slicing-window
             constexpr auto long_vector_data_begin_id = long_vector_access_id.Modify(
                 vector_access_dim, long_vector_access_id[vector_access_dim] * long_vector_size);
@@ -406,26 +412,10 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
                     *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
             });
         });
-    }
-
-    template <class TData>
-    __device__ void Run_non_static(const TData* p_src, TData* p_dst) const
-    {
-        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
-        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
-
-        constexpr auto vector_access_dim = Number<VectorAccessDim>{};
-
-        constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
-        constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
-
-        constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};
-
-        constexpr auto long_vector_access_lengths = SliceLengths::Modify(
-            vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
-
+#else
         ford<decltype(long_vector_access_lengths), DimAccessOrder>{}(
             [&](auto long_vector_access_id) {
                 // data id w.r.t slicing-window
                 auto long_vector_data_begin_id = long_vector_access_id;
                 long_vector_data_begin_id(vector_access_dim) =
@@ -464,6 +454,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
                     *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
                 }
             });
+#endif
     }

     private:
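After this change, the separate `Run_non_static` member is gone: a single `Run` body is selected by the preprocessor between a fully static loop nest (`static_ford`, where indices are compile-time constants) and a plain runtime loop nest (`ford`). A sketch contrasting the two styles with simplified stand-ins for the library's `static_ford`/`ford`:

```cpp
#include <cstdio>
#include <utility>

// Static variant: one call per compile-time index; the compiler can unroll.
template <int... Is, class F>
void static_loop(std::integer_sequence<int, Is...>, F f)
{
    (f(std::integral_constant<int, Is>{}), ...);
}

// Runtime variant: an ordinary loop whose bound need not be a constant.
template <class F>
void runtime_loop(int n, F f)
{
    for(int i = 0; i < n; ++i)
        f(i);
}

int main()
{
    static_loop(std::make_integer_sequence<int, 4>{},
                [](auto i) { std::printf("static index %d\n", i.value); });
    runtime_loop(4, [](int i) { std::printf("runtime index %d\n", i); });
}
```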
composable_kernel/include/utility/config_amd.hpp.in

@@ -8,6 +8,7 @@
 #define CK_USE_AMD_INLINE_ASM 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
 
 namespace ck {
composable_kernel/include/utility/config_nvidia.hpp.in

@@ -10,6 +10,7 @@
 #define CK_USE_AMD_INLINE_ASM 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
 
 namespace ck {
composable_kernel/include/utility/functional3.hpp

@@ -135,9 +135,11 @@ struct ford
     template <class F>
     __host__ __device__ constexpr void operator()(F f) const
     {
-        for(index_t i = 0; i < Lengths::Front(); ++i)
+        constexpr auto ordered_lengths = Lengths::ReorderGivenNew2Old(Orders{});
+
+        for(index_t i = 0; i < ordered_lengths.Front(); ++i)
         {
-            ford_impl<decltype(Lengths::PopFront()), Orders>{}(f, Array<index_t, 1>{i});
+            ford_impl<decltype(ordered_lengths.PopFront()), Orders>{}(f, Array<index_t, 1>{i});
         }
     }
 };
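This hunk is the bug fix named in the commit message: `ford` iterates a multi-dimensional index space in a caller-specified dimension order, so the loop bounds must come from the reordered lengths, not the original ones. The old code took the outer bound from `Lengths::Front()`, which is wrong whenever `Orders` is not the identity. A plain-C++ sketch of the issue, deliberately simplified from the template machinery above:

```cpp
// Iterate lengths {2, 3} visiting dimension 1 as the outer loop (order = {1, 0}).
// The outer bound must be the reordered length 3; taking it from the original
// lengths (the old code's Lengths::Front()) would give 2 and skip points.
#include <array>
#include <cstdio>

int main()
{
    constexpr std::array<int, 2> lengths = {2, 3}; // dim0 = 2, dim1 = 3
    constexpr std::array<int, 2> order   = {1, 0}; // visit dim1 outermost

    // correct: reorder lengths before looping, as the fixed ford does
    const std::array<int, 2> ordered = {lengths[order[0]], lengths[order[1]]};

    for(int i = 0; i < ordered[0]; ++i)     // bound 3, not lengths[0] == 2
        for(int j = 0; j < ordered[1]; ++j) // bound 2
        {
            int id[2];
            id[order[0]] = i; // scatter ordered indices back to original dims
            id[order[1]] = j;
            std::printf("visit (%d, %d)\n", id[0], id[1]);
        }
}
```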
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp

@@ -59,7 +59,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     constexpr index_t B = (N * Ho * Wo) / (N1 * N2);
 
-#if 0
+#if 1
     // each thread hold 64 data
     constexpr index_t BlockSize = 256;
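Flipping `#if 0` to `#if 1` enables this tuning configuration in the driver. Per the in-code comment, each thread holds 64 data; with BlockSize = 256 that amounts to 256 × 64 = 16384 values per workgroup.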