Commit 9d99a580 (yangql / composable_kernel-1), authored Aug 07, 2019 by Chao Liu
Parent: 1b3c2e40

adding ThreadwiseGenericTensorSliceCopy_v1r2

Showing 4 changed files with 278 additions and 50 deletions:
    composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp  (+15, -5)
    composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp  (+45, -24)
    composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp  (+213, -16)
    driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp  (+5, -5)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp

@@ -449,18 +449,28 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
                 arithmetic_sequence_gen<0, 8, 1>::type{},
                 Number<1>{});
-#elif 1
-        ThreadwiseGenericTensorSliceCopy_v1<
+#elif 0
+        ThreadwiseGenericTensorSliceCopy_v1r1<
             decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
             decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
             decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths()),
             arithmetic_sequence_gen<0, 8, 1>::type,
             arithmetic_sequence_gen<0, 8, 1>::type,
-            0,
-            0,
+            7,
+            7,
             1,
-            1>({0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
+            1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
             .Run(p_out_thread, p_out_thread_on_global);
+#elif 1
+        ThreadwiseGenericTensorSliceCopy_v1r2<
+            decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
+            decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
+            decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths()),
+            arithmetic_sequence_gen<0, 8, 1>::type,
+            7,
+            1,
+            1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
+            .Run_non_static(p_out_thread, p_out_thread_on_global);
 #elif 0
         ThreadwiseGenericTensorSliceCopy_v2<
             decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
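In the new ThreadwiseGenericTensorSliceCopy_v1r2 call above, the trailing arguments 7, 1, 1 name dimension 7 of the 8-d thread tensor as the vector-access dimension with one element per access on the src and dst side, and the static_asserts touched in threadwise_generic_tensor_slice_copy.hpp permit a wider access only when that dimension has unit stride. Below is a minimal host-side sketch of that rule; the stride values and the helper vector_access_ok are illustrative, not taken from the kernel.

#include <array>
#include <cassert>

// Sketch only: mirrors the "vectorized access is allowed only if stride == 1"
// static_assert in the diff. The strides below are made up for illustration.
constexpr bool vector_access_ok(const std::array<int, 8>& strides,
                                int vector_access_dim,
                                int data_per_access)
{
    // a multi-element access along a dimension is contiguous only if that
    // dimension has unit stride; a scalar access (1 element) is always fine
    return strides[vector_access_dim] == 1 || data_per_access == 1;
}

int main()
{
    constexpr std::array<int, 8> strides{128, 64, 32, 16, 8, 4, 2, 1};

    static_assert(vector_access_ok(strides, 7, 4), "dim 7 has stride 1, can be vectorized");
    static_assert(vector_access_ok(strides, 3, 1), "one element per access is always allowed");
    static_assert(!vector_access_ok(strides, 3, 4), "dim 3 has stride 16, no vector access");

    assert(vector_access_ok(strides, 7, 1)); // the 7, 1, 1 case used in the call above
    return 0;
}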
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp

@@ -245,7 +245,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             // dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on
             // the merged dimension need to be 1. These sanity checks are performed in constructor
             // of BlockwiseGenericTensorSliceCopy_v1
-#if 0 // debug
+#if 0
             threadwise_generic_tensor_slice_copy_v1(SrcDesc{},
                                                     p_src + src_offset + mThreadSrcOffset,
                                                     make_zero_array<index_t, nDim>(),

@@ -255,18 +255,28 @@ struct BlockwiseGenericTensorSliceCopy_v1
                                                     thread_sub_tensor_lengths,
                                                     SrcDimAccessOrder{},
                                                     Number<SrcDataPerAccess>{});
-#else
-            ThreadwiseGenericTensorSliceCopy_v1<SrcDesc,
-                                                decltype(thread_buffer_desc),
-                                                SubLengths,
-                                                SrcDimAccessOrder,
-                                                typename arithmetic_sequence_gen<0, nDim, 1>::type,
-                                                SrcVectorAccessDim,
-                                                0,
-                                                SrcDataPerAccess,
-                                                1>(make_zero_array<index_t, nDim>(),
-                                                   make_zero_array<index_t, nDim>())
+#elif 0
+            ThreadwiseGenericTensorSliceCopy_v1r1<
+                SrcDesc,
+                decltype(thread_buffer_desc),
+                SubLengths,
+                SrcDimAccessOrder,
+                typename arithmetic_sequence_gen<0, nDim, 1>::type,
+                SrcVectorAccessDim,
+                0,
+                SrcDataPerAccess,
+                1>(make_zero_array<index_t, nDim>(),
+                   make_zero_array<index_t, nDim>())
                 .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
+#elif 1
+            ThreadwiseGenericTensorSliceCopy_v1r2<SrcDesc,
+                                                  decltype(thread_buffer_desc),
+                                                  SubLengths,
+                                                  SrcDimAccessOrder,
+                                                  SrcVectorAccessDim,
+                                                  SrcDataPerAccess,
+                                                  1>(make_zero_array<index_t, nDim>(),
+                                                     make_zero_array<index_t, nDim>())
+                .Run_non_static(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
 #endif
         });
     }

@@ -312,7 +322,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             // By setting SubLengths = 1 at the merged dimension, this is always true;
             // If in the future, you want to enable SubLengths > 1 at the merged dimension,
             // special care in implementation is needed
-#if 0 // debug
+#if 0
             threadwise_generic_tensor_slice_copy_v1(thread_buffer_desc,
                                                     p_buffer + buffer_offset,
                                                     make_zero_array<index_t, nDim>(),

@@ -322,18 +332,29 @@ struct BlockwiseGenericTensorSliceCopy_v1
                                                     thread_sub_tensor_lengths,
                                                     DstDimAccessOrder{},
                                                     Number<DstDataPerAccess>{});
-#else
-            ThreadwiseGenericTensorSliceCopy_v1<decltype(thread_buffer_desc),
-                                                DstDesc,
-                                                SubLengths,
-                                                typename arithmetic_sequence_gen<0, nDim, 1>::type,
-                                                DstDimAccessOrder,
-                                                0,
-                                                DstVectorAccessDim,
-                                                1,
-                                                DstDataPerAccess>(make_zero_array<index_t, nDim>(),
-                                                                  make_zero_array<index_t, nDim>())
+#elif 0
+            ThreadwiseGenericTensorSliceCopy_v1r1<
+                decltype(thread_buffer_desc),
+                DstDesc,
+                SubLengths,
+                typename arithmetic_sequence_gen<0, nDim, 1>::type,
+                DstDimAccessOrder,
+                0,
+                DstVectorAccessDim,
+                1,
+                DstDataPerAccess>(make_zero_array<index_t, nDim>(),
+                                  make_zero_array<index_t, nDim>())
                 .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
+#elif 1
+            ThreadwiseGenericTensorSliceCopy_v1r2<decltype(thread_buffer_desc),
+                                                  DstDesc,
+                                                  SubLengths,
+                                                  DstDimAccessOrder,
+                                                  DstVectorAccessDim,
+                                                  1,
+                                                  DstDataPerAccess>(make_zero_array<index_t, nDim>(),
+                                                                    make_zero_array<index_t, nDim>())
+                .Run_non_static(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
 #endif
         });
     }
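Both blockwise branches above push the per-thread copy through either Run, which drives the iteration with static_ford so every index is a compile-time constant, or the new Run_non_static, which walks the same access lengths with an ordinary runtime loop (ford). The sketch below illustrates that distinction with a hand-rolled one-dimensional static_for; it is a simplified stand-in, not the library's static_ford/ford.

#include <cstdio>
#include <type_traits>

// compile-time loop: the body receives the index as an integral_constant, so it
// can be used wherever a constant expression is required (offsets, template args)
template <int Begin, int End, class F>
constexpr void static_for(F&& f)
{
    if constexpr (Begin < End)
    {
        f(std::integral_constant<int, Begin>{});
        static_for<Begin + 1, End>(f);
    }
}

int main()
{
    int buffer[4] = {};

    // "Run"-style: fully unrolled, index known at compile time
    static_for<0, 4>([&](auto i) { buffer[i] = i * 10; });

    // "Run_non_static"-style: plain runtime loop, index known only at run time
    for(int i = 0; i < 4; ++i)
        std::printf("buffer[%d] = %d\n", i, buffer[i]);

    return 0;
}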
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp

@@ -106,7 +106,15 @@ __device__ void threadwise_generic_tensor_slice_copy_v1(
 #endif
 }
 
+#if 1
+// This threadwise copy allow vector access of src and dst.
+// It allows the dimensions of vector access to be different on src and dst.
+// It also allows the vector size to be different on src and dst.
+// It also allows order of access to be different on src and dst.
+// It use register as buffer to hold all data moving from src to dst.
+// It is designed for copying small amount of data, and src and dst are
+// device memory or LDS.
+// When copying large amout of data, let's hope compiler will reduce register
+// used for the buffer.
 template <class SrcDesc,
           class DstDesc,
           class SliceLengths,

@@ -116,12 +124,12 @@ template <class SrcDesc,
           index_t DstVectorAccessDim,
           index_t SrcDataPerAccess,
           index_t DstDataPerAccess>
-struct ThreadwiseGenericTensorSliceCopy_v1
+struct ThreadwiseGenericTensorSliceCopy_v1r1
 {
     static constexpr index_t nDim = SliceLengths::GetSize();
 
-    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_slice_origin,
-                                                             Array<index_t, nDim> dst_slice_origin)
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1(
+        Array<index_t, nDim> src_slice_origin, Array<index_t, nDim> dst_slice_origin)
         : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
     {
         static_assert(nDim == SrcDesc::GetNumOfDimension() &&

@@ -145,7 +153,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1
         static_if<!SrcDesc::ContainMultipleOriginalDimensions(src_vector_access_dim)>{}(
             [&](auto fwd) {
                 static_assert(
-                    (fwd(SrcDesc{}).GetStrides()[SrcVectorAccessDim] == 1 || SrcDataPerAccess == 1),
+                    (fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1),
                     "wrong! vectorized access is allowed only if stride == 1");
             })
             .Else([&](auto fwd) {

@@ -158,7 +166,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1
         static_if<!DstDesc::ContainMultipleOriginalDimensions(dst_vector_access_dim)>{}(
             [&](auto fwd) {
                 static_assert(
-                    (fwd(DstDesc{}).GetStrides()[DstVectorAccessDim] == 1 || DstDataPerAccess == 1),
+                    (fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1),
                     "wrong! vectorized access is allowed only if stride == 1");
             })
             .Else([&](auto fwd) {

@@ -169,9 +177,9 @@ struct ThreadwiseGenericTensorSliceCopy_v1
         });
     }
 
-    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1()
-        : ThreadwiseGenericTensorSliceCopy_v1(make_zero_array<index_t, nDim>(),
-                                              make_zero_array<index_t, nDim>())
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1()
+        : ThreadwiseGenericTensorSliceCopy_v1r1(make_zero_array<index_t, nDim>(),
+                                                make_zero_array<index_t, nDim>())
     {
     }

@@ -205,12 +213,12 @@ struct ThreadwiseGenericTensorSliceCopy_v1
             SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
 
         static_ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
-            constexpr auto src_data_id = src_access_id.Modify(
+            constexpr auto src_data_begin_id = src_access_id.Modify(
                 src_vector_access_dim,
                 src_access_id[src_vector_access_dim] * src_data_per_access);
 
             const index_t src_offset =
-                SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_id);
+                SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id);
 
             // load vector from src
             const vector_t vector_data = *reinterpret_cast<const vector_t*>(&p_src[src_offset]);

@@ -222,7 +230,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1
                 i);
 
                 constexpr index_t buffer_offset =
-                    buffer_desc.GetOffsetFromMultiIndex(src_data_id + scalar_id);
+                    buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id);
 
                 p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
             });

@@ -241,7 +249,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1
             SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
 
         static_ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
-            constexpr auto dst_data_id = dst_access_id.Modify(
+            constexpr auto dst_data_begin_id = dst_access_id.Modify(
                 dst_vector_access_dim,
                 dst_access_id[dst_vector_access_dim] * dst_data_per_access);

@@ -254,13 +262,13 @@ struct ThreadwiseGenericTensorSliceCopy_v1
                 i);
 
                 constexpr index_t buffer_offset =
-                    buffer_desc.GetOffsetFromMultiIndex(dst_data_id + scalar_id);
+                    buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id);
 
                 reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
             });
 
             const index_t dst_offset =
-                DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_id);
+                DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id);
 
             // store vector into dst
             *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = vector_data;

@@ -272,7 +280,196 @@ struct ThreadwiseGenericTensorSliceCopy_v1
     Array<index_t, nDim> mSrcSliceOrigin;
     Array<index_t, nDim> mDstSliceOrigin;
 };
+#endif
+
+// This threadwise copy allow vector access of src and dst.
+// It allows the vector size to be different on src and dst.
+// The dimensions of vector access should be the same on src and dst.
+// The dimension access order should be the same on src and dst.
+// It is designed for cases, where one of src and dst is register, and
+// the other is device memory or LDS
+template <class SrcDesc,
+          class DstDesc,
+          class SliceLengths,
+          class DimAccessOrder,
+          index_t VectorAccessDim,
+          index_t SrcDataPerAccess,
+          index_t DstDataPerAccess>
+struct ThreadwiseGenericTensorSliceCopy_v1r2
+{
+    static constexpr index_t nDim = SliceLengths::GetSize();
+
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2(
+        Array<index_t, nDim> src_slice_origin, Array<index_t, nDim> dst_slice_origin)
+        : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
+    {
+        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
+                          nDim == DstDesc::GetNumOfDimension() &&
+                          nDim == SliceLengths::GetSize() && nDim == DimAccessOrder::GetSize(),
+                      "wrong! # of dimensions not the same");
+
+        static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");
+
+        static_assert(SliceLengths{}[VectorAccessDim] %
+                              math::lcm(SrcDataPerAccess, DstDataPerAccess) ==
+                          0,
+                      "wrong! cannot evenly divide");
+
+        // check vectorized memory access
+        constexpr auto vector_access_dim = Number<VectorAccessDim>{};
+
+        static_if<!SrcDesc::ContainMultipleOriginalDimensions(vector_access_dim)>{}([&](auto fwd) {
+            static_assert(
+                (fwd(SrcDesc{}).GetStride(vector_access_dim) == 1 || SrcDataPerAccess == 1),
+                "wrong! vectorized access is allowed only if stride == 1");
+        }).Else([&](auto fwd) {
+            static_assert((fwd(SrcDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 ||
+                           SrcDataPerAccess == 1),
+                          "wrong! vectorized access is allowed only if stride == 1");
+        });
+
+        static_if<!DstDesc::ContainMultipleOriginalDimensions(vector_access_dim)>{}([&](auto fwd) {
+            static_assert(
+                (fwd(DstDesc{}).GetStride(vector_access_dim) == 1 || DstDataPerAccess == 1),
+                "wrong! vectorized access is allowed only if stride == 1");
+        }).Else([&](auto fwd) {
+            static_assert((fwd(DstDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 ||
+                           DstDataPerAccess == 1),
+                          "wrong! vectorized access is allowed only if stride == 1");
+        });
+    }
+
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2()
+        : ThreadwiseGenericTensorSliceCopy_v1r2(make_zero_array<index_t, nDim>(),
+                                                make_zero_array<index_t, nDim>())
+    {
+    }
+
+    __device__ void SetSrcSliceOrigin(Array<index_t, nDim> src_slice_origin)
+    {
+        mSrcSliceOrigin = src_slice_origin;
+    }
+
+    __device__ void SetDstSliceOrigin(Array<index_t, nDim> dst_slice_origin)
+    {
+        mDstSliceOrigin = dst_slice_origin;
+    }
+
+    template <class TData>
+    __device__ void Run(const TData* p_src, TData* p_dst) const
+    {
+        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
+        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+
+        constexpr auto vector_access_dim = Number<VectorAccessDim>{};
+
+        constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
+        constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
+
+        constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};
+
+        constexpr auto long_vector_access_lengths = SliceLengths::Modify(
+            vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
+
+        static_ford<decltype(long_vector_access_lengths), DimAccessOrder>{}(
+            [&](auto long_vector_access_id) {
+
+                // data id w.r.t slicing-window
+                constexpr auto long_vector_data_begin_id = long_vector_access_id.Modify(
+                    vector_access_dim, long_vector_access_id[vector_access_dim] * long_vector_size);
+
+                // buffer to hold a long-vector
+                TData p_long_vector[long_vector_size];
+
+                // load data from src to the long-vector buffer
+                static_for<0, long_vector_size / src_data_per_access, 1>{}([&](auto i) {
+                    constexpr auto scalar_id = typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
+                        vector_access_dim, i * src_data_per_access);
+
+                    const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(
+                        mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id));
+
+                    constexpr index_t buffer_offset = i * src_data_per_access;
+
+                    *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                        *reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
+                });
+
+                // store data from the long-vector buffer to dst
+                static_for<0, long_vector_size / dst_data_per_access, 1>{}([&](auto i) {
+                    constexpr auto scalar_id = typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
+                        vector_access_dim, i * dst_data_per_access);
+
+                    constexpr index_t buffer_offset = i * dst_data_per_access;
+
+                    const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(
+                        mDstSliceOrigin + (long_vector_data_begin_id + scalar_id));
+
+                    *reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
+                        *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
+                });
+            });
+    }
+
+    template <class TData>
+    __device__ void Run_non_static(const TData* p_src, TData* p_dst) const
+    {
+        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
+        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+
+        constexpr auto vector_access_dim = Number<VectorAccessDim>{};
+
+        constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
+        constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
+
+        constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};
+
+        constexpr auto long_vector_access_lengths = SliceLengths::Modify(
+            vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
+
+        ford<decltype(long_vector_access_lengths), DimAccessOrder>{}(
+            [&](auto long_vector_access_id) {
+
+                // data id w.r.t slicing-window
+                auto long_vector_data_begin_id = long_vector_access_id;
+                long_vector_data_begin_id(vector_access_dim) =
+                    long_vector_size * long_vector_access_id[vector_access_dim];
+
+                // buffer to hold a long-vector
+                TData p_long_vector[long_vector_size];
+
+                // load data from src to the long-vector buffer
+                for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
+                {
+                    auto scalar_id               = make_zero_array<index_t, nDim>();
+                    scalar_id(vector_access_dim) = i * src_data_per_access;
+
+                    const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(
+                        mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id));
+
+                    const index_t buffer_offset = i * src_data_per_access;
+
+                    *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                        *reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
+                }
+
+                // store data from the long-vector buffer to dst
+                for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
+                {
+                    auto scalar_id               = make_zero_array<index_t, nDim>();
+                    scalar_id(vector_access_dim) = i * dst_data_per_access;
+
+                    const index_t buffer_offset = i * dst_data_per_access;
+
+                    const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(
+                        mDstSliceOrigin + (long_vector_data_begin_id + scalar_id));
+
+                    *reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
+                        *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
+                }
+            });
+    }
+
+    private:
+    Array<index_t, nDim> mSrcSliceOrigin;
+    Array<index_t, nDim> mDstSliceOrigin;
+};
+
 template <class SrcDesc,
           class DstDesc,
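The comment block introducing ThreadwiseGenericTensorSliceCopy_v1r2 above describes a copy that stages lcm(SrcDataPerAccess, DstDataPerAccess) elements in a small register buffer, filling it with src-sized reads and draining it with dst-sized writes. Below is a plain host-side sketch of that staging scheme along a single dimension; copy_1d and the sizes are illustrative, assume the length divides evenly, and are not the device code above.

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

// Sketch only: one-dimensional analogue of the long-vector staging in Run /
// Run_non_static. SrcDataPerAccess and DstDataPerAccess play the same role as
// the template parameters of the same name; length must be a multiple of their lcm.
template <int SrcDataPerAccess, int DstDataPerAccess>
void copy_1d(const float* p_src, float* p_dst, int length)
{
    constexpr int long_vector_size = std::lcm(SrcDataPerAccess, DstDataPerAccess);

    for(int begin = 0; begin < length; begin += long_vector_size)
    {
        float p_long_vector[long_vector_size]; // stand-in for the register buffer

        // load data from src to the long-vector buffer, SrcDataPerAccess at a time
        for(int i = 0; i < long_vector_size / SrcDataPerAccess; ++i)
            std::copy_n(p_src + begin + i * SrcDataPerAccess,
                        SrcDataPerAccess,
                        p_long_vector + i * SrcDataPerAccess);

        // store data from the long-vector buffer to dst, DstDataPerAccess at a time
        for(int i = 0; i < long_vector_size / DstDataPerAccess; ++i)
            std::copy_n(p_long_vector + i * DstDataPerAccess,
                        DstDataPerAccess,
                        p_dst + begin + i * DstDataPerAccess);
    }
}

int main()
{
    std::vector<float> src(16), dst(16, 0.0f);
    for(int i = 0; i < 16; ++i)
        src[i] = float(i);

    copy_1d<4, 1>(src.data(), dst.data(), 16); // e.g. 4-wide reads, scalar writes

    std::printf("dst[15] = %g\n", dst[15]);
    return 0;
}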
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp

@@ -59,7 +59,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     constexpr index_t B = (N * Ho * Wo) / (N1 * N2);
 
-#if 1
+#if 0
     // each thread hold 64 data
     constexpr index_t BlockSize = 256;

@@ -112,14 +112,14 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     constexpr index_t GemmDataPerReadA = 4;
     constexpr index_t GemmDataPerReadB = 4;
 
-    using InBlockCopySubLengths_E_N1_B_N2      = Sequence<1, 1, 1, 4>;
-    using InBlockCopyClusterLengths_E_N1_B_N2  = Sequence<8, 2, 16, 1>;
+    using InBlockCopySubLengths_E_N1_B_N2      = Sequence<1, 1, 4, 1>;
+    using InBlockCopyClusterLengths_E_N1_B_N2  = Sequence<8, 2, 4, 4>;
     using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
     using InBlockCopySrcAccessOrder            = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
     using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]
 
-    constexpr index_t InBlockCopySrcDataPerRead_B   = 1;
-    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;
+    constexpr index_t InBlockCopySrcDataPerRead_B   = 4;
+    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 1;
 
     using WeiBlockCopySubLengths_E_K     = Sequence<2, 2>;
     using WeiBlockCopyClusterLengths_E_K = Sequence<4, 64>;
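The driver change keeps the same [E, N1, B, N2] block slice and the same 256-thread workgroup shown in the surrounding context; it only moves the per-thread work and the vectorized access from N2 (4-wide writes, old) to B (4-wide reads, new). A small constexpr sketch of that consistency check, using the values from the diff; the helper functions are illustrative.

#include <array>

// Sketch only: per dimension, SubLengths (elements per thread) times
// ClusterLengths (threads along that dimension) must tile the block's
// [E, N1, B, N2] slice, and the cluster must use the whole workgroup.
constexpr std::array<int, 4> mul(const std::array<int, 4>& a, const std::array<int, 4>& b)
{
    return {a[0] * b[0], a[1] * b[1], a[2] * b[2], a[3] * b[3]};
}

constexpr bool same(const std::array<int, 4>& a, const std::array<int, 4>& b)
{
    return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3];
}

constexpr int product(const std::array<int, 4>& a) { return a[0] * a[1] * a[2] * a[3]; }

// old decomposition: per-thread work and vectorization on N2
constexpr std::array<int, 4> sub_old{1, 1, 1, 4}, cluster_old{8, 2, 16, 1};
// new decomposition: per-thread work and vectorization on B
constexpr std::array<int, 4> sub_new{1, 1, 4, 1}, cluster_new{8, 2, 4, 4};

static_assert(same(mul(sub_old, cluster_old), mul(sub_new, cluster_new)),
              "both variants tile the same [E, N1, B, N2] block slice");
static_assert(product(cluster_old) == 256 && product(cluster_new) == 256,
              "both variants use the full 256-thread workgroup (BlockSize)");

int main() { return 0; }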