gaoqiong / composable_kernel / Commits / 43c22b57
Commit 43c22b57
Authored Feb 25, 2022 by Jianfeng Yan
Parent: a9b1061c

    threadwise_copy_v6r1/v6r2/v6r3 using space-filling curve start to work

Showing 10 changed files with 773 additions and 9 deletions (+773 -9):
  +2    -1   composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer_v6r1.hpp
  +2    -1   composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer_v6r2.hpp
  +2    -1   composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer_v6r3.hpp
  +1    -1   composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp
  +1    -5   composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_sfcurve.hpp (renamed)
  +201  -0   composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r1_sfcurve.hpp (new)
  +253  -0   composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r2_sfcurve.hpp (new)
  +303  -0   composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r3_sfcurve.hpp (new)
  +4    -0   composable_kernel/include/utility/tensor_space_filling_curve.hpp
  +4    -0   example/conv2d_fwd_xdl_sfcurve/conv2d_fwd_xdl_sfcurve.cpp
composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer_v6r1.hpp

@@ -5,7 +5,8 @@
 #include "tensor_descriptor.hpp"
 #include "tensor_descriptor_helper.hpp"
 #include "cluster_descriptor.hpp"
-#include "threadwise_tensor_slice_transfer_v6r1.hpp"
+// #include "threadwise_tensor_slice_transfer_v6r1.hpp"
+#include "threadwise_tensor_slice_transfer_v6r1_sfcurve.hpp"

 namespace ck {
composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer_v6r2.hpp

@@ -5,7 +5,8 @@
 #include "tensor_descriptor.hpp"
 #include "tensor_descriptor_helper.hpp"
 #include "cluster_descriptor.hpp"
-#include "threadwise_tensor_slice_transfer_v6r2.hpp"
+// #include "threadwise_tensor_slice_transfer_v6r2.hpp"
+#include "threadwise_tensor_slice_transfer_v6r2_sfcurve.hpp"

 namespace ck {
composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer_v6r3.hpp

@@ -5,7 +5,8 @@
 #include "tensor_descriptor.hpp"
 #include "tensor_descriptor_helper.hpp"
 #include "cluster_descriptor.hpp"
-#include "threadwise_tensor_slice_transfer_v6r3.hpp"
+// #include "threadwise_tensor_slice_transfer_v6r3.hpp"
+#include "threadwise_tensor_slice_transfer_v6r3_sfcurve.hpp"

 namespace ck {
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp

@@ -7,7 +7,7 @@
 #include "tensor_descriptor_helper.hpp"
 #include "blockwise_gemm_xdlops.hpp"
 #include "blockwise_tensor_slice_transfer_v4r1.hpp"
-#include "threadwise_tensor_slice_transfer_using_space_filling_curve.hpp"
+#include "threadwise_tensor_slice_transfer_sfcurve.hpp"
 #include "gridwise_gemm_pipeline_v1.hpp"

 namespace ck {
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_using_space_filling_curve.hpp
→ composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_sfcurve.hpp (renamed)
@@ -38,8 +38,6 @@ struct ThreadwiseTensorSliceTransfer_v1r3_using_space_filling_curve
     using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));

-    using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));
-
     __device__ constexpr ThreadwiseTensorSliceTransfer_v1r3_using_space_filling_curve(
         const DstDesc& dst_desc,
         const Index& dst_slice_origin_idx,
@@ -166,8 +164,6 @@ struct ThreadwiseTensorSliceTransfer_v1r3_using_space_filling_curve
...
@@ -166,8 +164,6 @@ struct ThreadwiseTensorSliceTransfer_v1r3_using_space_filling_curve
__device__
static
constexpr
auto
GetDstCoordinateResetStep
()
__device__
static
constexpr
auto
GetDstCoordinateResetStep
()
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
dst_scalar_per_access
=
generate_sequence
(
constexpr
auto
dst_scalar_per_access
=
generate_sequence
(
detail
::
lambda_scalar_per_access
<
DstVectorDim
,
DstScalarPerVector
>
{},
Number
<
nDim
>
{});
detail
::
lambda_scalar_per_access
<
DstVectorDim
,
DstScalarPerVector
>
{},
Number
<
nDim
>
{});
...
@@ -177,7 +173,7 @@ struct ThreadwiseTensorSliceTransfer_v1r3_using_space_filling_curve
...
@@ -177,7 +173,7 @@ struct ThreadwiseTensorSliceTransfer_v1r3_using_space_filling_curve
constexpr
auto
num_accesses
=
SpaceFillingCurve
::
GetNumOfAccess
();
constexpr
auto
num_accesses
=
SpaceFillingCurve
::
GetNumOfAccess
();
constexpr
auto
reset_step
=
constexpr
auto
reset_step
=
SpaceFillingCurve
::
GetStepBetween
(
Number
<
num_accesses
-
1
>
{},
I0
);
SpaceFillingCurve
::
GetStepBetween
(
Number
<
num_accesses
-
1
>
{},
Number
<
0
>
{}
);
return
reset_step
;
return
reset_step
;
}
}
...
...
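The replacement argument above removes the hunk's dependence on the deleted I0 constant: GetStepBetween(Number<num_accesses - 1>{}, Number<0>{}) is the multi-index step that carries a coordinate from the last point on the curve back to the first. A standalone host-side sketch of that idea, using a hand-rolled 2-D snake curve (all names and the curve shape below are illustrative assumptions, not the library's SpaceFillingCurve):

#include <array>
#include <cstdio>

// Hypothetical snake curve over a LenY x LenX slice: even rows run left to right,
// odd rows right to left, so consecutive accesses differ in exactly one dimension.
constexpr int LenY = 3, LenX = 4;

std::array<int, 2> curve_point(int i) // i-th access -> (y, x)
{
    const int y = i / LenX;
    const int x = (y % 2 == 0) ? (i % LenX) : (LenX - 1 - i % LenX);
    return {y, x};
}

int main()
{
    const int num_accesses = LenY * LenX;
    // The "reset step" is the step between the last access and access 0,
    // i.e. what GetStepBetween(last, first) computes in the real code.
    const auto last  = curve_point(num_accesses - 1);
    const auto first = curve_point(0);
    std::printf("reset step = (%d, %d)\n", first[0] - last[0], first[1] - last[1]);
    return 0;
}

For LenY = 3 and LenX = 4 this prints reset step = (-2, -3): a single hop undoes the whole traversal, which is how Run() restores a coordinate when a ResetCoordinateAfterRun flag is set.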
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r1_sfcurve.hpp (new file, 0 → 100644)

#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R1_SFCURVE_HPP
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R1_SFCURVE_HPP

#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_space_filling_curve.hpp"

namespace ck {

// Do the following things to avoid "alloca" in LLVM-IR, which would cause scratch memory
// and sometimes useless instructions:
//   1. Don't save a reference to a tensor descriptor in the class; pass the descriptor in as an
//      argument instead
//   2. Don't construct a new tensor coordinate every time it is used; update and reuse the same
//      tensor coordinate instead
//   3. Don't use a pointer to a VGPR buffer; use a vector instead
//
// Assume:
//   1. src_desc and dst_desc are not known at compile-time
//   2. SrcBuffer and DstBuffer are DynamicBuffer
//   3. src_slice_origin and dst_slice_origin are not known at compile-time
template <typename SrcData,
          typename DstData,
          typename SrcDesc,
          typename DstDesc,
          typename ElementwiseOperation,
          typename SliceLengths,
          typename DimAccessOrder,
          index_t VectorDim,
          index_t ScalarPerVector,
          InMemoryDataOperationEnum_t DstInMemOp,
          bool SrcResetCoordinateAfterRun,
          bool DstResetCoordinateAfterRun>
struct ThreadwiseTensorSliceTransfer_v6r1
{
    static constexpr index_t nDim = SliceLengths::Size();

    using Index = MultiIndex<nDim>;

    using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
    using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));

    static constexpr auto I0 = Number<0>{};

    __device__ constexpr ThreadwiseTensorSliceTransfer_v6r1(const SrcDesc& src_desc,
                                                            const Index& src_slice_origin,
                                                            const DstDesc& dst_desc,
                                                            const Index& dst_slice_origin,
                                                            const ElementwiseOperation& element_op)
        : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)),
          dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)),
          element_op_(element_op)
    {
        static_assert(SliceLengths::At(Number<VectorDim>{}) % ScalarPerVector == 0,
                      "wrong! cannot evenly divide");
    }

    __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
    {
        src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx);
    }

    __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
    {
        dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
    }

    template <typename SrcBuffer, typename DstBuffer>
    __device__ void
    Run(const SrcDesc& src_desc, const SrcBuffer& src_buf, const DstDesc& dst_desc, DstBuffer& dst_buf)
    {
        // scalar per access on each dim
        // TODO: don't use lambda_scalar_per_access
        constexpr auto scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});

        using SpaceFillingCurve =
            SpaceFillingCurve<SliceLengths, DimAccessOrder, remove_cv_t<decltype(scalar_per_access)>>;

        // loop over space-filling curve
        constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();

        static_for<0, num_accesses, 1>{}([&](auto idx_1d) {
            using src_vector_type = vector_type_maker_t<SrcData, ScalarPerVector>;
            using src_vector_t    = typename src_vector_type::type;

            using dst_vector_type = vector_type_maker_t<DstData, ScalarPerVector>;
            using dst_vector_t    = typename dst_vector_type::type;

            const bool is_src_valid =
                coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);

            // copy data from src_buf into src_vector_container
            auto src_vector_container = src_vector_type{
                src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid)};

            auto dst_vector_container = dst_vector_type{};

            // apply pointwise operation
            static_for<0, ScalarPerVector, 1>{}([&](auto i) {
                element_op_(dst_vector_container.template AsType<DstData>()(i),
                            src_vector_container.template AsType<SrcData>()[i]);
            });

            const bool is_dst_valid =
                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);

            // copy data from dst_vector into dst_buf
            dst_buf.template Transfer<DstInMemOp, dst_vector_t>(
                dst_coord_.GetOffset(),
                is_dst_valid,
                dst_vector_container.template AsType<dst_vector_t>()[I0]);

            // move coordinate
            if constexpr(idx_1d.value != num_accesses - 1)
            {
                constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);

                move_tensor_coordinate(
                    src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step));
                move_tensor_coordinate(
                    dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
            }
        });

        // move coordinate back to slice origin (or not)
        if constexpr(SrcResetCoordinateAfterRun)
        {
            const auto src_reset_step =
                make_tensor_coordinate_step(src_desc, GetCoordinateResetStep());

            move_tensor_coordinate(src_desc, src_coord_, src_reset_step);
        }

        if constexpr(DstResetCoordinateAfterRun)
        {
            const auto dst_reset_step =
                make_tensor_coordinate_step(dst_desc, GetCoordinateResetStep());

            move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
        }
    }

    __device__ static constexpr auto GetCoordinateResetStep()
    {
        constexpr auto scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});

        using SpaceFillingCurve =
            SpaceFillingCurve<SliceLengths, DimAccessOrder, remove_cv_t<decltype(scalar_per_access)>>;

        constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();

        constexpr auto reset_step =
            SpaceFillingCurve::GetStepBetween(Number<num_accesses - 1>{}, Number<0>{});

        return reset_step;
    }

    // src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
                                       const Index& src_slice_origin_step_idx)
    {
        // if the src coord was not reset by Run(), then the step needs to be adjusted here
        const auto adjusted_step_idx = SrcResetCoordinateAfterRun
                                           ? src_slice_origin_step_idx
                                           : src_slice_origin_step_idx + GetCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);

        move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
    }

    // dst_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
                                       const Index& dst_slice_origin_step_idx)
    {
        // if the dst coord was not reset by Run(), then the step needs to be adjusted here
        const auto adjusted_step_idx = DstResetCoordinateAfterRun
                                           ? dst_slice_origin_step_idx
                                           : dst_slice_origin_step_idx + GetCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);

        move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
    }

    private:
    SrcCoord src_coord_;
    DstCoord dst_coord_;
    const ElementwiseOperation element_op_;
};

} // namespace ck
#endif
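The access pattern that Run() walks can be previewed offline. This host-only sketch (a plain snake ordering, assumed here for illustration; the library's SpaceFillingCurve may order accesses differently) prints the starting index of each vectorized access for a 2 x 8 slice with ScalarPerVector = 4 on the last dimension, showing that consecutive accesses stay adjacent, which keeps every move_tensor_coordinate step small:

#include <cstdio>

int main()
{
    // Slice of 2 x 8 scalars, vectors of 4 along dim 1:
    // the curve visits 2 x 2 = 4 vector positions, snaking through dim 1.
    constexpr int len0 = 2, len1 = 8, scalar_per_vector = 4;
    constexpr int n1 = len1 / scalar_per_vector; // vectorized extent of dim 1

    for(int i = 0; i < len0 * n1; ++i)
    {
        const int d0 = i / n1;
        const int d1 = (d0 % 2 == 0) ? (i % n1) : (n1 - 1 - i % n1); // snake turn
        std::printf("access %d -> start index (%d, %d)\n", i, d0, d1 * scalar_per_vector);
    }
    return 0;
}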
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r2_sfcurve.hpp (new file, 0 → 100644)

#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R2_SFCURVE_HPP
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R2_SFCURVE_HPP

#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_space_filling_curve.hpp"

namespace ck {

// Do the following things to avoid "alloca" in LLVM-IR, which would cause scratch memory
// and sometimes useless instructions:
//   1. Don't save a reference to a tensor descriptor in the class; pass the descriptor in as an
//      argument instead
//   2. Don't construct a new tensor coordinate every time it is used; update and reuse the same
//      tensor coordinate instead
//   3. Don't use a pointer to a VGPR buffer; use a vector instead
//
// Assume:
//   1. src0_desc, src1_desc and dst_desc are not known at compile-time
//   2. SrcBuffer and DstBuffer are DynamicBuffer
//   3. the slice origins are not known at compile-time
template <typename Src0Data,
          typename Src1Data,
          typename DstData,
          typename Src0Desc,
          typename Src1Desc,
          typename DstDesc,
          typename ElementwiseOperation,
          typename SliceLengths,
          typename DimAccessOrder,
          index_t VectorDim,
          index_t ScalarPerVector,
          InMemoryDataOperationEnum_t DstInMemOp,
          bool Src0ResetCoordinateAfterRun,
          bool Src1ResetCoordinateAfterRun,
          bool DstResetCoordinateAfterRun>
struct ThreadwiseTensorSliceTransfer_v6r2
{
    static constexpr index_t nDim = SliceLengths::Size();

    using Index = MultiIndex<nDim>;

    using Src0Coord = decltype(make_tensor_coordinate(Src0Desc{}, Index{}));
    using Src1Coord = decltype(make_tensor_coordinate(Src1Desc{}, Index{}));
    using DstCoord  = decltype(make_tensor_coordinate(DstDesc{}, Index{}));

    static constexpr auto I0 = Number<0>{};

    __device__ constexpr ThreadwiseTensorSliceTransfer_v6r2(const Src0Desc& src0_desc,
                                                            const Index& src0_slice_origin,
                                                            const Src1Desc& src1_desc,
                                                            const Index& src1_slice_origin,
                                                            const DstDesc& dst_desc,
                                                            const Index& dst_slice_origin,
                                                            const ElementwiseOperation& element_op)
        : src0_coord_(make_tensor_coordinate(src0_desc, src0_slice_origin)),
          src1_coord_(make_tensor_coordinate(src1_desc, src1_slice_origin)),
          dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)),
          element_op_(element_op)
    {
        static_assert(SliceLengths::At(Number<VectorDim>{}) % ScalarPerVector == 0,
                      "wrong! cannot evenly divide");
    }

    __device__ void SetSrc0SliceOrigin(const Src0Desc& src0_desc,
                                       const Index& src0_slice_origin_idx)
    {
        src0_coord_ = make_tensor_coordinate(src0_desc, src0_slice_origin_idx);
    }

    __device__ void SetSrc1SliceOrigin(const Src1Desc& src1_desc,
                                       const Index& src1_slice_origin_idx)
    {
        src1_coord_ = make_tensor_coordinate(src1_desc, src1_slice_origin_idx);
    }

    __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
    {
        dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
    }

    template <typename Src0Buffer, typename Src1Buffer, typename DstBuffer>
    __device__ void Run(const Src0Desc& src0_desc,
                        const Src0Buffer& src0_buf,
                        const Src1Desc& src1_desc,
                        const Src1Buffer& src1_buf,
                        const DstDesc& dst_desc,
                        DstBuffer& dst_buf)
    {
        // scalar per access on each dim
        // TODO: don't use lambda_scalar_per_access
        constexpr auto scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});

        using SpaceFillingCurve =
            SpaceFillingCurve<SliceLengths, DimAccessOrder, remove_cv_t<decltype(scalar_per_access)>>;

        constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();

        // loop over space-filling curve
        static_for<0, num_accesses, 1>{}([&](auto idx_1d) {
            using src0_vector_type = vector_type_maker_t<Src0Data, ScalarPerVector>;
            using src0_vector_t    = typename src0_vector_type::type;

            using src1_vector_type = vector_type_maker_t<Src1Data, ScalarPerVector>;
            using src1_vector_t    = typename src1_vector_type::type;

            using dst_vector_type = vector_type_maker_t<DstData, ScalarPerVector>;
            using dst_vector_t    = typename dst_vector_type::type;

            const bool is_src0_valid =
                coordinate_has_valid_offset_assuming_visible_index_is_valid(src0_desc, src0_coord_);

            const bool is_src1_valid =
                coordinate_has_valid_offset_assuming_visible_index_is_valid(src1_desc, src1_coord_);

            // copy data from src0_buf and src1_buf into vector containers
            auto src0_vector_container = src0_vector_type{
                src0_buf.template Get<src0_vector_t>(src0_coord_.GetOffset(), is_src0_valid)};

            auto src1_vector_container = src1_vector_type{
                src1_buf.template Get<src1_vector_t>(src1_coord_.GetOffset(), is_src1_valid)};

            auto dst_vector_container = dst_vector_type{};

            // apply pointwise operation
            static_for<0, ScalarPerVector, 1>{}([&](auto i) {
                element_op_(dst_vector_container.template AsType<DstData>()(i),
                            src0_vector_container.template AsType<Src0Data>()[i],
                            src1_vector_container.template AsType<Src1Data>()[i]);
            });

            const bool is_dst_valid =
                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);

            // copy data from dst_vector into dst_buf
            dst_buf.template Transfer<DstInMemOp, dst_vector_t>(
                dst_coord_.GetOffset(),
                is_dst_valid,
                dst_vector_container.template AsType<dst_vector_t>()[I0]);

            // move coordinate
            if constexpr(idx_1d.value != num_accesses - 1)
            {
                constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);

                move_tensor_coordinate(
                    src0_desc, src0_coord_, make_tensor_coordinate_step(src0_desc, forward_step));
                move_tensor_coordinate(
                    src1_desc, src1_coord_, make_tensor_coordinate_step(src1_desc, forward_step));
                move_tensor_coordinate(
                    dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
            }
        });

        // move coordinates back to slice origin (or not)
        if constexpr(Src0ResetCoordinateAfterRun)
        {
            const auto src0_reset_step =
                make_tensor_coordinate_step(src0_desc, GetCoordinateResetStep());

            move_tensor_coordinate(src0_desc, src0_coord_, src0_reset_step);
        }

        if constexpr(Src1ResetCoordinateAfterRun)
        {
            const auto src1_reset_step =
                make_tensor_coordinate_step(src1_desc, GetCoordinateResetStep());

            move_tensor_coordinate(src1_desc, src1_coord_, src1_reset_step);
        }

        if constexpr(DstResetCoordinateAfterRun)
        {
            const auto dst_reset_step =
                make_tensor_coordinate_step(dst_desc, GetCoordinateResetStep());

            move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
        }
    }

    __device__ static constexpr auto GetCoordinateResetStep()
    {
        constexpr auto scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});

        using SpaceFillingCurve =
            SpaceFillingCurve<SliceLengths, DimAccessOrder, remove_cv_t<decltype(scalar_per_access)>>;

        constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();

        constexpr auto reset_step =
            SpaceFillingCurve::GetStepBetween(Number<num_accesses - 1>{}, Number<0>{});

        return reset_step;
    }

    // src0_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveSrc0SliceWindow(const Src0Desc& src0_desc,
                                        const Index& src0_slice_origin_step_idx)
    {
        // if the src0 coord was not reset by Run(), then the step needs to be adjusted here
        const auto adjusted_step_idx = Src0ResetCoordinateAfterRun
                                           ? src0_slice_origin_step_idx
                                           : src0_slice_origin_step_idx + GetCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(src0_desc, adjusted_step_idx);

        move_tensor_coordinate(src0_desc, src0_coord_, adjusted_step);
    }

    // src1_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveSrc1SliceWindow(const Src1Desc& src1_desc,
                                        const Index& src1_slice_origin_step_idx)
    {
        // if the src1 coord was not reset by Run(), then the step needs to be adjusted here
        const auto adjusted_step_idx = Src1ResetCoordinateAfterRun
                                           ? src1_slice_origin_step_idx
                                           : src1_slice_origin_step_idx + GetCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(src1_desc, adjusted_step_idx);

        move_tensor_coordinate(src1_desc, src1_coord_, adjusted_step);
    }

    // dst_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
                                       const Index& dst_slice_origin_step_idx)
    {
        // if the dst coord was not reset by Run(), then the step needs to be adjusted here
        const auto adjusted_step_idx = DstResetCoordinateAfterRun
                                           ? dst_slice_origin_step_idx
                                           : dst_slice_origin_step_idx + GetCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);

        move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
    }

    private:
    Src0Coord src0_coord_;
    Src1Coord src1_coord_;
    DstCoord dst_coord_;
    const ElementwiseOperation element_op_;
};

} // namespace ck
#endif
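Run() invokes the element op once per scalar as element_op_(dst, src0, src1): a writable reference for the destination and read-only values for the two sources. A minimal host-compilable functor with that call shape (an illustrative stand-in, not one of CK's element ops, which also carry __host__ __device__ qualifiers):

#include <cassert>

// Hypothetical binary elementwise op matching the v6r2 call shape:
// element_op_(dst_ref, src0_val, src1_val).
struct Add
{
    template <typename T>
    void operator()(T& dst, const T& src0, const T& src1) const
    {
        dst = src0 + src1;
    }
};

int main()
{
    float d = 0.f;
    Add{}(d, 1.5f, 2.5f);
    assert(d == 4.0f);
    return 0;
}

The v6r1 and v6r3 variants expect the same shape with one and three source values, respectively.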
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r3_sfcurve.hpp (new file, 0 → 100644)

#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R3_SFCURVE_HPP
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R3_SFCURVE_HPP

#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_space_filling_curve.hpp"

namespace ck {

// Do the following things to avoid "alloca" in LLVM-IR, which would cause scratch memory
// and sometimes useless instructions:
//   1. Don't save a reference to a tensor descriptor in the class; pass the descriptor in as an
//      argument instead
//   2. Don't construct a new tensor coordinate every time it is used; update and reuse the same
//      tensor coordinate instead
//   3. Don't use a pointer to a VGPR buffer; use a vector instead
//
// Assume:
//   1. src0_desc, src1_desc, src2_desc and dst_desc are not known at compile-time
//   2. SrcBuffer and DstBuffer are DynamicBuffer
//   3. the slice origins are not known at compile-time
template <typename Src0Data,
          typename Src1Data,
          typename Src2Data,
          typename DstData,
          typename Src0Desc,
          typename Src1Desc,
          typename Src2Desc,
          typename DstDesc,
          typename ElementwiseOperation,
          typename SliceLengths,
          typename DimAccessOrder,
          index_t VectorDim,
          index_t ScalarPerVector,
          InMemoryDataOperationEnum_t DstInMemOp,
          bool Src0ResetCoordinateAfterRun,
          bool Src1ResetCoordinateAfterRun,
          bool Src2ResetCoordinateAfterRun,
          bool DstResetCoordinateAfterRun>
struct ThreadwiseTensorSliceTransfer_v6r3
{
    static constexpr index_t nDim = SliceLengths::Size();

    using Index = MultiIndex<nDim>;

    using Src0Coord = decltype(make_tensor_coordinate(Src0Desc{}, Index{}));
    using Src1Coord = decltype(make_tensor_coordinate(Src1Desc{}, Index{}));
    using Src2Coord = decltype(make_tensor_coordinate(Src2Desc{}, Index{}));
    using DstCoord  = decltype(make_tensor_coordinate(DstDesc{}, Index{}));

    static constexpr auto I0 = Number<0>{};

    __device__ constexpr ThreadwiseTensorSliceTransfer_v6r3(const Src0Desc& src0_desc,
                                                            const Index& src0_slice_origin,
                                                            const Src1Desc& src1_desc,
                                                            const Index& src1_slice_origin,
                                                            const Src2Desc& src2_desc,
                                                            const Index& src2_slice_origin,
                                                            const DstDesc& dst_desc,
                                                            const Index& dst_slice_origin,
                                                            const ElementwiseOperation& element_op)
        : src0_coord_(make_tensor_coordinate(src0_desc, src0_slice_origin)),
          src1_coord_(make_tensor_coordinate(src1_desc, src1_slice_origin)),
          src2_coord_(make_tensor_coordinate(src2_desc, src2_slice_origin)),
          dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)),
          element_op_(element_op)
    {
        static_assert(SliceLengths::At(Number<VectorDim>{}) % ScalarPerVector == 0,
                      "wrong! cannot evenly divide");
    }

    __device__ void SetSrc0SliceOrigin(const Src0Desc& src0_desc,
                                       const Index& src0_slice_origin_idx)
    {
        src0_coord_ = make_tensor_coordinate(src0_desc, src0_slice_origin_idx);
    }

    __device__ void SetSrc1SliceOrigin(const Src1Desc& src1_desc,
                                       const Index& src1_slice_origin_idx)
    {
        src1_coord_ = make_tensor_coordinate(src1_desc, src1_slice_origin_idx);
    }

    __device__ void SetSrc2SliceOrigin(const Src2Desc& src2_desc,
                                       const Index& src2_slice_origin_idx)
    {
        src2_coord_ = make_tensor_coordinate(src2_desc, src2_slice_origin_idx);
    }

    __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
    {
        dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
    }

    template <typename Src0Buffer, typename Src1Buffer, typename Src2Buffer, typename DstBuffer>
    __device__ void Run(const Src0Desc& src0_desc,
                        const Src0Buffer& src0_buf,
                        const Src1Desc& src1_desc,
                        const Src1Buffer& src1_buf,
                        const Src2Desc& src2_desc,
                        const Src2Buffer& src2_buf,
                        const DstDesc& dst_desc,
                        DstBuffer& dst_buf)
    {
        // scalar per access on each dim
        // TODO: don't use lambda_scalar_per_access
        constexpr auto scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});

        using SpaceFillingCurve =
            SpaceFillingCurve<SliceLengths, DimAccessOrder, remove_cv_t<decltype(scalar_per_access)>>;

        constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();

        // loop over space-filling curve
        static_for<0, num_accesses, 1>{}([&](auto idx_1d) {
            using src0_vector_type = vector_type_maker_t<Src0Data, ScalarPerVector>;
            using src0_vector_t    = typename src0_vector_type::type;

            using src1_vector_type = vector_type_maker_t<Src1Data, ScalarPerVector>;
            using src1_vector_t    = typename src1_vector_type::type;

            using src2_vector_type = vector_type_maker_t<Src2Data, ScalarPerVector>;
            using src2_vector_t    = typename src2_vector_type::type;

            using dst_vector_type = vector_type_maker_t<DstData, ScalarPerVector>;
            using dst_vector_t    = typename dst_vector_type::type;

            const bool is_src0_valid =
                coordinate_has_valid_offset_assuming_visible_index_is_valid(src0_desc, src0_coord_);

            const bool is_src1_valid =
                coordinate_has_valid_offset_assuming_visible_index_is_valid(src1_desc, src1_coord_);

            const bool is_src2_valid =
                coordinate_has_valid_offset_assuming_visible_index_is_valid(src2_desc, src2_coord_);

            // copy data from src0_buf, src1_buf and src2_buf into vector containers
            auto src0_vector_container = src0_vector_type{
                src0_buf.template Get<src0_vector_t>(src0_coord_.GetOffset(), is_src0_valid)};

            auto src1_vector_container = src1_vector_type{
                src1_buf.template Get<src1_vector_t>(src1_coord_.GetOffset(), is_src1_valid)};

            auto src2_vector_container = src2_vector_type{
                src2_buf.template Get<src2_vector_t>(src2_coord_.GetOffset(), is_src2_valid)};

            auto dst_vector_container = dst_vector_type{};

            // apply pointwise operation
            static_for<0, ScalarPerVector, 1>{}([&](auto i) {
                element_op_(dst_vector_container.template AsType<DstData>()(i),
                            src0_vector_container.template AsType<Src0Data>()[i],
                            src1_vector_container.template AsType<Src1Data>()[i],
                            src2_vector_container.template AsType<Src2Data>()[i]);
            });

            const bool is_dst_valid =
                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);

            // copy data from dst_vector into dst_buf
            dst_buf.template Transfer<DstInMemOp, dst_vector_t>(
                dst_coord_.GetOffset(),
                is_dst_valid,
                dst_vector_container.template AsType<dst_vector_t>()[I0]);

            // move coordinate
            if constexpr(idx_1d.value != num_accesses - 1)
            {
                constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);

                move_tensor_coordinate(
                    src0_desc, src0_coord_, make_tensor_coordinate_step(src0_desc, forward_step));
                move_tensor_coordinate(
                    src1_desc, src1_coord_, make_tensor_coordinate_step(src1_desc, forward_step));
                move_tensor_coordinate(
                    src2_desc, src2_coord_, make_tensor_coordinate_step(src2_desc, forward_step));
                move_tensor_coordinate(
                    dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
            }
        });

        // move coordinates back to slice origin (or not)
        if constexpr(Src0ResetCoordinateAfterRun)
        {
            const auto src0_reset_step =
                make_tensor_coordinate_step(src0_desc, GetCoordinateResetStep());

            move_tensor_coordinate(src0_desc, src0_coord_, src0_reset_step);
        }

        if constexpr(Src1ResetCoordinateAfterRun)
        {
            const auto src1_reset_step =
                make_tensor_coordinate_step(src1_desc, GetCoordinateResetStep());

            move_tensor_coordinate(src1_desc, src1_coord_, src1_reset_step);
        }

        if constexpr(Src2ResetCoordinateAfterRun)
        {
            const auto src2_reset_step =
                make_tensor_coordinate_step(src2_desc, GetCoordinateResetStep());

            move_tensor_coordinate(src2_desc, src2_coord_, src2_reset_step);
        }

        if constexpr(DstResetCoordinateAfterRun)
        {
            const auto dst_reset_step =
                make_tensor_coordinate_step(dst_desc, GetCoordinateResetStep());

            move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
        }
    }

    __device__ static constexpr auto GetCoordinateResetStep()
    {
        constexpr auto scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});

        using SpaceFillingCurve =
            SpaceFillingCurve<SliceLengths, DimAccessOrder, remove_cv_t<decltype(scalar_per_access)>>;

        constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();

        constexpr auto reset_step =
            SpaceFillingCurve::GetStepBetween(Number<num_accesses - 1>{}, Number<0>{});

        return reset_step;
    }

    // src0_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveSrc0SliceWindow(const Src0Desc& src0_desc,
                                        const Index& src0_slice_origin_step_idx)
    {
        // if the src0 coord was not reset by Run(), then the step needs to be adjusted here
        const auto adjusted_step_idx = Src0ResetCoordinateAfterRun
                                           ? src0_slice_origin_step_idx
                                           : src0_slice_origin_step_idx + GetCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(src0_desc, adjusted_step_idx);

        move_tensor_coordinate(src0_desc, src0_coord_, adjusted_step);
    }

    // src1_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveSrc1SliceWindow(const Src1Desc& src1_desc,
                                        const Index& src1_slice_origin_step_idx)
    {
        // if the src1 coord was not reset by Run(), then the step needs to be adjusted here
        const auto adjusted_step_idx = Src1ResetCoordinateAfterRun
                                           ? src1_slice_origin_step_idx
                                           : src1_slice_origin_step_idx + GetCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(src1_desc, adjusted_step_idx);

        move_tensor_coordinate(src1_desc, src1_coord_, adjusted_step);
    }

    // src2_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveSrc2SliceWindow(const Src2Desc& src2_desc,
                                        const Index& src2_slice_origin_step_idx)
    {
        // if the src2 coord was not reset by Run(), then the step needs to be adjusted here
        const auto adjusted_step_idx = Src2ResetCoordinateAfterRun
                                           ? src2_slice_origin_step_idx
                                           : src2_slice_origin_step_idx + GetCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(src2_desc, adjusted_step_idx);

        move_tensor_coordinate(src2_desc, src2_coord_, adjusted_step);
    }

    // dst_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
                                       const Index& dst_slice_origin_step_idx)
    {
        // if the dst coord was not reset by Run(), then the step needs to be adjusted here
        const auto adjusted_step_idx = DstResetCoordinateAfterRun
                                           ? dst_slice_origin_step_idx
                                           : dst_slice_origin_step_idx + GetCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);

        move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
    }

    private:
    Src0Coord src0_coord_;
    Src1Coord src1_coord_;
    Src2Coord src2_coord_;
    DstCoord dst_coord_;
    const ElementwiseOperation element_op_;
};

} // namespace ck
#endif
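All of the MoveSrc*SliceWindow/MoveDstSliceWindow ternaries encode one invariant: if Run() did not reset a coordinate, it is still parked at the last point on the curve, so the requested window step must fold in the step back to the origin. A 1-D integer sketch of that bookkeeping (illustrative values only):

#include <cassert>

int main()
{
    const int origin = 0, last_access = 6; // where Run() leaves the coordinate without a reset
    const int reset_step  = origin - last_access;
    const int window_step = 16;            // requested slice-window move

    const bool reset_after_run = false;
    const int coord    = reset_after_run ? origin : last_access;
    const int adjusted = reset_after_run ? window_step : window_step + reset_step;

    // Either way, the coordinate lands exactly one window ahead of the origin.
    assert(coord + adjusted == origin + window_step);
    return 0;
}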
composable_kernel/include/utility/tensor_space_filling_curve.hpp

@@ -1,3 +1,6 @@
+#ifndef TENSOR_SPACE_FILLING_CURVE_HPP
+#define TENSOR_SPACE_FILLING_CURVE_HPP
+
 #include "math.hpp"
 #include "sequence.hpp"
 #include "sequence_helper.hpp"
@@ -139,3 +142,4 @@ struct SpaceFillingCurve
 };

 } // namespace ck
+#endif
example/conv2d_fwd_xdl_sfcurve/conv2d_fwd_xdl_sfcurve.cpp

@@ -211,6 +211,10 @@ int main(int argc, char* argv[])
     in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
     wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
+    {
+        memset(out_n_k_ho_wo_device_result.mData.data(), static_cast<OutDataType>(0), out_n_k_ho_wo_device_result.mDesc.GetElementSpace());
+        out_device_buf.ToDevice(out_n_k_ho_wo_device_result.mData.data());
+    }

     // do GEMM
     auto conv = DeviceConvFwdInstance{};
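The added block gives the output buffer a known initial state before the kernel runs: the host tensor is zero-filled and uploaded, so whatever the destination in-memory operation does (in particular if it accumulates rather than overwrites, an assumption about this example's configuration) starts from zeros instead of uninitialized device memory. The same pattern in a self-contained host sketch:

#include <cstring>
#include <vector>

int main()
{
    std::vector<float> out(8, 1.0f);                        // pretend stale data
    std::memset(out.data(), 0, out.size() * sizeof(float)); // all-zero bytes are 0.0f for IEEE floats

    for(std::size_t i = 0; i < out.size(); ++i)
        out[i] += 2.0f; // stand-in for an accumulating kernel

    // out[i] == 2.0f only because the buffer was zeroed first.
    return 0;
}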