gaoqiong / composable_kernel · Commits

Commit 44ddcdcb, authored Jan 18, 2021 by Chao Liu

    adding vector load

Parent: c1ed17f8

Showing 8 changed files with 507 additions and 143 deletions (+507 -143)
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp            +167   -6
composable_kernel/include/tensor_description/multi_index.hpp                               +11    -0
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp    +252   -126
composable_kernel/include/utility/config.amd.hpp.in                                        +1     -1
composable_kernel/include/utility/float_type.amd.hpp.in                                    +73    -7
driver/include/device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp            +1     -1
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp    +1     -1
driver/src/conv_driver.cpp                                                                 +1     -1
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp (view file @ 44ddcdcb)

@@ -502,11 +502,154 @@ struct DynamicMerge

UpdateLowerIndex is renamed to UpdateLowerIndex_1, its idx_up_new parameter is commented out as unused, and the body now precomputes the decomposed index step and the carry/borrow thresholds before applying the Hack-dependent update:

    template <typename LowIdxDiff,
              typename UpIdxDiff,
              typename LowIdx,
              typename UpIdx,
              index_t Hack>
    __host__ __device__ void UpdateLowerIndex_1(LowIdxDiff& idx_diff_low,
                                                const UpIdxDiff& idx_diff_up,
                                                LowIdx& idx_low,
                                                const UpIdx& /* idx_up_new */,
                                                Number<Hack>) const
    {
        static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
                          LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
                      "wrong! inconsistent # of dimension");

        // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
        // However,
        //   1) If idx_diff_up is known at compile-time, then idx_diff_low_const
        //      can be calculated at compile-time.
        //   2) If idx_diff_up is not known at compile-time, but its value doesn't change
        //      during the whole kernel execution, then idx_diff_low_const also doesn't
        //      change during the whole kernel execution. Compiler-generated ISA should
        //      only calculate idx_diff_low_const once and save it during the whole
        //      kernel execution.
        // If neither 1) nor 2) is satisfied, then the calculation will be computed at
        // run-time each time this function is called, and can be very expensive.
        LowerIndex idx_diff_low_const;
        LowerIndex idx_low_length_minus_idx_diff_low_const;
        LowerIndex idx_low_length_plus_idx_diff_low_const;

#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
        index_t tmp = idx_diff_up[Number<0>{}];

        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
            idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
        });

        idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;

        static_for<0, NDimLow, 1>{}([&](auto i) {
            idx_low_length_minus_idx_diff_low_const(i) = low_lengths_[i] - idx_diff_low_const[i];
            idx_low_length_plus_idx_diff_low_const(i)  = low_lengths_[i] + idx_diff_low_const[i];
        });
#else
        // Hack: this forces the result into SGPRs. Need to make sure the result is
        // thread-invariant.
        index_t tmp = idx_diff_up[Number<0>{}];

        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
            idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
        });

        idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);

        static_for<0, NDimLow, 1>{}([&](auto i) {
            idx_low_length_minus_idx_diff_low_const(i) =
                __builtin_amdgcn_readfirstlane(low_lengths_[i] - idx_diff_low_const[i]);
            idx_low_length_plus_idx_diff_low_const(i) =
                __builtin_amdgcn_readfirstlane(low_lengths_[i] + idx_diff_low_const[i]);
        });
#endif

        if constexpr(Hack == 1)
        {
            // do carry check on each low dimension in reversed order
            // do not need to check the first dimension
            index_t carry = 0;

            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
                index_t idx_low_tmp = idx_low[i] + carry;

                bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];

                idx_diff_low(i) =
                    do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];

                idx_diff_low(i) += carry;

                carry = do_carry ? 1 : 0;
            });

            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;

            idx_low += idx_diff_low;
        }
        else if constexpr(Hack == 2)
        {
            // do borrow check on each low dimension in reversed order
            // do not need to check the first dimension
            index_t borrow = 0;

            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
                index_t idx_low_tmp = idx_low[i] - borrow;

                bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];

                idx_diff_low(i) =
                    do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low_const[i];

                idx_diff_low(i) -= borrow;

                borrow = do_borrow ? 1 : 0;
            });

            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - borrow;

            idx_low += idx_diff_low;
        }
        else
        {
            // do carry and borrow check on each low dimension in reversed order
            // do not need to check the first dimension
            index_t carry = 0;

            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
                index_t idx_low_tmp = idx_low[i] + carry;

                bool do_carry  = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
                bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];

                idx_diff_low(i) =
                    do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
                idx_diff_low(i) =
                    do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];

                idx_diff_low(i) += carry;

                carry = do_carry ? 1 : 0;
                carry = do_borrow ? -1 : carry;
            });

            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;

            idx_low += idx_diff_low;
        }
    }

    template <typename LowIdxDiff,
              typename UpIdxDiff,
              typename LowIdx,
              typename UpIdx,
              index_t Hack>
    __host__ __device__ void UpdateLowerIndex_2(LowIdxDiff& idx_diff_low,
                                                const UpIdxDiff& idx_diff_up,
                                                LowIdx& idx_low,
                                                const UpIdx& /* idx_up_new */,
                                                Number<Hack>) const
    {
        static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
                          LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
...
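The carry/borrow passes above are, in effect, a mixed-radix counter update: the upper-index step is decomposed into per-dimension increments, and overflow is propagated from the fastest dimension toward the slowest. A minimal host-side sketch of the Hack == 1 (carry-only) path, using plain arrays instead of LowerIndex/static_for/Number (all names here are illustrative, not from the library):

    #include <array>
    #include <cstdio>

    // Sketch: advance a 3-digit mixed-radix index (lengths {2, 3, 4}) by a
    // precomputed per-dimension increment, propagating carries from the last
    // dimension toward the first -- the same idea as UpdateLowerIndex_1 with
    // Hack == 1, minus the SGPR hack.
    int main()
    {
        const std::array<int, 3> low_lengths{2, 3, 4};
        std::array<int, 3> idx_low{0, 2, 3};                  // current lower index
        const std::array<int, 3> idx_diff_low_const{0, 0, 1}; // decomposed "+1" step

        std::array<int, 3> idx_diff_low{};
        int carry = 0;

        for(int i = 2; i > 0; --i)
        {
            const int idx_low_tmp = idx_low[i] + carry;
            const bool do_carry   = idx_low_tmp >= low_lengths[i] - idx_diff_low_const[i];

            idx_diff_low[i] = (do_carry ? -(low_lengths[i] - idx_diff_low_const[i])
                                        : idx_diff_low_const[i]) +
                              carry;
            carry = do_carry ? 1 : 0;
        }
        idx_diff_low[0] = idx_diff_low_const[0] + carry;

        for(int i = 0; i < 3; ++i)
            idx_low[i] += idx_diff_low[i];

        std::printf("%d %d %d\n", idx_low[0], idx_low[1], idx_low[2]); // prints "1 0 0"
    }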
@@ -611,6 +754,24 @@ struct DynamicMerge

A dispatching UpdateLowerIndex is added; it currently forwards to UpdateLowerIndex_1:

    template <typename LowIdxDiff,
              typename UpIdxDiff,
              typename LowIdx,
              typename UpIdx,
              index_t Hack>
    __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
                                              const UpIdxDiff& idx_diff_up,
                                              LowIdx& idx_low,
                                              const UpIdx& idx_up_new,
                                              Number<Hack>) const
    {
#if 1
        UpdateLowerIndex_1(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
#else
        UpdateLowerIndex_2(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
#endif
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return false; }

    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
...
@@ -624,7 +785,7 @@ struct DynamicMerge
    {
        return true;
    }
};

template <index_t NDimUp, bool Use24BitIntegerCalculation = false>
struct DynamicUnMerge
...
composable_kernel/include/tensor_description/multi_index.hpp (view file @ 44ddcdcb)

@@ -152,6 +152,17 @@ __host__ __device__ constexpr auto operator*(const Tuple<Xs...>& x, const Y& y)
    return r;
}

// MultiIndex = index_t * MultiIndex
template <typename... Xs>
__host__ __device__ constexpr auto operator*(index_t a, const Tuple<Xs...>& x)
{
    constexpr index_t NSize = sizeof...(Xs);

    Tuple<Xs...> r;

    static_for<0, NSize, 1>{}([&](auto i) { r(i) = a * x[i]; });

    return r;
}

#endif

} // namespace ck
#endif
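The new overload lets the scalar appear on the left of a MultiIndex product, mirroring the existing MultiIndex-times-scalar operator shown in the hunk header. A usage sketch (assumes the ck headers are included and the code sits inside namespace ck; values are illustrative):

    const auto idx = make_multi_index(1, 3, 4);

    const auto a = idx * 2; // existing overload: MultiIndex * scalar -> {2, 6, 8}
    const auto b = 2 * idx; // overload added in this commit: scalar * MultiIndex -> {2, 6, 8}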
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp (view file @ 44ddcdcb)

@@ -735,74 +735,148 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3

RunRead no longer iterates scalar-by-scalar over SliceLengths{}[0] x SliceLengths{}[1]; it now derives a per-dimension access count from SrcVectorDim / SrcScalarPerVector / SrcScalarStrideInVector and (behind preprocessor branches) prepares for vector loads:

    __device__ void RunRead(const SrcDesc& src_desc, const SrcData* p_src)
    {
        // hardcoded for 2D
        // TODO implement N-D
        static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2,
                      "wrong! hardcoded for 2D tensor");

        constexpr auto src_scalar_per_access = [&]() {
            Index src_scalar_per_access;

            static_for<0, nDim, 1>{}([&](auto i) {
                if constexpr(i == SrcVectorDim)
                {
                    src_scalar_per_access(i) = SrcScalarPerVector * SrcScalarStrideInVector;
                }
                else
                {
                    src_scalar_per_access(i) = 1;
                }
            });

            return src_scalar_per_access;
        }();

        constexpr auto src_scalar_step_in_vector = [&]() {
            Index src_scalar_step_in_vector;

            static_for<0, nDim, 1>{}([&](auto i) {
                if constexpr(i == SrcVectorDim)
                {
                    src_scalar_step_in_vector(i) = 1;
                }
                else
                {
                    src_scalar_step_in_vector(i) = 0;
                }
            });

            return src_scalar_step_in_vector;
        }();

        constexpr auto access_lengths = [&]() {
            Index access_lengths;

            static_for<0, nDim, 1>{}([&](auto i) {
                access_lengths(i) = SliceLengths{}[i] / src_scalar_per_access[i];
            });

            return access_lengths;
        }();

        // TODO use constexpr for coordinate-step to make sure compiler behaves correctly
        const auto src_step_0_p1 =
            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
        const auto src_step_0_m1 =
            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));
        const auto src_step_p1_0 =
            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
        const auto src_step_m1_0 =
            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};

        static_for<0, access_lengths[I0], 1>{}([&](auto iter0) {
            static_for<0, access_lengths[I1], 1>{}([&](auto iter1) {
                // step direction
                constexpr bool forward_dim1 = (iter0.value % 2 == 0);

                constexpr index_t i0 = iter0.value;
                constexpr index_t i1 =
                    forward_dim1 ? iter1.value : access_lengths[I1] - iter1.value - 1;

                // do work
                // hardcoding for buffer_load
                // TODO refactor transfer_data() to encapsulate this
                static_assert(SrcAddressSpace == AddressSpace::Global,
                              "wrong! hardcoded to use buffer_load, src must be global mem");

#if 1 // only works for SrcScalarPerVector == 1
                auto src_data = amd_buffer_load<SrcData, 1>(
                    p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());

                const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
                    src_desc, src_slice_origin_);

                constexpr index_t buffer_offset =
                    buffer_desc_.CalculateOffset(make_multi_index(i0, i1));

                buffer_(Number<buffer_offset>{}) = is_valid ? src_data : SrcData{0};
#elif 1 // only works for SrcScalarPerVector == 1
                auto src_data = amd_buffer_load<SrcData, 1>(
                    p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());

                const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
                    src_desc, src_slice_origin_);

                constexpr index_t buffer_offset =
                    buffer_desc_.CalculateOffset(make_multi_index(i0, i1) * src_scalar_per_access);

                buffer_(Number<buffer_offset>{}) = is_valid ? src_data : SrcData{0};
#else
                vector_type<SrcData, SrcScalarPerVector> src_vector;

                using SrcVectorType =
                    typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;

                src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
                    p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());

                const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
                    src_desc, src_slice_origin_);

                src_vector.Vector() = is_valid ? src_vector.Vector() : SrcVectorType{0};

                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
                        make_multi_index(i0, i1) * src_scalar_per_access +
                        i * src_scalar_step_in_vector);

                    // TODO: can buffer_ use vector access?
                    buffer_(Number<buffer_offset>{}) = src_vector[i];
                });
#endif

                // move dim1 iterator
                if constexpr(iter1.value < access_lengths[I1] - 1)
                {
                    if constexpr(forward_dim1)
                    {
                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_p1);
                    }
                    else
                    {
                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_m1);
                    }
                }
            });

            // move dim0 iterator
            if constexpr(iter0.value < access_lengths[I0] - 1)
            {
                move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0);
            }
        });

        // move src coordinate back to its slice origin
        if constexpr(SrcResetCoordinateAfterRun)
...
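The forward_dim1 flag above makes the loop walk the 2D access grid in snake order, so advancing to the next dim-0 row never requires resetting dim 1; every move is a single +/-1 coordinate step. A standalone sketch of the visit order it produces (plain C++, no ck types):

    #include <cstdio>

    // Sketch: visit a Len0 x Len1 access grid in snake order, i.e. the same
    // (i0, i1) sequence RunRead generates, so each move is a unit step.
    int main()
    {
        constexpr int Len0 = 3, Len1 = 4;

        for(int iter0 = 0; iter0 < Len0; ++iter0)
        {
            const bool forward_dim1 = (iter0 % 2 == 0);

            for(int iter1 = 0; iter1 < Len1; ++iter1)
            {
                const int i0 = iter0;
                const int i1 = forward_dim1 ? iter1 : Len1 - iter1 - 1;
                std::printf("(%d,%d) ", i0, i1);
            }
        }
        // prints: (0,0) (0,1) (0,2) (0,3) (1,3) (1,2) (1,1) (1,0) (2,0) (2,1) (2,2) (2,3)
        std::printf("\n");
    }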
@@ -893,13 +967,54 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3

RunRead_hack gains the same vector-access bookkeeping before its coordinate steps are chosen:

    __device__ void RunRead_hack(const SrcDesc& src_desc, const SrcData* p_src)
    {
        // hardcoded for 2D
        // TODO implement N-D
        static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2,
                      "wrong! hardcoded for 2D tensor");

        constexpr auto src_scalar_per_access = [&]() {
            Index src_scalar_per_access;

            static_for<0, nDim, 1>{}([&](auto i) {
                if constexpr(i == SrcVectorDim)
                {
                    src_scalar_per_access(i) = SrcScalarPerVector * SrcScalarStrideInVector;
                }
                else
                {
                    src_scalar_per_access(i) = 1;
                }
            });

            return src_scalar_per_access;
        }();

        constexpr auto src_scalar_step_in_vector = [&]() {
            Index src_scalar_step_in_vector;

            static_for<0, nDim, 1>{}([&](auto i) {
                if constexpr(i == SrcVectorDim)
                {
                    src_scalar_step_in_vector(i) = 1;
                }
                else
                {
                    src_scalar_step_in_vector(i) = 0;
                }
            });

            return src_scalar_step_in_vector;
        }();

        constexpr auto access_lengths = [&]() {
            Index access_lengths;

            static_for<0, nDim, 1>{}([&](auto i) {
                access_lengths(i) = SliceLengths{}[i] / src_scalar_per_access[i];
            });

            return access_lengths;
        }();

#if 0 // hack
        // TODO use constexpr for coordinate-step to make sure compiler behave correctly
        const auto src_step_0_p1 =
...
@@ -911,91 +1026,102 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3

The active coordinate-step hack is switched from the padded-input Sequences (11 hack flags) to the non-padded ones (7 hack flags), and the inner loop now runs over access_lengths, computes the buffer offset with src_scalar_per_access, and carries an (inactive) SrcScalarPerVector-wide vector path:

#elif 0
        // for padded input tensor
        const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>{});

        const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(0, -1), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>{});

        const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});

        const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(-1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
#elif 1
        // for non-padded input tensor
        const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 1>{});

        const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(0, -1), Sequence<0, 0, 0, 0, 0, 0, 2>{});

        const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(1, 0), Sequence<0, 0, 0, 0, 0, 1, 0>{});

        const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(-1, 0), Sequence<0, 0, 0, 0, 0, 2, 0>{});
#endif

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};

        static_for<0, access_lengths[I0], 1>{}([&](auto iter0) {
            static_for<0, access_lengths[I1], 1>{}([&](auto iter1) {
                // step direction
                constexpr bool forward_dim1 = (iter0.value % 2 == 0);

                constexpr index_t i0 = iter0.value;
                constexpr index_t i1 =
                    forward_dim1 ? iter1.value : access_lengths[I1] - iter1.value - 1;

                // do work
                // hardcoding for buffer_load
                // TODO refactor transfer_data() to encapsulate this
                static_assert(SrcAddressSpace == AddressSpace::Global,
                              "wrong! hardcoded to use buffer_load, src must be global mem");

#if 1 // only works for SrcScalarPerVector == 1
                auto src_data = amd_buffer_load<SrcData, 1>(
                    p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());

                const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
                    src_desc, src_slice_origin_);

                constexpr index_t buffer_offset =
                    buffer_desc_.CalculateOffset(make_multi_index(i0, i1) * src_scalar_per_access);

                buffer_(Number<buffer_offset>{}) = is_valid ? src_data : SrcData{0};
#elif 1
                vector_type<SrcData, SrcScalarPerVector> src_vector;

                using SrcVectorType =
                    typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;

                src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
                    p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());

                const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
                    src_desc, src_slice_origin_);

                src_vector.Vector() = is_valid ? src_vector.Vector() : SrcVectorType{0};

                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
                        make_multi_index(i0, i1) * src_scalar_per_access +
                        i * src_scalar_step_in_vector);

                    // TODO: can buffer_ use vector access?
                    buffer_(Number<buffer_offset>{}) = src_vector[i];
                });
#endif

                // move dim1 iterator
                if constexpr(iter1.value < access_lengths[I1] - 1)
                {
                    if constexpr(forward_dim1)
                    {
                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_p1);
                    }
                    else
                    {
                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_m1);
                    }
                }
            });

            // move dim0 iterator
            if constexpr(iter0.value < access_lengths[I0] - 1)
            {
                move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0);
            }
        });

        // move src coordinate back to its slice origin
        if constexpr(SrcResetCoordinateAfterRun)
...
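To make the bookkeeping at the top of RunRead / RunRead_hack concrete, here is the arithmetic the three constexpr lambdas perform, worked for a hypothetical configuration (the numbers are invented for illustration):

    #include <cstdio>

    // Hypothetical configuration: a 2D slice of SliceLengths = {4, 8},
    // vectorized along dim 1 with SrcScalarPerVector = 4 and
    // SrcScalarStrideInVector = 1.
    int main()
    {
        const int slice_lengths[2]            = {4, 8};
        const int src_vector_dim              = 1;
        const int src_scalar_per_vector       = 4;
        const int src_scalar_stride_in_vector = 1;

        int src_scalar_per_access[2], src_scalar_step_in_vector[2], access_lengths[2];

        for(int i = 0; i < 2; ++i)
        {
            src_scalar_per_access[i] =
                (i == src_vector_dim) ? src_scalar_per_vector * src_scalar_stride_in_vector : 1;
            src_scalar_step_in_vector[i] = (i == src_vector_dim) ? 1 : 0;
            access_lengths[i]            = slice_lengths[i] / src_scalar_per_access[i];
        }

        // prints: per_access {1 4}  step_in_vector {0 1}  access_lengths {4 2}
        // i.e. the 4x8 scalar slice is covered by 4x2 accesses of width 4.
        std::printf("per_access {%d %d}  step_in_vector {%d %d}  access_lengths {%d %d}\n",
                    src_scalar_per_access[0], src_scalar_per_access[1],
                    src_scalar_step_in_vector[0], src_scalar_step_in_vector[1],
                    access_lengths[0], access_lengths[1]);
    }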
@@ -1063,7 +1189,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3

The #elif guard for the padded-input adjusted_step is flipped from 1 to 0:

#if 0 // hack
        const auto adjusted_step = make_dynamic_tensor_coordinate_step(
            src_desc, adjusted_step_idx);
#elif 0
        // for padded input tensor
        const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
            src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{});
...
composable_kernel/include/utility/config.amd.hpp.in (view file @ 44ddcdcb)

@@ -87,7 +87,7 @@
 // thread-invariant, otherwise it's a bug
 // TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
 #ifndef CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
-#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 1
+#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
 #endif

 // workaround: put all workaround here
...
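When this macro is 1, DynamicMerge::UpdateLowerIndex_1 routes idx_diff_low_const through __builtin_amdgcn_readfirstlane, a clang AMDGPU builtin that broadcasts lane 0's value to the whole wavefront and thereby keeps the value in scalar registers; that is only correct when the value is already thread-invariant, which is what the comment above warns about. A minimal sketch of the pattern outside the library (hypothetical HIP device function):

    // Sketch: force a thread-invariant integer result into an SGPR.
    // Only valid because every lane computes the same x / divisor; otherwise
    // all lanes would silently receive lane 0's result.
    __device__ int divide_thread_invariant(int x, int divisor)
    {
    #if defined(__HIP_DEVICE_COMPILE__)
        return __builtin_amdgcn_readfirstlane(x / divisor);
    #else
        return x / divisor; // fallback for host or non-AMD compilation
    #endif
    }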
composable_kernel/include/utility/float_type.amd.hpp.in (view file @ 44ddcdcb)

@@ -182,11 +182,28 @@ struct vector_type<float, 1>

The static SetScalar interface is replaced by a data_ member with Size(), Vector() and element accessors:

{
    using MemoryType = float;

    float data_;

    __host__ __device__ static constexpr index_t Size() { return 1; }

    __host__ __device__ constexpr const auto& Vector() const { return data_; }

    __host__ __device__ constexpr auto& Vector() { return data_; }

    template <index_t I>
    __host__ __device__ constexpr const auto& operator[](Number<I>) const
    {
        static_assert(I == 0, "wrong!");
        return data_;
    }

    template <index_t I>
    __host__ __device__ constexpr auto& operator()(Number<I>)
    {
        static_assert(I == 0, "wrong!");
        return data_;
    }
};
...

@@ -222,13 +239,62 @@ struct vector_type<float, 4>

vector_type<float, 4> gets the same treatment, backed by a union of the packed vector and its scalars:

{
    using MemoryType = float4_t;

    union
    {
        float4_t v;
        float s0, s1, s2, s3;
    } data_;

    __host__ __device__ static constexpr index_t Size() { return 4; }

    __host__ __device__ constexpr const auto& Vector() const { return data_.v; }

    __host__ __device__ constexpr auto& Vector() { return data_.v; }

    template <index_t I>
    __host__ __device__ constexpr const auto& operator[](Number<I>) const
    {
        static_assert(I >= 0 && I < 4, "wrong!");

        if constexpr(I == 0)
        {
            return data_.s0;
        }
        else if constexpr(I == 1)
        {
            return data_.s1;
        }
        else if constexpr(I == 2)
        {
            return data_.s2;
        }
        else
        {
            return data_.s3;
        }
    }

    template <index_t I>
    __host__ __device__ constexpr auto& operator()(Number<I>)
    {
        static_assert(I >= 0 && I < 4, "wrong!");

        if constexpr(I == 0)
        {
            return data_.s0;
        }
        else if constexpr(I == 1)
        {
            return data_.s1;
        }
        else if constexpr(I == 2)
        {
            return data_.s2;
        }
        else
        {
            return data_.s3;
        }
    }
};
...
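These accessors are what the (currently disabled) vector path in RunRead relies on. A hedged usage sketch, assuming the ck headers are in scope inside a kernel; note that in this revision s0..s3 are all members of one union and therefore share storage, so the sketch only leans on Vector() and the accessor signatures:

    vector_type<float, 4> v;

    v.Vector() = float4_t{0};       // whole-vector assignment, e.g. the result of amd_buffer_load
    float s0   = v[Number<0>{}];    // read-only element access via operator[]
    v(Number<0>{}) = s0 + 1.0f;     // mutable element access via operator()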
driver/include/device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp (view file @ 44ddcdcb)

@@ -929,7 +929,7 @@ void device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 2;

     constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
-#elif 1
+#elif 0
     // cdata = 64, BlockSize = 64, 64x64x3
     constexpr index_t BlockSize = 64;
...
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp (view file @ 44ddcdcb)

@@ -201,7 +201,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
     printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

     constexpr auto conv_driver =
-#if 1
+#if 0
         DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
 #else
         DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
...
driver/src/conv_driver.cpp (view file @ 44ddcdcb)

@@ -22,7 +22,7 @@ int main(int argc, char* argv[])
 {
     using namespace ck;

-#if 0
+#if 1
     // 3x3, 35x35, stride 2
     constexpr index_t N = 128;
     constexpr index_t C = 192;
...