gaoqiong / composable_kernel · Commits · f2f35201

Commit f2f35201, authored Jan 11, 2021 by Chao Liu
Parent: fa479ce4

    add hack for merge transformation

Showing 8 changed files with 883 additions and 51 deletions (+883 -51)
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp        +219   -0
composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp             +91  -25
composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer.hpp +141   -0
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp                   +22  -22
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp +343  -0
driver/include/device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp        +34   -1
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp +32   -2
driver/src/conv_driver.cpp                                                              +1   -1
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp

@@ -64,6 +64,20 @@ struct DynamicPassThrough

        idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
    }

    template <typename LowIdxDiff,
              typename UpIdxDiff,
              typename LowIdx,
              typename UpIdx,
              index_t Hack>
    __host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
                                                                 const UpIdxDiff& idx_diff_up,
                                                                 const LowIdx& idx_low_old,
                                                                 const UpIdx& idx_up_old,
                                                                 Number<Hack>)
    {
        CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }

    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
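The _hack overloads added to DynamicPassThrough above, and to DynamicLeftPad and DynamicRightPad in the hunks that follow, all use the same pattern: the extra Number<Hack> argument is a compile-time tag, and transforms that never need a carry/borrow check simply ignore it and forward to the ordinary CalculateLowerIndexDiff. Below is a minimal standalone sketch of that tag-dispatch idiom in standard C++; it is illustrative only and not code from this commit, and the PassThroughLike type is a made-up stand-in.

    #include <cstdio>

    template <int N>
    struct Number
    {
        static constexpr int value = N;
    };

    struct PassThroughLike
    {
        // the "real" calculation
        static void CalculateLowerIndexDiff(int& low_diff, int up_diff) { low_diff = up_diff; }

        // hack overload: same behavior, compile-time tag ignored
        template <int Hack>
        static void CalculateLowerIndexDiff_hack(int& low_diff, int up_diff, Number<Hack>)
        {
            CalculateLowerIndexDiff(low_diff, up_diff);
        }
    };

    int main()
    {
        int low_diff = 0;
        PassThroughLike::CalculateLowerIndexDiff_hack(low_diff, 5, Number<1>{});
        std::printf("%d\n", low_diff); // prints 5
        return 0;
    }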
@@ -141,6 +155,20 @@ struct DynamicLeftPad

        idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
    }

    template <typename LowIdxDiff,
              typename UpIdxDiff,
              typename LowIdx,
              typename UpIdx,
              index_t Hack>
    __host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
                                                                 const UpIdxDiff& idx_diff_up,
                                                                 const LowIdx& idx_low_old,
                                                                 const UpIdx& idx_up_old,
                                                                 Number<Hack>)
    {
        CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }

    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
@@ -228,6 +256,20 @@ struct DynamicRightPad

        idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
    }

    template <typename LowIdxDiff,
              typename UpIdxDiff,
              typename LowIdx,
              typename UpIdx,
              index_t Hack>
    __host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
                                                                 const UpIdxDiff& idx_diff_up,
                                                                 const LowIdx& idx_low_old,
                                                                 const UpIdx& idx_up_old,
                                                                 Number<Hack>)
    {
        CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }

    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
@@ -324,6 +366,20 @@ struct DynamicEmbed

            [&](auto i) { idx_diff_low(Number<0>{}) += idx_diff_up[i] * coefficients_[i]; });
    }

    template <typename LowIdxDiff,
              typename UpIdxDiff,
              typename LowIdx,
              typename UpIdx,
              index_t Hack>
    __host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
                                                                    const UpIdxDiff& idx_diff_up,
                                                                    const LowIdx& idx_low_old,
                                                                    const UpIdx& idx_up_old,
                                                                    Number<Hack>) const
    {
        CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }

    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
@@ -489,6 +545,141 @@ struct DynamicMerge

        idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
    }

    // idx_diff_low depends on idx_low_old, so idx_low needs to be up-to-date
    // If idx_diff_up is known at compile-time, many calculations can be optimized
    // away by the compiler
    // This function assumes idx_low_old is not out-of-bound
    template <typename LowIdxDiff,
              typename UpIdxDiff,
              typename LowIdx,
              typename UpIdx,
              index_t Hack>
    __host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
                                                                    const UpIdxDiff& idx_diff_up,
                                                                    const LowIdx& idx_low_old,
                                                                    const UpIdx& /* idx_up_old */,
                                                                    Number<Hack>) const
    {
        static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
                          LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
                      "wrong! inconsistent # of dimension");

        // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
        // However,
        //   1) If idx_diff_up is known at compile-time, then idx_diff_low_const
        //      can be calculated at compile-time.
        //   2) If idx_diff_up is not known at compile-time, but its value
        //      doesn't change during the whole kernel execution, then
        //      idx_diff_low_const also doesn't change during the whole kernel
        //      execution. Compiler-generated ISA should only calculate
        //      idx_diff_low_const once and save it during the whole kernel execution.
        // If neither 1) nor 2) is satisfied, then the calculation will be
        // computed at run-time each time this function is called, and can be very expensive.
        LowerIndex idx_diff_low_const;

        index_t tmp = idx_diff_up[Number<0>{}];

        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
            idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
        });

        LowerIndex idx_low_length_minus_idx_diff_low_const;
        LowerIndex idx_low_length_plus_idx_diff_low_const;

#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
        idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;

        static_for<0, NDimLow, 1>{}([&](auto i) {
            idx_low_length_minus_idx_diff_low_const(i) = low_lengths_[i] - idx_diff_low_const[i];

            idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i];
        });
#else
        // Hack: this forces the result into an SGPR. Need to make sure the result is thread
        // invariant
        idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);

        static_for<0, NDimLow, 1>{}([&](auto i) {
            idx_low_length_minus_idx_diff_low_const(i) =
                __builtin_amdgcn_readfirstlane(low_lengths_[i] - idx_diff_low_const[i]);

            idx_low_length_plus_idx_diff_low_const(i) =
                __builtin_amdgcn_readfirstlane(low_lengths_[i] + idx_diff_low_const[i]);
        });
#endif

        if constexpr(Hack == 1)
        {
            // do carry check on each low dimension in reversed order
            // do not need to check the first dimension
            index_t carry = 0;

            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
                index_t idx_low_tmp = idx_low_old[i] + carry;

                bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];

                idx_diff_low(i) =
                    do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];

                idx_diff_low(i) += carry;

                carry = do_carry ? 1 : 0;
            });

            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
        }
        else if constexpr(Hack == 2)
        {
            // do borrow check on each low dimension in reversed order
            // do not need to check the first dimension
            index_t carry = 0;

            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
                index_t idx_low_tmp = idx_low_old[i] + carry;

                bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];

                idx_diff_low(i) =
                    do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];

                idx_diff_low(i) += carry;

                carry = do_borrow ? -1 : carry;
            });

            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
        }
        else
        {
            // do carry and borrow check on each low dimension in reversed order
            // do not need to check the first dimension
            index_t carry = 0;

            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
                index_t idx_low_tmp = idx_low_old[i] + carry;

                bool do_carry  = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
                bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];

                idx_diff_low(i) =
                    do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];

                idx_diff_low(i) =
                    do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];

                idx_diff_low(i) += carry;

                carry = do_carry ? 1 : 0;
                carry = do_borrow ? -1 : carry;
            });

            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
        }
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return false; }

    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
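A rough way to see what the Hack == 1 (carry-only) branch above computes: decompose the upper-index step into per-dimension constant diffs by integer division with the scan of the lower lengths, then sweep the lower dimensions from fastest to slowest and propagate a single carry. The readfirstlane path additionally pushes the loop-invariant constants through __builtin_amdgcn_readfirstlane so they land in scalar registers, which is only valid when, as the comment says, the value is the same across the wavefront. Below is a standalone host-side sketch in standard C++; it is not code from this commit, and the 2x3x4 lengths and starting index are made up for illustration.

    #include <array>
    #include <cstdio>

    int main()
    {
        constexpr int ndim_low = 3;
        const std::array<int, ndim_low> low_lengths      = {2, 3, 4};
        const std::array<int, ndim_low> low_lengths_scan = {12, 4, 1}; // strides of a 2x3x4 merge

        std::array<int, ndim_low> idx_low = {0, 2, 3}; // current lower index
        const int idx_diff_up = 5;                     // positive upper step => carry check only

        // decompose the upper diff (mirrors the static_for over low_lengths_scan_)
        std::array<int, ndim_low> diff_const{};
        int tmp = idx_diff_up;
        for(int i = 0; i < ndim_low - 1; ++i)
        {
            diff_const[i] = tmp / low_lengths_scan[i];
            tmp -= diff_const[i] * low_lengths_scan[i];
        }
        diff_const[ndim_low - 1] = tmp;

        // carry propagation, fastest dimension first (mirrors the Hack == 1 branch)
        std::array<int, ndim_low> diff{};
        int carry = 0;
        for(int i = ndim_low - 1; i > 0; --i)
        {
            const int idx_tmp   = idx_low[i] + carry;
            const bool do_carry = idx_tmp >= low_lengths[i] - diff_const[i];
            diff[i] = (do_carry ? -(low_lengths[i] - diff_const[i]) : diff_const[i]) + carry;
            carry   = do_carry ? 1 : 0;
        }
        diff[0] = diff_const[0] + carry;

        for(int i = 0; i < ndim_low; ++i)
            idx_low[i] += diff[i];

        // {0, 2, 3} advanced by 5 elements in a 2x3x4 space -> {1, 1, 0}
        std::printf("%d %d %d\n", idx_low[0], idx_low[1], idx_low[2]);
        return 0;
    }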
@@ -551,6 +742,20 @@ struct DynamicUnMerge

        CalculateLowerIndex(idx_diff_low, idx_diff_up);
    }

    template <typename LowIdxDiff,
              typename UpIdxDiff,
              typename LowIdx,
              typename UpIdx,
              index_t Hack>
    __host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
                                                                    const UpIdxDiff& idx_diff_up,
                                                                    const LowIdx& idx_low_old,
                                                                    const UpIdx& idx_up_old,
                                                                    Number<Hack>) const
    {
        CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }

    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
@@ -602,6 +807,20 @@ struct DynamicFreeze

        idx_diff_low(Number<0>{}) = index_t{Number<0>{}};
    }

    template <typename LowIdxDiff,
              typename UpIdxDiff,
              typename LowIdx,
              typename UpIdx,
              index_t Hack>
    __host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
                                                                 const UpIdxDiff& idx_diff_up,
                                                                 const LowIdx& idx_low_old,
                                                                 const UpIdx& idx_up_old,
                                                                 Number<Hack>)
    {
        CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }

    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp

@@ -9,30 +9,13 @@ namespace ck {

template <index_t NDimHidden, typename VisibleDimensionIds>
struct DynamicTensorCoordinate;

#if 0 // hack
template <index_t NTransform, index_t NDimVisible>
#else
template <index_t NTransform, index_t NDimVisible, typename HackCalculateLowerIndexDiff>
#endif
struct DynamicTensorCoordinateStep;

template <typename TensorDesc, typename VisibleIndex>
__host__ __device__ constexpr auto make_dynamic_tensor_coordinate(const TensorDesc& tensor_desc,
                                                                  const VisibleIndex& idx_visible);

template <typename TensorDesc, typename VisibleIndex>
__host__ __device__ constexpr auto
make_dynamic_tensor_coordinate_step(const TensorDesc&, const VisibleIndex& idx_diff_visible);

template <typename TensorDesc, typename TensorCoord, typename TensorCoordStep>
__host__ __device__ constexpr void move_dynamic_tensor_coordinate(const TensorDesc& tensor_desc,
                                                                  TensorCoord& coord,
                                                                  const TensorCoordStep& coord_step);

template <typename TensorDesc, typename TensorCoord>
__host__ __device__ constexpr bool
coordinate_has_valid_offset_assuming_visible_index_is_valid(const TensorDesc& tensor_desc,
                                                            const TensorCoord& coord);

template <typename TensorDesc, typename TensorCoord>
__host__ __device__ constexpr bool coordinate_has_valid_offset(const TensorDesc& tensor_desc,
                                                               const TensorCoord& coord);

// Transforms: Tuple<transforms...>
// LowerDimensionIdss : Tuple<Sequence<...>, ...>
// UpperDimensionIdss : Tuple<Sequence<...>, ...>
@@ -74,10 +57,9 @@ struct DynamicTensorDescriptor

    constexpr static index_t ndim_visible_ = GetNumOfVisibleDimension();
    constexpr static index_t ndim_hidden_  = GetNumOfHiddenDimension();

    using VisibleIndex = MultiIndex<ndim_visible_>;
    using HiddenIndex  = MultiIndex<ndim_hidden_>;

    using Coordinate     = DynamicTensorCoordinate<ndim_hidden_, VisibleDimensionIds>;
    using CoordinateStep = DynamicTensorCoordinateStep<ntransform_, ndim_visible_>;

    public:
    __host__ __device__ explicit constexpr DynamicTensorDescriptor(const Transforms& transforms,
@@ -211,7 +193,11 @@ struct DynamicTensorCoordinate

    HiddenIndex idx_hidden_;
};

#if 0 // hack
template <index_t NTransform, index_t NDimVisible>
#else
template <index_t NTransform, index_t NDimVisible, typename HackCalculateLowerIndexDiff>
#endif
struct DynamicTensorCoordinateStep
{
    // TODO make these private
@@ -234,6 +220,11 @@ struct DynamicTensorCoordinateStep

    const VisibleIndex idx_diff_visible_;
    const MultiIndex<NTransform> do_transforms_;

#if 1 // hack
    // HACK: control CalculateLowerIndexDiff for DynamicMerge using hack
    static constexpr HackCalculateLowerIndexDiff hack_calculate_lower_index_diff_;
#endif
};

// TODO: How to fix this? It uses a struct instead of lambda because lambda
@@ -406,7 +397,72 @@ make_dynamic_tensor_coordinate_step(const TensorDesc&, const VisibleIndex& idx_d

        set_container_subset(is_non_zero_diff, dims_low, non_zero_diff_pick_low);
    });

#if 0 // hack
    return DynamicTensorCoordinateStep<ntransform, ndim_visible>{idx_diff_visible, do_transforms};
#else
    return DynamicTensorCoordinateStep<ntransform,
                                       ndim_visible,
                                       typename uniform_sequence_gen<ntransform, 0>::type>{
        idx_diff_visible, do_transforms};
#endif
}

#if 0 // hack
template <typename TensorDesc, typename VisibleIndex>
#else
// HACK: control CalculateLowerIndexDiff for DynamicMerge using hack
template <typename TensorDesc, typename VisibleIndex, typename HackCalculateLowerIndexDiff>
#endif
__host__ __device__ constexpr auto make_dynamic_tensor_coordinate_step_hack(
    const TensorDesc&, const VisibleIndex& idx_diff_visible, HackCalculateLowerIndexDiff)
{
    static_assert(TensorDesc::GetNumOfDimension() == VisibleIndex::Size(),
                  "wrong! # of dimension inconsistent");

    constexpr index_t ntransform   = TensorDesc::GetNumOfTransform();
    constexpr index_t ndim_hidden  = TensorDesc::GetNumOfHiddenDimension();
    constexpr index_t ndim_visible = TensorDesc::GetNumOfVisibleDimension();
    constexpr auto visible_dim_ids = TensorDesc::GetVisibleDimensionIds();

    static_assert(HackCalculateLowerIndexDiff::Size() == ntransform, "wrong!");

    // use index_t for boolean type
    auto do_transforms    = make_zero_multi_index<ntransform>();
    auto is_non_zero_diff = make_zero_multi_index<ndim_hidden>();

    // decide do_transforms by checking for non-zero index diff components
    MultiIndex<VisibleIndex::Size()> non_zero_diff_pick_visible;

    static_for<0, ndim_visible, 1>{}(
        [&](auto i) { non_zero_diff_pick_visible(i) = (idx_diff_visible[i] != 0); });

    set_container_subset(is_non_zero_diff, visible_dim_ids, non_zero_diff_pick_visible);

    static_for<ntransform - 1, -1, -1>{}([&](auto itran) {
        constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran);
        constexpr auto dims_up  = TensorDesc::GetUpperDimensionIdss().At(itran);

        const auto non_zero_diff_pick_up = get_container_subset(is_non_zero_diff, dims_up);

        MultiIndex<dims_low.Size()> non_zero_diff_pick_low;

        // if any of the upper index diff components is non-zero, then
        //   1) this transform needs to be done
        //   2) all components of the lower index diff are assumed to be non-zero and need to be
        //      computed
        const bool idx_diff_up_has_non_zero = container_reduce(
            non_zero_diff_pick_up, [](auto a, auto b) constexpr { return a or b; }, false);

        do_transforms(itran) = idx_diff_up_has_non_zero;

        static_for<0, dims_low.Size(), 1>{}(
            [&](auto i) { non_zero_diff_pick_low(i) = idx_diff_up_has_non_zero; });

        set_container_subset(is_non_zero_diff, dims_low, non_zero_diff_pick_low);
    });

    return DynamicTensorCoordinateStep<ntransform, ndim_visible, HackCalculateLowerIndexDiff>{
        idx_diff_visible, do_transforms};
}

template <typename TensorDesc, typename TensorCoord, typename TensorCoordStep>
@@ -453,7 +509,17 @@ __host__ __device__ constexpr void move_dynamic_tensor_coordinate(const TensorDe

        MultiIndex<dims_low.Size()> idx_diff_low;

        // calculate idx_diff_low
#if 0 // hack
        tran.CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low, idx_up);
#else
        // HACK: control CalculateLowerIndexDiff for DynamicMerge using hack
        // TODO remove hack
        constexpr index_t Hack = decltype(coord_step.hack_calculate_lower_index_diff_)::At(itran);

        tran.CalculateLowerIndexDiff_hack(idx_diff_low, idx_diff_up, idx_low, idx_up, Number<Hack>{});
#endif

        // update idx_low
        idx_low += idx_diff_low;
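Taken together, the changes above mean a coordinate step now carries one hack value per transform, and move_dynamic_tensor_coordinate reads the itran-th entry at compile time and forwards it as a Number<> tag; in DynamicMerge, 1 selects the carry-only update, 2 the borrow-only update, and anything else the full carry-and-borrow check. Below is a standalone sketch of that control path in standard C++; the Sequence, Number, step, and MergeLike types are simplified stand-ins, not the CK classes.

    #include <cstdio>

    template <int N>
    struct Number
    {
        static constexpr int value = N;
    };

    template <int... Is>
    struct Sequence
    {
        static constexpr int data[sizeof...(Is)] = {Is...};
        static constexpr int At(int i) { return data[i]; }
    };

    template <typename HackSeq>
    struct CoordinateStepLike
    {
        // one entry per transform: 1 = carry only, 2 = borrow only, anything else = both checks
        using Hacks = HackSeq;
    };

    struct MergeLike
    {
        template <int Hack>
        static void CalculateLowerIndexDiff_hack(Number<Hack>)
        {
            if constexpr(Hack == 1)
                std::printf("carry-only update\n");
            else if constexpr(Hack == 2)
                std::printf("borrow-only update\n");
            else
                std::printf("carry-and-borrow update\n");
        }
    };

    template <int ITran, typename Step>
    void move_like(const Step&)
    {
        // mirrors: constexpr index_t Hack = decltype(coord_step.hack_...)::At(itran);
        constexpr int hack = Step::Hacks::At(ITran);
        MergeLike::CalculateLowerIndexDiff_hack(Number<hack>{});
    }

    int main()
    {
        CoordinateStepLike<Sequence<0, 1, 2>> step{}; // hypothetical 3-transform descriptor
        move_like<1>(step);                           // prints "carry-only update"
        return 0;
    }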
composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer.hpp

@@ -149,5 +149,146 @@ struct BlockwiseDynamicTensorSliceTransfer_v4

    ThreadwiseTransfer threadwise_transfer_;
};

// this version does the following things to avoid the scratch memory issue
// 1. Use StaticallyIndexedArray instead of C array for thread buffer
// 2. ThreadwiseDynamicTensorSliceTransfer_v3 does not keep reference to tensor descriptor
// 3. ThreadwiseDynamicTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
template <index_t BlockSize,
          InMemoryDataOperation DstInMemOp,
          typename BlockSliceLengths,
          typename ThreadSliceLengths,
          typename ThreadClusterLengths,
          typename ThreadClusterArrangeOrder,
          typename SrcData,
          typename DstData,
          typename SrcDesc,
          typename DstDesc,
          typename SrcDimAccessOrder,
          typename DstDimAccessOrder,
          index_t SrcVectorDim,
          index_t DstVectorDim,
          index_t SrcScalarPerVector,
          index_t DstScalarPerVector,
          AddressSpace SrcAddressSpace,
          AddressSpace DstAddressSpace,
          index_t SrcScalarStrideInVector,
          index_t DstScalarStrideInVector,
          index_t ThreadTransferSrcResetCoordinateAfterRun,
          index_t ThreadTransferDstResetCoordinateAfterRun>
struct BlockwiseDynamicTensorSliceTransfer_v4_hack
{
    static constexpr index_t nDim = remove_reference_t<SrcDesc>::GetNumOfDimension();

    using Index = MultiIndex<nDim>;

    __device__ constexpr BlockwiseDynamicTensorSliceTransfer_v4_hack(
        const SrcDesc& src_desc,
        const Index& src_block_slice_origin,
        const DstDesc& dst_desc,
        const Index& dst_block_slice_origin)
        : threadwise_transfer_(
              src_desc, make_zero_multi_index<nDim>(), dst_desc, make_zero_multi_index<nDim>())
    {
        static_assert(nDim == remove_reference_t<remove_cv_t<SrcDesc>>::GetNumOfDimension() &&
                          nDim == remove_reference_t<remove_cv_t<DstDesc>>::GetNumOfDimension() &&
                          nDim == BlockSliceLengths::Size() && nDim == ThreadSliceLengths::Size() &&
                          nDim == ThreadClusterLengths::Size() &&
                          nDim == ThreadClusterArrangeOrder::Size() &&
                          nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
                      "wrong! nDim not consistent");

        static_assert(
            is_same<BlockSliceLengths, decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
            "wrong! threads should be mapped to cover entire slicing window");

        static_assert(BlockSize >= thread_cluster_desc_.GetElementSize(),
                      "wrong! BlockSize too small");

        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
            const auto thread_cluster_id =
                thread_cluster_desc_.CalculateClusterIndex(get_thread_local_1d_id());

            const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};

            threadwise_transfer_.SetSrcSliceOrigin(src_desc,
                                                   src_block_slice_origin + thread_data_id_begin);
            threadwise_transfer_.SetDstSliceOrigin(dst_desc,
                                                   dst_block_slice_origin + thread_data_id_begin);
        }
    }

    __device__ static constexpr auto CalculateThreadDataBegin()
    {
        const auto thread_cluster_id =
            thread_cluster_desc_.CalculateClusterIndex(get_thread_local_1d_id());

        return thread_cluster_id * ThreadSliceLengths{};
    }

    __device__ void RunRead(const SrcDesc& src_desc, const SrcData* p_src)
    {
        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
            threadwise_transfer_.RunRead(src_desc, p_src);
        }
    }

    __device__ void RunWrite(const DstDesc& dst_desc, DstData* p_dst)
    {
        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
            threadwise_transfer_.RunWrite(dst_desc, p_dst);
        }
    }

    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step)
    {
        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
            threadwise_transfer_.MoveSrcSliceWindow(src_desc, step);
        }
    }

    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& step)
    {
        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
            threadwise_transfer_.MoveDstSliceWindow(dst_desc, step);
        }
    }

    static constexpr auto thread_cluster_desc_ =
        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});

    using ThreadwiseTransfer =
        ThreadwiseDynamicTensorSliceTransfer_v3_hack<ThreadSliceLengths,
                                                     DstInMemOp,
                                                     SrcData,
                                                     DstData,
                                                     SrcDesc,
                                                     DstDesc,
                                                     SrcDimAccessOrder,
                                                     DstDimAccessOrder,
                                                     SrcVectorDim,
                                                     DstVectorDim,
                                                     SrcScalarPerVector,
                                                     DstScalarPerVector,
                                                     SrcScalarStrideInVector,
                                                     DstScalarStrideInVector,
                                                     SrcAddressSpace,
                                                     DstAddressSpace,
                                                     ThreadTransferSrcResetCoordinateAfterRun,
                                                     ThreadTransferDstResetCoordinateAfterRun>;

    ThreadwiseTransfer threadwise_transfer_;
};

} // namespace ck
#endif
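Point 1 of the comment above (StaticallyIndexedArray instead of a C array for the thread buffer) matters because every buffer element is then addressed with a compile-time index, so the compiler can keep each element in its own register instead of spilling the array to scratch memory when it sees a runtime index. Below is a standalone sketch of that idea in standard C++; the 4-element, tuple-backed array is a simplified stand-in for CK's StaticallyIndexedArray, not the real class.

    #include <cstdio>
    #include <tuple>

    template <int N>
    struct Number
    {
        static constexpr int value = N;
    };

    // a toy statically indexed array: element I is only reachable with a compile-time Number<I>
    template <typename T>
    struct StaticallyIndexedArray4
    {
        std::tuple<T, T, T, T> data_;

        template <int I>
        T& operator()(Number<I>)
        {
            return std::get<I>(data_);
        }
    };

    int main()
    {
        StaticallyIndexedArray4<float> buf{};

        // fully unrolled, compile-time addressed accesses (no runtime indexing possible)
        buf(Number<0>{}) = 1.0f;
        buf(Number<3>{}) = 2.0f;

        std::printf("%f %f\n", buf(Number<0>{}), buf(Number<3>{}));
        return 0;
    }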
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp

@@ -166,28 +166,28 @@ struct GridwiseDynamicGemm_km_kn_mn_v1

The B-matrix blockwise copy is switched from BlockwiseDynamicTensorSliceTransfer_v4 to the new
BlockwiseDynamicTensorSliceTransfer_v4_hack:

        // B matrix blockwise copy
        auto b_blockwise_copy =
            BlockwiseDynamicTensorSliceTransfer_v4_hack<BlockSize,
                                                        InMemoryDataOperation::Set,
                                                        Sequence<KPerBlock, NPerBlock>,
                                                        BBlockTransferThreadSliceLengths_K_N,
                                                        BBlockTransferThreadClusterLengths_K_N,
                                                        BBlockTransferThreadClusterArrangeOrder,
                                                        Float,
                                                        Float,
                                                        decltype(b_k_n_global_desc),
                                                        decltype(b_k_n_block_desc),
                                                        BBlockTransferSrcAccessOrder,
                                                        Sequence<0, 1>,
                                                        BBlockTransferSrcVectorDim,
                                                        1,
                                                        BBlockTransferSrcScalarPerVector,
                                                        BBlockTransferDstScalarPerVector_N,
                                                        AddressSpace::Global,
                                                        AddressSpace::Lds,
                                                        1,
                                                        1,
                                                        BThreadTransferSrcResetCoordinateAfterRun,
                                                        true>(
                b_k_n_global_desc,
                make_multi_index(0, n_block_data_on_global),
                b_k_n_block_desc,
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp

@@ -646,5 +646,348 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3

    DstCoord dst_slice_origin_;
};

// this version does the following things to avoid "alloca" in LLVM-IR, which would cause scratch
// memory and sometimes useless instructions
// 1. It does not keep reference to tensor descriptor
// 2. It does not construct new tensor coordinate for this->Run()
// 3. It does not use pointer for VGPR thread buffer
// 4. It calculates offset for thread buffer directly, instead of moving the coordinate
template <typename SliceLengths,
          InMemoryDataOperation DstInMemOp,
          typename SrcData,
          typename DstData,
          typename SrcDesc,
          typename DstDesc,
          typename SrcDimAccessOrder,
          typename DstDimAccessOrder,
          index_t SrcVectorDim,
          index_t DstVectorDim,
          index_t SrcScalarPerVector,
          index_t DstScalarPerVector,
          index_t SrcScalarStrideInVector,
          index_t DstScalarStrideInVector,
          AddressSpace SrcAddressSpace,
          AddressSpace DstAddressSpace,
          bool SrcResetCoordinateAfterRun, // control whether to move back src coordinate after
                                           // each RunRead(), will be fused with MoveSrcSliceWindow
                                           // to save addr computation
          bool DstResetCoordinateAfterRun> // control whether to move back dst coordinate after
                                           // each RunWrite(), will be fused with MoveDstSliceWindow
                                           // to save addr computation
struct ThreadwiseDynamicTensorSliceTransfer_v3_hack
{
    static constexpr index_t nDim = SliceLengths::Size();

    using Index = MultiIndex<nDim>;

    using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{}));
    using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{}));

    using SrcCoordStep = decltype(make_dynamic_tensor_coordinate_step(SrcDesc{}, Index{}));
    using DstCoordStep = decltype(make_dynamic_tensor_coordinate_step(DstDesc{}, Index{}));

    __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v3_hack(
        const SrcDesc& src_desc,
        const Index& src_slice_origin,
        const DstDesc& dst_desc,
        const Index& dst_slice_origin)
        : src_slice_origin_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)),
          dst_slice_origin_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin))
    {
        static_assert(SrcAddressSpace == AddressSpace::Global or
                          SrcAddressSpace == AddressSpace::Lds,
                      "wrong!");
        static_assert(DstAddressSpace == AddressSpace::Global or
                          DstAddressSpace == AddressSpace::Lds,
                      "wrong!");
    }

    __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v3_hack()
        : ThreadwiseDynamicTensorSliceTransfer_v3_hack(
              SrcDesc{}, make_zero_multi_index<nDim>(), DstDesc{}, make_zero_multi_index<nDim>())
    {
    }

    __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
    {
        src_slice_origin_ = make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx);
    }

    __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
    {
        dst_slice_origin_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx);
    }

    __device__ void RunRead(const SrcDesc& src_desc, const SrcData* p_src)
    {
        static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2,
                      "wrong! hardcoded for 2D tensor");

        // hardcoded for 2D
        // TODO implement N-D
        if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2)
        {
#if 0 // hack
            // TODO use constexpr for coordinate-step to make sure compiler behave correctly
            const auto src_step_0_p1 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
            const auto src_step_0_m1 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));

            const auto src_step_p1_0 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
            const auto src_step_m1_0 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
#elif 0
            const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
                src_desc,
                make_multi_index(0, 1),
                Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});
            const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
                src_desc,
                make_multi_index(0, -1),
                Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});

            const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
                src_desc,
                make_multi_index(1, 0),
                Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});
            const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
                src_desc,
                make_multi_index(-1, 0),
                Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});
#elif 1
            // for padded input tensor
            const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
                src_desc,
                make_multi_index(0, 1),
                Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>{});
            const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
                src_desc,
                make_multi_index(0, -1),
                Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>{});

            const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
                src_desc,
                make_multi_index(1, 0),
                Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});
            const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
                src_desc,
                make_multi_index(-1, 0),
                Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
#elif 1
            // for non-padded input tensor
            const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
                src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 1>{});
            const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
                src_desc, make_multi_index(0, -1), Sequence<0, 0, 0, 0, 0, 0, 2>{});

            const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
                src_desc, make_multi_index(1, 0), Sequence<0, 0, 0, 0, 0, 1, 0>{});
            const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
                src_desc, make_multi_index(-1, 0), Sequence<0, 0, 0, 0, 0, 2, 0>{});
#endif

            constexpr index_t Len0 = SliceLengths{}[0];
            constexpr index_t Len1 = SliceLengths{}[1];

            static_for<0, Len0, 1>{}([&](auto iter0) {
                static_for<0, Len1, 1>{}([&](auto iter1) {
                    // step direction
                    constexpr bool forward_dim1 = (iter0.value % 2 == 0);

                    constexpr index_t i0 = iter0.value;
                    constexpr index_t i1 = forward_dim1 ? iter1.value : Len1 - iter1.value - 1;

                    // do work
                    constexpr index_t buffer_offset =
                        buffer_desc_.CalculateOffset(make_multi_index(i0, i1));

                    // hardcoding for buffer_load
                    // TODO refactor transfer_data() to encapsulate this
                    static_assert(SrcAddressSpace == AddressSpace::Global,
                                  "wrong! hardcoded to use buffer_load, src must be global mem");

                    buffer_(Number<buffer_offset>{}) = amd_buffer_load<SrcData, 1>(
                        p_src,
                        src_slice_origin_.GetOffset(),
                        coordinate_has_valid_offset_assuming_visible_index_is_valid(
                            src_desc, src_slice_origin_),
                        src_desc.GetElementSpaceSize());

                    // move dim1 iterator
                    if constexpr(iter1.value < Len1 - 1)
                    {
                        if constexpr(forward_dim1)
                        {
                            move_dynamic_tensor_coordinate(
                                src_desc, src_slice_origin_, src_step_0_p1);
                        }
                        else
                        {
                            move_dynamic_tensor_coordinate(
                                src_desc, src_slice_origin_, src_step_0_m1);
                        }
                    }
                });

                // move dim0 iterator
                if constexpr(iter0.value < Len0 - 1)
                {
                    move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0);
                }
            });
        }

        // move src coordinate back to its slice origin
        if constexpr(SrcResetCoordinateAfterRun)
        {
            const auto src_back_step =
                make_dynamic_tensor_coordinate_step(src_desc, GetCoordinateBackStep());

            move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_back_step);
        }
    }
    __device__ void RunWrite(const DstDesc& dst_desc, DstData* p_dst)
    {
        static_assert(remove_reference_t<DstDesc>::GetNumOfDimension() == 2,
                      "wrong! hardcoded for 2D tensor");

        // hardcoded for 2D
        // TODO implement N-D
        if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2)
        {
            // TODO use constexpr for coordinate-step to make sure compiler behave correctly
            const auto dst_step_0_p1 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
            const auto dst_step_0_m1 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));

            const auto dst_step_p1_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
            const auto dst_step_m1_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));

            constexpr index_t Len0 = SliceLengths{}[0];
            constexpr index_t Len1 = SliceLengths{}[1];

            static_for<0, Len0, 1>{}([&](auto iter0) {
                static_for<0, Len1, 1>{}([&](auto iter1) {
                    // step direction
                    constexpr bool forward_dim1 = (iter0.value % 2 == 0);

                    constexpr index_t i0 = iter0.value;
                    constexpr index_t i1 = forward_dim1 ? iter1.value : Len1 - iter1.value - 1;

                    // do work
                    constexpr index_t buffer_offset =
                        buffer_desc_.CalculateOffset(make_multi_index(i0, i1));

                    // hardcoding for ds_write
                    // TODO refactor transfer_data() to encapsulate this
                    static_assert(DstAddressSpace == AddressSpace::Lds &&
                                      DstInMemOp == InMemoryDataOperation::Set,
                                  "wrong! hardcoded for ds_write");

                    p_dst[dst_slice_origin_.GetOffset()] = buffer_[Number<buffer_offset>{}];

                    // move dim1 iterator
                    if constexpr(iter1.value < Len1 - 1)
                    {
                        if constexpr(forward_dim1)
                        {
                            move_dynamic_tensor_coordinate(
                                dst_desc, dst_slice_origin_, dst_step_0_p1);
                        }
                        else
                        {
                            move_dynamic_tensor_coordinate(
                                dst_desc, dst_slice_origin_, dst_step_0_m1);
                        }
                    }
                });

                // move dim0 iterator
                if constexpr(iter0.value < Len0 - 1)
                {
                    move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_p1_0);
                }
            });
        }

        // move dst coordinate back to its slice origin
        if constexpr(DstResetCoordinateAfterRun)
        {
            const auto dst_back_step =
                make_dynamic_tensor_coordinate_step(dst_desc, GetCoordinateBackStep());

            move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_back_step);
        }
    }

    __device__ static constexpr auto GetCoordinateBackStep()
    {
        MultiIndex<nDim> back_step;

        back_step(Number<0>{}) = 1 - SliceLengths{}[0];

        static_for<1, nDim, 1>{}([&](auto i) {
            back_step(i) = (SliceLengths{}[i - Number<1>{}] % 2 == 0) ? 0 : (1 - SliceLengths{}[i]);
        });

        return back_step;
    }

    // src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
                                       const Index& src_slice_origin_step_idx)
    {
        // if src coord was not reset by RunRead(), then need to adjust the step here
        const auto adjusted_step_idx = SrcResetCoordinateAfterRun
                                           ? src_slice_origin_step_idx
                                           : src_slice_origin_step_idx + GetCoordinateBackStep();

        // is it OK to construct a new step every time?
#if 0 // hack
        const auto adjusted_step = make_dynamic_tensor_coordinate_step(src_desc, adjusted_step_idx);
#elif 1
        // for padded input tensor
        const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
            src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{});
#elif 1
        // for non-padded input tensor
        const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
            src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 1, 2>{});
#endif

        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, adjusted_step);
    }

    // dst_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
                                       const Index& dst_slice_origin_step_idx)
    {
        // if dst coord was not reset by RunWrite(), then need to adjust the step here
        const auto adjusted_step_idx = DstResetCoordinateAfterRun
                                           ? dst_slice_origin_step_idx
                                           : dst_slice_origin_step_idx + GetCoordinateBackStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_dynamic_tensor_coordinate_step(dst_desc, adjusted_step_idx);

        move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, adjusted_step);
    }

    private:
    static constexpr auto buffer_desc_ =
        make_dynamic_naive_tensor_descriptor_packed<nDim>(to_multi_index(SliceLengths{}));

    static constexpr index_t buffer_size_ = buffer_desc_.GetElementSpaceSize();

    StaticallyIndexedArray<SrcData, buffer_size_> buffer_;

    SrcCoord src_slice_origin_;
    DstCoord dst_slice_origin_;
};

} // namespace ck
#endif
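RunRead() and RunWrite() above walk the 2D slice in a snake order (dim1 runs forward on even dim0 iterations and backward on odd ones), which is why GetCoordinateBackStep() only needs 1 - SliceLengths[0] in the first dimension and either 0 or 1 - SliceLengths[i] in the remaining ones, depending on the parity of the previous length. Below is a standalone sketch that checks this arithmetic for a 3x4 slice in standard C++; it is illustrative only and not code from this commit.

    #include <cstdio>

    int main()
    {
        constexpr int Len0 = 3;
        constexpr int Len1 = 4;

        int i0 = 0, i1 = 0;

        // snake traversal, mirroring RunRead()/RunWrite()
        for(int iter0 = 0; iter0 < Len0; ++iter0)
        {
            const bool forward_dim1 = (iter0 % 2 == 0);
            for(int iter1 = 0; iter1 < Len1 - 1; ++iter1)
                i1 += forward_dim1 ? 1 : -1;
            if(iter0 < Len0 - 1)
                i0 += 1;
        }

        // back step, mirroring GetCoordinateBackStep() for nDim == 2
        const int back0 = 1 - Len0;
        const int back1 = (Len0 % 2 == 0) ? 0 : (1 - Len1);

        std::printf("end = (%d, %d), after back step = (%d, %d)\n",
                    i0, i1, i0 + back0, i1 + back1);
        // prints: end = (2, 3), after back step = (0, 0)
        return 0;
    }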
driver/include/device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp

@@ -87,7 +87,40 @@ void device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,

The preprocessor guard at this point changes from "#elif 1" to "#elif 0", and a new
"128x128x2" configuration block is inserted ahead of the existing "128x128x4" block:

    constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN   = 1;
    constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
#elif 0
    // cdata = 64, BlockSize = 256, 128x128x2
    constexpr index_t BlockSize = 256;

    constexpr index_t GemmMPerBlock = 128;
    constexpr index_t GemmNPerBlock = 128;
    constexpr index_t GemmKPerBlock = 2;

    constexpr index_t GemmMPerThread = 4;
    constexpr index_t GemmNPerThread = 4;
    constexpr index_t GemmKPerThread = 1;

    constexpr index_t GemmMLevel0Cluster = 2;
    constexpr index_t GemmNLevel0Cluster = 2;
    constexpr index_t GemmMLevel1Cluster = 8;
    constexpr index_t GemmNLevel1Cluster = 8;

    constexpr index_t ThreadGemmDataPerReadM = 4;
    constexpr index_t ThreadGemmDataPerReadN = 4;

    using GemmABlockCopyThreadSliceLengths_GemmK_GemmM   = Sequence<1, 1>;
    using GemmABlockCopyThreadClusterLengths_GemmK_GemmM = Sequence<2, 128>;

    constexpr index_t GemmABlockCopySrcDataPerRead_GemmK  = 1;
    constexpr index_t GemmABlockCopyDstDataPerWrite_GemmM = 1;

    using GemmBBlockCopyThreadSliceLengths_GemmK_GemmN   = Sequence<1, 1>;
    using GemmBBlockCopyThreadClusterLengths_GemmK_GemmN = Sequence<2, 128>;

    constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN  = 1;
    constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;

    constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
#elif 0
    // cdata = 64, BlockSize = 256, 128x128x4
    constexpr index_t BlockSize = 256;
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp

@@ -53,7 +53,37 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc

The "#if 1" guard changes to "#if 0", and a new "128x128x2" configuration block is inserted
ahead of the existing "128x128x4" block:

    const auto in_left_pads  = to_multi_index(InLeftPads{});
    const auto in_right_pads = to_multi_index(InRightPads{});

#if 0
    // cdata = 64, BlockSize = 256, 128x128x2
    constexpr index_t BlockSize = 256;

    constexpr index_t GemmMPerBlock = 128;
    constexpr index_t GemmNPerBlock = 128;
    constexpr index_t GemmKPerBlock = 2;

    constexpr index_t GemmMPerThread = 4;
    constexpr index_t GemmNPerThread = 4;
    constexpr index_t GemmKPerThread = 1;

    constexpr index_t GemmMLevel0Cluster = 2;
    constexpr index_t GemmNLevel0Cluster = 2;
    constexpr index_t GemmMLevel1Cluster = 8;
    constexpr index_t GemmNLevel1Cluster = 8;

    using GemmABlockTransferThreadSliceLengths_GemmK_GemmM   = Sequence<1, 1>;
    using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<2, 128>;

    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK  = 1;
    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;

    using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN   = Sequence<1, 1>;
    using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<2, 128>;

    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;

    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#elif 0
    // cdata = 64, BlockSize = 256, 128x128x4
    constexpr index_t BlockSize = 256;

@@ -138,7 +168,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc

The driver-selection guard flips from "#if 0 // debug" to "#if 1 // debug", enabling the padded
driver:

    printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

    constexpr auto conv_driver =
#if 1 // debug
        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw
#else
        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
driver/src/conv_driver.cpp

@@ -22,7 +22,7 @@ int main(int argc, char* argv[])

The problem-selection guard flips from "#if 1" to "#if 0":

{
    using namespace ck;

#if 0
    // 3x3, 35x35, stride 2
    constexpr index_t N = 128;
    constexpr index_t C = 192;