Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
4e78d2fc
"...git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "666743302ff5bd1e02c204b81a80e566648d60de"
Commit
4e78d2fc
authored
Sep 11, 2020
by
Chao Liu
Browse files
adding dynamic tensor descriptor
parent
2ffa2708
Changes
14
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
789 additions
and
204 deletions
+789
-204
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp
...rnel/include/kernel_algorithm/dummy_dynamic_transform.hpp
+157
-14
composable_kernel/include/kernel_algorithm/dummy_static_transform.hpp
...ernel/include/kernel_algorithm/dummy_static_transform.hpp
+1
-1
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
...lude/tensor_description/dynamic_multi_index_transform.hpp
+2
-2
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper_v2.hpp
...ensor_description/dynamic_tensor_descriptor_helper_v2.hpp
+44
-36
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_v2.hpp
...clude/tensor_description/dynamic_tensor_descriptor_v2.hpp
+293
-132
composable_kernel/include/tensor_description/multi_index_transform.hpp
...rnel/include/tensor_description/multi_index_transform.hpp
+2
-2
composable_kernel/include/utility/array.hpp
composable_kernel/include/utility/array.hpp
+92
-9
composable_kernel/include/utility/array_helper.hpp
composable_kernel/include/utility/array_helper.hpp
+8
-6
composable_kernel/include/utility/common_header.hpp
composable_kernel/include/utility/common_header.hpp
+1
-0
composable_kernel/include/utility/functional4.hpp
composable_kernel/include/utility/functional4.hpp
+22
-0
composable_kernel/include/utility/sequence_helper.hpp
composable_kernel/include/utility/sequence_helper.hpp
+15
-0
composable_kernel/include/utility/tuple_helper.hpp
composable_kernel/include/utility/tuple_helper.hpp
+13
-0
driver/include/device_dummy_dynamic_transform.hpp
driver/include/device_dummy_dynamic_transform.hpp
+125
-0
driver/src/conv_driver.cpp
driver/src/conv_driver.cpp
+14
-2
No files found.
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp
View file @
4e78d2fc
...
@@ -3,14 +3,17 @@
...
@@ -3,14 +3,17 @@
#include "common_header.hpp"
#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_v2.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "dynamic_tensor_descriptor_helper_v2.hpp"
#include "dynamic_tensor_coordinate.hpp"
#include "dynamic_tensor_coordinate.hpp"
namespace
ck
{
namespace
ck
{
template
<
typename
WeiDesc
,
typename
InDesc
,
typename
OutDesc
>
__host__
__device__
constexpr
auto
__host__
__device__
constexpr
auto
map_convolution_into_gemm
(
const
DynamicNativeTensorDescriptor
<
4
>
wei_k_c_y_x_global_desc
,
map_convolution_into_gemm
(
const
WeiDesc
&
wei_k_c_y_x_global_desc
,
const
DynamicNativeTensorDescriptor
<
4
>
in_n_c_hi_wi_global_desc
,
const
InDesc
&
in_n_c_hi_wi_global_desc
,
const
DynamicNativeTensorDescriptor
<
4
>
out_n_k_ho_wo_global_desc
,
const
OutDesc
&
out_n_k_ho_wo_global_desc
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_left_pads
,
...
@@ -78,15 +81,88 @@ map_convolution_into_gemm(const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_glo
...
@@ -78,15 +81,88 @@ map_convolution_into_gemm(const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_glo
return
make_tuple
(
in_gemmk_gemmn_global_desc
);
return
make_tuple
(
in_gemmk_gemmn_global_desc
);
}
}
template
<
typename
WeiDesc
,
typename
InDesc
,
typename
OutDesc
>
__host__
__device__
constexpr
auto
map_convolution_into_gemm_v2
(
const
WeiDesc
&
wei_k_c_y_x_global_desc
,
const
InDesc
&
in_n_c_hi_wi_global_desc
,
const
OutDesc
&
out_n_k_ho_wo_global_desc
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_right_pads
)
{
const
index_t
N
=
in_n_c_hi_wi_global_desc
.
GetLength
(
0
);
const
index_t
C
=
in_n_c_hi_wi_global_desc
.
GetLength
(
1
);
const
index_t
K
=
out_n_k_ho_wo_global_desc
.
GetLength
(
1
);
const
index_t
Y
=
wei_k_c_y_x_global_desc
.
GetLength
(
2
);
const
index_t
X
=
wei_k_c_y_x_global_desc
.
GetLength
(
3
);
const
index_t
Hi
=
in_n_c_hi_wi_global_desc
.
GetLength
(
2
);
const
index_t
Wi
=
in_n_c_hi_wi_global_desc
.
GetLength
(
3
);
const
index_t
Ho
=
out_n_k_ho_wo_global_desc
.
GetLength
(
2
);
const
index_t
Wo
=
out_n_k_ho_wo_global_desc
.
GetLength
(
3
);
const
index_t
ConvStrideH
=
conv_strides
[
0
];
const
index_t
ConvStrideW
=
conv_strides
[
1
];
const
index_t
ConvDilationH
=
conv_dilations
[
0
];
const
index_t
ConvDilationW
=
conv_dilations
[
1
];
const
index_t
InLeftPadH
=
in_left_pads
[
0
];
const
index_t
InLeftPadW
=
in_left_pads
[
1
];
const
index_t
InRightPadH
=
in_right_pads
[
0
];
const
index_t
InRightPadW
=
in_right_pads
[
1
];
// input tensor
const
auto
in_n_c_hip_wip_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
transform_dynamic_tensor_descriptor_v2
(
in_n_c_hi_wi_global_desc
,
make_tuple
(
DynamicPassThrough
{
N
},
DynamicPassThrough
{
C
},
DynamicLeftPad
{
Hi
,
InLeftPadH
},
DynamicLeftPad
{
Wi
,
InLeftPadW
}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{})),
make_tuple
(
DynamicPassThrough
{
N
},
DynamicPassThrough
{
C
},
DynamicRightPad
{
Hi
+
InLeftPadH
,
InRightPadH
},
DynamicRightPad
{
Wi
+
InLeftPadW
,
InRightPadW
}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
const
index_t
Hip
=
in_n_c_hip_wip_global_desc
.
GetLength
(
2
);
const
index_t
Wip
=
in_n_c_hip_wip_global_desc
.
GetLength
(
3
);
const
auto
in_n_c_y_ho_x_wo_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
in_n_c_hip_wip_global_desc
,
make_tuple
(
DynamicPassThrough
{
N
},
DynamicPassThrough
{
C
},
DynamicEmbed
<
2
>
{{
Y
,
Ho
},
{
ConvDilationH
,
ConvStrideH
,
0
}},
DynamicEmbed
<
2
>
{{
X
,
Wo
},
{
ConvDilationW
,
ConvStrideW
,
0
}}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{},
Sequence
<
4
,
5
>
{}));
const
auto
in_gemmk_gemmn_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
in_n_c_y_ho_x_wo_global_desc
,
make_tuple
(
DynamicMerge
<
3
>
{{
C
,
Y
,
X
}},
DynamicMerge
<
3
>
{{
N
,
Ho
,
Wo
}}),
make_tuple
(
Sequence
<
1
,
2
,
4
>
{},
Sequence
<
0
,
3
,
5
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
return
make_tuple
(
in_gemmk_gemmn_global_desc
);
}
template
<
index_t
BlockSize
>
template
<
index_t
BlockSize
>
struct
DummyDynamicTransform
struct
DummyDynamicTransform
{
{
__device__
void
Run_v1
(
index_t
*
const
__restrict__
p_wei_global
,
template
<
typename
WeiDesc
,
typename
InDesc
,
typename
OutDesc
>
__device__
void
Run_v0
(
index_t
*
const
__restrict__
p_wei_global
,
float
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_out_global
,
float
*
const
__restrict__
p_out_global
,
const
DynamicNativeTensorDescriptor
<
4
>
wei_k_c_y_x_global_desc
,
const
WeiDesc
wei_k_c_y_x_global_desc
,
const
DynamicNativeTensorDescriptor
<
4
>
in_n_c_hi_wi_global_desc
,
const
InDesc
in_n_c_hi_wi_global_desc
,
const
DynamicNativeTensorDescriptor
<
4
>
out_n_k_ho_wo_global_desc
,
const
OutDesc
out_n_k_ho_wo_global_desc
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_left_pads
,
...
@@ -520,12 +596,13 @@ struct DummyDynamicTransform
...
@@ -520,12 +596,13 @@ struct DummyDynamicTransform
}
}
}
}
__device__
void
Run_v2
(
index_t
*
const
__restrict__
p_wei_global
,
template
<
typename
WeiDesc
,
typename
InDesc
,
typename
OutDesc
>
__device__
void
Run_v1
(
index_t
*
const
__restrict__
p_wei_global
,
float
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_out_global
,
float
*
const
__restrict__
p_out_global
,
const
DynamicNativeTensorDescriptor
<
4
>
wei_k_c_y_x_global_desc
,
const
WeiDesc
wei_k_c_y_x_global_desc
,
const
DynamicNativeTensorDescriptor
<
4
>
in_n_c_hi_wi_global_desc
,
const
InDesc
in_n_c_hi_wi_global_desc
,
const
DynamicNativeTensorDescriptor
<
4
>
out_n_k_ho_wo_global_desc
,
const
OutDesc
out_n_k_ho_wo_global_desc
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_left_pads
,
...
@@ -583,12 +660,78 @@ struct DummyDynamicTransform
...
@@ -583,12 +660,78 @@ struct DummyDynamicTransform
}
}
}
}
template
<
typename
WeiDesc
,
typename
InDesc
,
typename
OutDesc
>
__device__
void
Run_v2
(
index_t
*
const
__restrict__
p_wei_global
,
float
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_out_global
,
const
WeiDesc
wei_k_c_y_x_global_desc
,
const
InDesc
in_n_c_hi_wi_global_desc
,
const
OutDesc
out_n_k_ho_wo_global_desc
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_right_pads
)
const
{
const
auto
transformed_tensor_descs
=
map_convolution_into_gemm_v2
(
wei_k_c_y_x_global_desc
,
in_n_c_hi_wi_global_desc
,
out_n_k_ho_wo_global_desc
,
conv_strides
,
conv_dilations
,
in_left_pads
,
in_right_pads
);
const
auto
in_gemmk_gemmn_global_desc
=
transformed_tensor_descs
.
At
(
Number
<
0
>
{});
MultiIndex
<
2
>
idx
;
// initialize idx
for
(
index_t
i
=
0
;
i
<
2
;
++
i
)
{
idx
(
i
)
=
p_wei_global
[
get_thread_local_1d_id
()
+
i
];
}
const
index_t
niter
=
p_wei_global
[
10
];
auto
in_gemmk_gemmn_coord
=
make_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_global_desc
,
idx
);
constexpr
auto
in_gemmk_gemmn_coord_step
=
make_dynamic_tensor_coordinate_step_v2
(
in_gemmk_gemmn_global_desc
,
MultiIndex
<
2
>
{{
1
,
0
}});
for
(
index_t
iter
=
0
;
iter
<
niter
;
++
iter
)
{
move_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_global_desc
,
in_gemmk_gemmn_coord
,
in_gemmk_gemmn_coord_step
);
// write
float
value
=
1
;
transfer_data
<
float
,
1
,
AddressSpace
::
Vgpr
,
AddressSpace
::
Global
,
InMemoryDataOperation
::
Set
,
1
,
1
>
(
&
value
,
0
,
true
,
1
,
p_out_global
,
in_gemmk_gemmn_coord
.
GetOffset
(),
coordinate_has_valid_offset_assuming_visible_index_is_valid
(
in_gemmk_gemmn_global_desc
,
in_gemmk_gemmn_coord
),
in_gemmk_gemmn_global_desc
.
GetElementSpaceSize
());
}
}
template
<
typename
WeiDesc
,
typename
InDesc
,
typename
OutDesc
>
__device__
void
Run
(
index_t
*
const
__restrict__
p_wei_global
,
__device__
void
Run
(
index_t
*
const
__restrict__
p_wei_global
,
float
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_out_global
,
float
*
const
__restrict__
p_out_global
,
const
DynamicNativeTensorDescriptor
<
4
>
wei_k_c_y_x_global_desc
,
const
WeiDesc
wei_k_c_y_x_global_desc
,
const
DynamicNativeTensorDescriptor
<
4
>
in_n_c_hi_wi_global_desc
,
const
InDesc
in_n_c_hi_wi_global_desc
,
const
DynamicNativeTensorDescriptor
<
4
>
out_n_k_ho_wo_global_desc
,
const
OutDesc
out_n_k_ho_wo_global_desc
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_left_pads
,
...
...
composable_kernel/include/kernel_algorithm/dummy_static_transform.hpp
View file @
4e78d2fc
...
@@ -99,7 +99,7 @@ struct DummyStaticTransform
...
@@ -99,7 +99,7 @@ struct DummyStaticTransform
#pragma unroll 1
#pragma unroll 1
for
(
index_t
k
=
0
;
k
<
100
;
++
k
)
for
(
index_t
k
=
0
;
k
<
100
;
++
k
)
{
{
coord
+=
Array
<
index_t
,
2
>
{
8
,
0
};
coord
+=
Array
<
index_t
,
2
>
{
{
8
,
0
}
}
;
Float
value
=
1
;
Float
value
=
1
;
transfer_data
<
Float
,
transfer_data
<
Float
,
...
...
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
View file @
4e78d2fc
...
@@ -212,9 +212,9 @@ struct DynamicEmbed
...
@@ -212,9 +212,9 @@ struct DynamicEmbed
{
{
}
}
__host__
__device__
static
constexpr
index_t
GetNumOf
Upp
erDimension
()
{
return
1
;
}
__host__
__device__
static
constexpr
index_t
GetNumOf
Low
erDimension
()
{
return
1
;
}
__host__
__device__
static
constexpr
index_t
GetNumOf
Low
erDimension
()
{
return
NDimUp
;
}
__host__
__device__
static
constexpr
index_t
GetNumOf
Upp
erDimension
()
{
return
NDimUp
;
}
__host__
__device__
constexpr
auto
GetUpperLengths
()
const
{
return
up_lengths_
;
}
__host__
__device__
constexpr
auto
GetUpperLengths
()
const
{
return
up_lengths_
;
}
...
...
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper_v2.hpp
View file @
4e78d2fc
...
@@ -6,44 +6,52 @@
...
@@ -6,44 +6,52 @@
namespace
ck
{
namespace
ck
{
template
<
typename
LowerTensorDescriptor
,
template
<
index_t
N
>
typename
Transforms
,
typename
LowerVisibleDimensionLowerVisibleIdss
,
typename
UpperVisibleDimensionUpperVisibleIdss
>
__host__
__device__
constexpr
auto
__host__
__device__
constexpr
auto
transform_dynamic_tensor_descriptor_v2
(
const
LowerTensorDescriptor
&
low_tensor_desc
,
make_dynamic_native_tensor_descriptor_packed_v2
(
const
MultiIndex
<
N
>&
lengths
)
const
Transforms
&
transforms
,
LowerVisibleDimensionLowerVisibleIdss
,
UpperVisibleDimensionUpperVisibleIdss
)
{
{
// convert lower visible dimension idss (tuple of sequences) to hidden dimension idss (tuple of sequences)
constexpr
auto
low_visible_dimension_hidden_idss
=
transform_tuples
(
const
auto
transforms
=
make_tuple
(
DynamicUnMerge
<
N
>
{
lengths
});
// convert lower visible dimension ids (a sequence) to hidden dimension ids (a sequence)
constexpr
auto
low_dim_hidden_idss
=
make_tuple
(
Sequence
<
0
>
{});
[](
auto
low_visible_dim_ids
)
{
constexpr
auto
up_dim_hidden_idss
=
return
transform_sequences
(
make_tuple
(
typename
arithmetic_sequence_gen
<
1
,
N
+
1
,
1
>::
type
{});
// convert lower visible dimension id to hidden dimension id
constexpr
auto
visible_dim_hidden_ids
=
typename
arithmetic_sequence_gen
<
0
,
N
,
1
>::
type
{};
[](
auto
low_visible_dim_id
)
{
return
low_tensor_desc
.
GetVisibleDimensionIds
()[
low_visible_dim_id
];
const
index_t
element_space_size
=
},
reduce_on_array
(
lengths
,
math
::
multiplies
<
index_t
>
{},
index_t
{
1
});
low_visible_dim_ids
);
},
return
DynamicTensorDescriptor_v2
<
decltype
(
transforms
),
LowerVisibleDimensionLowerVisibleIdss
{});
decltype
(
low_dim_hidden_idss
),
decltype
(
up_dim_hidden_idss
),
constexpr
auto
up_visible_dims_
decltype
(
visible_dim_hidden_ids
)
>
{
transforms
,
element_space_size
};
const
auto
all_transforms
=
merge_tuples
(
old_tensor_desc
.
GetTransforms
(),
new_transforms
);
}
constexpr
auto
all_low_dim_idss
=
merge_tuples
(
old_tensor_desc
.
GetLowerDimensionIdss
(),
new_low_dim_idss
);
template
<
index_t
N
>
constexpr
auto
all_up_dim_idss
=
__host__
__device__
constexpr
auto
merge_tuples
(
old_tensor_desc
.
GetUpperDimensionIdss
(),
new_up_dim_idss
);
make_dynamic_native_tensor_descriptor_v2
(
const
MultiIndex
<
N
>&
lengths
,
const
MultiIndex
<
N
>&
strides
)
{
constexpr
auto
new_visible_dim_ids
=
new_up_dim_idss
const
auto
coefficients
=
strides
.
PushBack
(
index_t
{
0
});
return
DynamicTensorDescriptor_v2
<
decltype
(
all_transforms
),
const
auto
transforms
=
make_tuple
(
DynamicEmbed
<
N
>
{
lengths
,
coefficients
});
decltype
(
all_low_dim_idss
),
constexpr
auto
low_dim_hidden_idss
=
make_tuple
(
Sequence
<
0
>
{});
decltype
(
all_up_dim_idss
),
constexpr
auto
up_dim_hidden_idss
=
decltype
(
new_visible_dim_ids
)
>
{
make_tuple
(
typename
arithmetic_sequence_gen
<
1
,
N
+
1
,
1
>::
type
{});
all_transforms
,
old_tensor_desc
.
GetElementSpaceSize
()};
constexpr
auto
visible_dim_hidden_ids
=
typename
arithmetic_sequence_gen
<
0
,
N
,
1
>::
type
{};
index_t
element_space_size
=
1
;
#pragma unroll
for
(
index_t
i
=
0
;
i
<
N
;
++
i
)
{
element_space_size
+=
(
lengths
[
i
]
-
1
)
*
strides
[
i
];
}
return
DynamicTensorDescriptor_v2
<
decltype
(
transforms
),
decltype
(
low_dim_hidden_idss
),
decltype
(
up_dim_hidden_idss
),
decltype
(
visible_dim_hidden_ids
)
>
{
transforms
,
element_space_size
};
}
}
}
// namespace ck
}
// namespace ck
...
...
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_v2.hpp
View file @
4e78d2fc
This diff is collapsed.
Click to expand it.
composable_kernel/include/tensor_description/multi_index_transform.hpp
View file @
4e78d2fc
...
@@ -17,7 +17,7 @@ __host__ __device__ constexpr auto make_multi_index(Xs... xs)
...
@@ -17,7 +17,7 @@ __host__ __device__ constexpr auto make_multi_index(Xs... xs)
template
<
index_t
NSize
>
template
<
index_t
NSize
>
__host__
__device__
constexpr
auto
make_zero_multi_index
()
__host__
__device__
constexpr
auto
make_zero_multi_index
()
{
{
make_zero_array
<
index_t
,
NSize
>
();
return
make_zero_array
<
index_t
,
NSize
>
();
}
}
template
<
index_t
Length
>
template
<
index_t
Length
>
...
@@ -425,7 +425,7 @@ struct Embed
...
@@ -425,7 +425,7 @@ struct Embed
__host__
__device__
static
constexpr
auto
CalculateLowerIndex
(
const
UpperIndex
&
idx_up
)
__host__
__device__
static
constexpr
auto
CalculateLowerIndex
(
const
UpperIndex
&
idx_up
)
{
{
LowerIndex
idx_low
(
Coefficients
{}[
nDimUp
]
)
;
LowerIndex
idx_low
=
{
Coefficients
{}[
nDimUp
]
}
;
for
(
index_t
i
=
0
;
i
<
nDimUp
;
++
i
)
for
(
index_t
i
=
0
;
i
<
nDimUp
;
++
i
)
{
{
...
...
composable_kernel/include/utility/array.hpp
View file @
4e78d2fc
...
@@ -16,14 +16,22 @@ struct Array
...
@@ -16,14 +16,22 @@ struct Array
// TODO: implement empty Array
// TODO: implement empty Array
TData
mData
[
NSize
+
1
]
=
{
0
};
TData
mData
[
NSize
+
1
]
=
{
0
};
__host__
__device__
explicit
constexpr
Array
()
{}
#if 0
__host__ __device__ explicit constexpr Array(TData x)
template
<
typename
X
,
typename
...
Xs
>
: mData{x}
__host__
__device__
constexpr
Array
(
X
x
,
Xs
...
xs
)
{}
:
mData
{
static_cast
<
TData
>
(
x
),
static_cast
<
TData
>
(
xs
)...}
__host__ __device__ explicit constexpr Array()
: Array(TData{0})
{}
template <typename... Xs>
__host__ __device__ constexpr Array(Xs... xs)
: mData{static_cast<TData>(xs)...}
{
{
static_assert
(
sizeof
...(
Xs
)
+
1
==
NSize
,
"wrong! size"
);
static_assert(sizeof...(Xs) == NSize, "wrong! size");
}
}
#endif
__host__
__device__
static
constexpr
index_t
Size
()
{
return
NSize
;
}
__host__
__device__
static
constexpr
index_t
Size
()
{
return
NSize
;
}
...
@@ -63,13 +71,71 @@ struct Array
...
@@ -63,13 +71,71 @@ struct Array
}
}
template
<
typename
T
>
template
<
typename
T
>
__host__
__device__
constexpr
type
&
operator
=
(
const
T
&
x
)
__host__
__device__
constexpr
auto
operator
=
(
const
T
&
a
)
{
static_assert
(
T
::
Size
()
==
Size
(),
"wrong! size not the same"
);
static_for
<
0
,
Size
(),
1
>
{}([
&
](
auto
i
)
{
operator
()(
i
)
=
a
[
i
];
});
return
*
this
;
}
template
<
typename
T
>
__host__
__device__
constexpr
auto
operator
+=
(
const
T
&
a
)
{
static_assert
(
T
::
Size
()
==
Size
(),
"wrong! size not the same"
);
static_for
<
0
,
Size
(),
1
>
{}([
&
](
auto
i
)
{
operator
()(
i
)
+=
a
[
i
];
});
return
*
this
;
}
template
<
typename
T
>
__host__
__device__
constexpr
auto
operator
-=
(
const
T
&
a
)
{
{
static_for
<
0
,
Size
(),
1
>
{}([
&
](
auto
i
)
{
operator
()(
i
)
=
x
[
i
];
});
static_assert
(
T
::
Size
()
==
Size
(),
"wrong! size not the same"
);
static_for
<
0
,
Size
(),
1
>
{}([
&
](
auto
i
)
{
operator
()(
i
)
-=
a
[
i
];
});
return
*
this
;
return
*
this
;
}
}
template
<
typename
T
>
__host__
__device__
constexpr
auto
operator
+
(
const
T
&
a
)
const
{
static_assert
(
T
::
Size
()
==
Size
(),
"wrong! size not the same"
);
type
r
;
static_for
<
0
,
Size
(),
1
>
{}([
&
](
auto
i
)
{
r
(
i
)
=
operator
[](
i
)
+
a
[
i
];
});
return
r
;
}
template
<
typename
T
>
__host__
__device__
constexpr
auto
operator
-
(
const
T
&
a
)
const
{
static_assert
(
T
::
Size
()
==
Size
(),
"wrong! size not the same"
);
type
r
;
static_for
<
0
,
Size
(),
1
>
{}([
&
](
auto
i
)
{
r
(
i
)
=
operator
[](
i
)
-
a
[
i
];
});
return
r
;
}
template
<
typename
T
>
__host__
__device__
constexpr
auto
operator
*
(
const
T
&
a
)
const
{
static_assert
(
T
::
Size
()
==
Size
(),
"wrong! size not the same"
);
type
r
;
static_for
<
0
,
Size
(),
1
>
{}([
&
](
auto
i
)
{
r
(
i
)
=
operator
[](
i
)
*
a
[
i
];
});
return
r
;
}
struct
lambda_PushBack
// emulate constexpr lambda
struct
lambda_PushBack
// emulate constexpr lambda
{
{
const
Array
<
TData
,
NSize
>&
old_array
;
const
Array
<
TData
,
NSize
>&
old_array
;
...
@@ -150,13 +216,30 @@ struct ArrayElementPicker
...
@@ -150,13 +216,30 @@ struct ArrayElementPicker
}
}
template
<
typename
T
>
template
<
typename
T
>
__host__
__device__
constexpr
type
&
operator
=
(
const
T
&
a
)
__host__
__device__
constexpr
auto
operator
=
(
const
T
&
a
)
{
{
static_for
<
0
,
Size
(),
1
>
{}([
&
](
auto
i
)
{
operator
()(
i
)
=
a
[
i
];
});
static_for
<
0
,
Size
(),
1
>
{}([
&
](
auto
i
)
{
operator
()(
i
)
=
a
[
i
];
});
return
*
this
;
return
*
this
;
}
}
template
<
typename
T
>
__host__
__device__
constexpr
auto
operator
+=
(
const
T
&
a
)
{
static_for
<
0
,
Size
(),
1
>
{}([
&
](
auto
i
)
{
operator
()(
i
)
+=
a
[
i
];
});
return
*
this
;
}
template
<
typename
T
>
__host__
__device__
constexpr
auto
operator
-=
(
const
T
&
a
)
{
static_for
<
0
,
Size
(),
1
>
{}([
&
](
auto
i
)
{
operator
()(
i
)
-=
a
[
i
];
});
return
*
this
;
}
private:
Arr
&
mArray
;
Arr
&
mArray
;
};
};
...
...
composable_kernel/include/utility/array_helper.hpp
View file @
4e78d2fc
...
@@ -115,6 +115,7 @@ struct lambda_array_math
...
@@ -115,6 +115,7 @@ struct lambda_array_math
}
}
};
};
#if 0
// Array = Array + Array
// Array = Array + Array
template <typename TData, index_t NSize>
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData, NSize> b)
__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData, NSize> b)
...
@@ -210,6 +211,7 @@ __host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is.
...
@@ -210,6 +211,7 @@ __host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is.
return result;
return result;
}
}
#endif
// Array = Sequence - Array
// Array = Sequence - Array
template
<
typename
TData
,
index_t
NSize
,
index_t
...
Is
>
template
<
typename
TData
,
index_t
NSize
,
index_t
...
Is
>
...
@@ -242,15 +244,15 @@ __host__ __device__ constexpr auto operator*(TData v, Array<TData, NSize> a)
...
@@ -242,15 +244,15 @@ __host__ __device__ constexpr auto operator*(TData v, Array<TData, NSize> a)
return
result
;
return
result
;
}
}
template
<
typename
TData
,
index_t
NSize
,
typename
Reduce
>
template
<
typename
TData
,
typename
Arr
,
typename
Reduce
>
__host__
__device__
constexpr
TData
__host__
__device__
constexpr
TData
reduce_on_array
(
const
Arr
&
a
,
Reduce
f
,
TData
init
)
reduce_on_array
(
const
Array
<
TData
,
NSize
>&
a
,
Reduce
f
,
TData
init
)
{
{
TData
result
=
init
;
static_assert
(
is_same
<
typename
Arr
::
data_type
,
TData
>::
value
,
"wrong! different data type"
);
static_assert
(
Arr
::
Size
()
>
0
,
"wrong"
);
static_assert
(
NSize
>
0
,
"wrong"
)
;
TData
result
=
init
;
static_for
<
0
,
N
Size
,
1
>
{}([
&
](
auto
I
)
{
result
=
f
(
result
,
a
[
I
]);
});
static_for
<
0
,
Arr
::
Size
()
,
1
>
{}([
&
](
auto
I
)
{
result
=
f
(
result
,
a
[
I
]);
});
return
result
;
return
result
;
}
}
...
...
composable_kernel/include/utility/common_header.hpp
View file @
4e78d2fc
...
@@ -14,6 +14,7 @@
...
@@ -14,6 +14,7 @@
#include "math.hpp"
#include "math.hpp"
#include "number.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "sequence.hpp"
#include "sequence_helper.hpp"
#include "synchronization.hpp"
#include "synchronization.hpp"
#include "tuple.hpp"
#include "tuple.hpp"
#include "tuple_helper.hpp"
#include "tuple_helper.hpp"
...
...
composable_kernel/include/utility/functional4.hpp
View file @
4e78d2fc
...
@@ -22,6 +22,20 @@ struct unpack_impl<Sequence<Is...>>
...
@@ -22,6 +22,20 @@ struct unpack_impl<Sequence<Is...>>
}
}
};
};
template
<
typename
Seq0
,
typename
Seq1
>
struct
unpack2_impl
;
// TODO: remove this, after properly implementing unpack that takes any number of containers
template
<
index_t
...
Is
,
index_t
...
Js
>
struct
unpack2_impl
<
Sequence
<
Is
...
>
,
Sequence
<
Js
...
>>
{
template
<
typename
F
,
typename
X
,
typename
Y
>
__host__
__device__
constexpr
auto
operator
()(
F
f
,
const
X
&
x
,
const
Y
&
y
)
const
{
return
f
(
x
.
At
(
Number
<
Is
>
{})...,
y
.
At
(
Number
<
Js
>
{})...);
}
};
}
// namespace detail
}
// namespace detail
template
<
typename
F
,
typename
X
>
template
<
typename
F
,
typename
X
>
...
@@ -30,5 +44,13 @@ __host__ __device__ constexpr auto unpack(F f, const X& x)
...
@@ -30,5 +44,13 @@ __host__ __device__ constexpr auto unpack(F f, const X& x)
return
detail
::
unpack_impl
<
typename
arithmetic_sequence_gen
<
0
,
X
::
Size
(),
1
>::
type
>
{}(
f
,
x
);
return
detail
::
unpack_impl
<
typename
arithmetic_sequence_gen
<
0
,
X
::
Size
(),
1
>::
type
>
{}(
f
,
x
);
}
}
// TODO: properly implement unpack that takes any number of containers
template
<
typename
F
,
typename
X
,
typename
Y
>
__host__
__device__
constexpr
auto
unpack
(
F
f
,
const
X
&
x
,
const
Y
&
y
)
{
return
detail
::
unpack2_impl
<
typename
arithmetic_sequence_gen
<
0
,
X
::
Size
(),
1
>::
type
,
typename
arithmetic_sequence_gen
<
0
,
Y
::
Size
(),
1
>::
type
>
{}(
f
,
x
,
y
);
}
}
// namespace ck
}
// namespace ck
#endif
#endif
composable_kernel/include/utility/sequence_helper.hpp
0 → 100644
View file @
4e78d2fc
#ifndef CK_SEQUENCE_HELPER_HPP
#define CK_SEQUENCE_HELPER_HPP
#include "sequence_helper.hpp"
namespace
ck
{
template
<
typename
F
,
index_t
N
>
__host__
__device__
constexpr
auto
generate_sequence
(
F
,
Number
<
N
>
)
{
return
typename
sequence_gen
<
N
,
F
>::
type
{};
}
}
// namespace ck
#endif
composable_kernel/include/utility/tuple_helper.hpp
View file @
4e78d2fc
...
@@ -11,6 +11,19 @@ __host__ __device__ constexpr auto make_tuple(Xs&&... xs)
...
@@ -11,6 +11,19 @@ __host__ __device__ constexpr auto make_tuple(Xs&&... xs)
return
Tuple
<
remove_cv_t
<
remove_reference_t
<
Xs
>>
...
>
(
std
::
forward
<
Xs
>
(
xs
)...);
return
Tuple
<
remove_cv_t
<
remove_reference_t
<
Xs
>>
...
>
(
std
::
forward
<
Xs
>
(
xs
)...);
}
}
template
<
typename
F
,
index_t
N
>
__host__
__device__
constexpr
auto
generate_tuple
(
F
&&
f
,
Number
<
N
>
)
{
return
unpack
([
&
f
](
auto
&&
...
xs
)
{
return
make_tuple
(
f
(
xs
)...);
},
typename
arithmetic_sequence_gen
<
0
,
N
,
1
>::
type
{});
}
template
<
typename
...
Tuples
>
__host__
__device__
constexpr
auto
merge_tuples
(
Tuples
...
tuples
)
{
return
unpack
([
&
tuples
...](
auto
...
xs
)
{
return
make_tuple
(
xs
...);
},
tuples
...);
}
namespace
detail
{
namespace
detail
{
template
<
typename
F
,
typename
X
,
index_t
...
Is
>
template
<
typename
F
,
typename
X
,
index_t
...
Is
>
...
...
driver/include/device_dummy_dynamic_transform.hpp
View file @
4e78d2fc
...
@@ -138,3 +138,128 @@ void device_dummy_dynamic_transform(InDesc,
...
@@ -138,3 +138,128 @@ void device_dummy_dynamic_transform(InDesc,
out_nkhw_device_buf
.
FromDevice
(
out_nkhw
.
mData
.
data
());
out_nkhw_device_buf
.
FromDevice
(
out_nkhw
.
mData
.
data
());
}
}
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
,
class
ConvStrides
,
class
ConvDilations
,
class
InLeftPads
,
class
InRightPads
>
void
device_dummy_dynamic_transform_v2
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
WeiDesc
,
const
Tensor
<
T
>&
wei_kcyx
,
OutDesc
,
Tensor
<
T
>&
out_nkhw
,
ConvStrides
,
ConvDilations
,
InLeftPads
,
InRightPads
,
ck
::
index_t
nrepeat
)
{
using
namespace
ck
;
using
TDevice
=
typename
conditional
<
is_same
<
half_float
::
half
,
T
>::
value
,
half_t
,
T
>::
type
;
const
auto
in_nchw_desc
=
make_dynamic_native_tensor_descriptor_v2
(
to_array
(
InDesc
::
GetLengths
()),
to_array
(
InDesc
::
GetStrides
()));
const
auto
wei_kcyx_desc
=
make_dynamic_native_tensor_descriptor_v2
(
to_array
(
WeiDesc
::
GetLengths
()),
to_array
(
WeiDesc
::
GetStrides
()));
const
auto
out_nkhw_desc
=
make_dynamic_native_tensor_descriptor_v2
(
to_array
(
OutDesc
::
GetLengths
()),
to_array
(
OutDesc
::
GetStrides
()));
const
auto
conv_strides
=
to_array
(
ConvStrides
{});
const
auto
conv_dilations
=
to_array
(
ConvDilations
{});
const
auto
in_left_pads
=
to_array
(
InLeftPads
{});
const
auto
in_right_pads
=
to_array
(
InRightPads
{});
{
const
auto
tensor_descs
=
map_convolution_into_gemm_v2
(
wei_kcyx_desc
,
in_nchw_desc
,
out_nkhw_desc
,
conv_strides
,
conv_dilations
,
in_left_pads
,
in_right_pads
);
const
auto
in_gemmk_gemmn_global_desc
=
tensor_descs
.
At
(
Number
<
0
>
{});
auto
in_gemmk_gemmn_coord
=
make_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_global_desc
,
MultiIndex
<
2
>
{
0
,
0
});
const
auto
in_gemmk_gemmn_coord_step
=
make_dynamic_tensor_coordinate_step_v2
(
in_gemmk_gemmn_global_desc
,
MultiIndex
<
2
>
{
1
,
0
});
for
(
index_t
iter
=
0
;
iter
<
100
;
++
iter
)
{
constexpr
auto
gemmk1_gemmn0
=
MultiIndex
<
2
>
{
1
,
0
};
printf
(
"iter %d
\n
"
,
iter
);
print_array
(
"idx: "
,
in_gemmk_gemmn_coord
.
GetIndex
());
printf
(
"offset: %d
\n
"
,
in_gemmk_gemmn_coord
.
GetOffset
());
printf
(
"
\n
"
);
move_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_global_desc
,
in_gemmk_gemmn_coord
,
in_gemmk_gemmn_coord_step
);
}
}
std
::
size_t
data_sz
=
sizeof
(
T
);
DeviceMem
in_nchw_device_buf
(
data_sz
*
in_nchw
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_kcyx_device_buf
(
data_sz
*
wei_kcyx
.
mDesc
.
GetElementSpace
());
DeviceMem
out_nkhw_device_buf
(
data_sz
*
out_nkhw
.
mDesc
.
GetElementSpace
());
in_nchw_device_buf
.
ToDevice
(
in_nchw
.
mData
.
data
());
wei_kcyx_device_buf
.
ToDevice
(
wei_kcyx
.
mData
.
data
());
out_nkhw_device_buf
.
ToDevice
(
out_nkhw
.
mData
.
data
());
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
GridSize
=
1
;
printf
(
"%s: BlockSize %u, GridSize %u
\n
"
,
__func__
,
BlockSize
,
GridSize
);
using
dummy_transform
=
DummyDynamicTransform
<
BlockSize
>
;
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
{
std
::
cout
<<
"Start running "
<<
nrepeat
<<
" times..."
<<
std
::
endl
;
KernelTimer
timer
;
timer
.
Start
();
for
(
index_t
j
=
0
;
j
<
nrepeat
;
++
j
)
{
launch_kernel
(
run_gridwise_operation
<
dummy_transform
,
index_t
*
const
,
float
*
const
,
float
*
const
,
const
decltype
(
wei_kcyx_desc
),
const
decltype
(
in_nchw_desc
),
const
decltype
(
out_nkhw_desc
),
const
Array
<
index_t
,
2
>
,
const
Array
<
index_t
,
2
>
,
const
Array
<
index_t
,
2
>
,
const
Array
<
index_t
,
2
>>
,
dim3
(
GridSize
),
dim3
(
BlockSize
),
0
,
0
,
static_cast
<
index_t
*>
(
wei_kcyx_device_buf
.
GetDeviceBuffer
()),
static_cast
<
float
*>
(
in_nchw_device_buf
.
GetDeviceBuffer
()),
static_cast
<
float
*>
(
out_nkhw_device_buf
.
GetDeviceBuffer
()),
wei_kcyx_desc
,
in_nchw_desc
,
out_nkhw_desc
,
conv_strides
,
conv_dilations
,
in_left_pads
,
in_right_pads
);
}
}
out_nkhw_device_buf
.
FromDevice
(
out_nkhw
.
mData
.
data
());
}
driver/src/conv_driver.cpp
View file @
4e78d2fc
...
@@ -560,7 +560,7 @@ int main(int argc, char* argv[])
...
@@ -560,7 +560,7 @@ int main(int argc, char* argv[])
LeftPads{},
LeftPads{},
RightPads{},
RightPads{},
nrepeat);
nrepeat);
#elif
0
#elif
1
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw
(
in_nchw_desc
,
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw
(
in_nchw_desc
,
in_nchw
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx_desc
,
...
@@ -584,7 +584,7 @@ int main(int argc, char* argv[])
...
@@ -584,7 +584,7 @@ int main(int argc, char* argv[])
LeftPads
{},
LeftPads
{},
RightPads
{},
RightPads
{},
nrepeat
);
nrepeat
);
#elif
1
#elif
0
device_dummy_dynamic_transform
(
in_nchw_desc
,
device_dummy_dynamic_transform
(
in_nchw_desc
,
in_nchw
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx_desc
,
...
@@ -596,6 +596,18 @@ int main(int argc, char* argv[])
...
@@ -596,6 +596,18 @@ int main(int argc, char* argv[])
LeftPads
{},
LeftPads
{},
RightPads
{},
RightPads
{},
nrepeat
);
nrepeat
);
#elif 1
device_dummy_dynamic_transform_v2
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
ConvStrides
{},
ConvDilations
{},
LeftPads
{},
RightPads
{},
nrepeat
);
#endif
#endif
if
(
do_verification
)
if
(
do_verification
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment