Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
fc7a6c85
Commit
fc7a6c85
authored
Sep 23, 2020
by
Chao Liu
Browse files
refactor
parent
f3099df8
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
662 additions
and
342 deletions
+662
-342
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform_v1.hpp
...l/include/kernel_algorithm/dummy_dynamic_transform_v1.hpp
+21
-307
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform_v2.hpp
...l/include/kernel_algorithm/dummy_dynamic_transform_v2.hpp
+341
-0
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_v2.hpp
...clude/tensor_description/dynamic_tensor_descriptor_v2.hpp
+27
-0
driver/include/device_dummy_dynamic_transform_v1.hpp
driver/include/device_dummy_dynamic_transform_v1.hpp
+140
-0
driver/include/device_dummy_dynamic_transform_v2.hpp
driver/include/device_dummy_dynamic_transform_v2.hpp
+109
-0
driver/src/conv_driver.cpp
driver/src/conv_driver.cpp
+24
-35
No files found.
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp
→
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform
_v1
.hpp
View file @
fc7a6c85
#ifndef CK_DUMMY_DYNAMIC_TRANSFORM_HPP
#ifndef CK_DUMMY_DYNAMIC_TRANSFORM_
V1_
HPP
#define CK_DUMMY_DYNAMIC_TRANSFORM_HPP
#define CK_DUMMY_DYNAMIC_TRANSFORM_
V1_
HPP
#include "common_header.hpp"
#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_v2.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "dynamic_tensor_descriptor_helper_v2.hpp"
#include "dynamic_tensor_coordinate.hpp"
#include "dynamic_tensor_coordinate.hpp"
namespace
ck
{
namespace
ck
{
template
<
typename
WeiDesc
,
typename
InDesc
,
typename
OutDesc
>
template
<
typename
WeiDesc
,
typename
InDesc
,
typename
OutDesc
>
__host__
__device__
constexpr
auto
__host__
__device__
constexpr
auto
map_convolution_into_gemm
(
const
WeiDesc
&
wei_k_c_y_x_global_desc
,
map_convolution_into_gemm
_v1
(
const
WeiDesc
&
wei_k_c_y_x_global_desc
,
const
InDesc
&
in_n_c_hi_wi_global_desc
,
const
InDesc
&
in_n_c_hi_wi_global_desc
,
const
OutDesc
&
out_n_k_ho_wo_global_desc
,
const
OutDesc
&
out_n_k_ho_wo_global_desc
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_strides
,
...
@@ -82,84 +81,11 @@ map_convolution_into_gemm(const WeiDesc& wei_k_c_y_x_global_desc,
...
@@ -82,84 +81,11 @@ map_convolution_into_gemm(const WeiDesc& wei_k_c_y_x_global_desc,
return
make_tuple
(
in_gemmk_gemmn_global_desc
);
return
make_tuple
(
in_gemmk_gemmn_global_desc
);
}
}
template
<
typename
WeiDesc
,
typename
InDesc
,
typename
OutDesc
>
__host__
__device__
constexpr
auto
map_convolution_into_gemm_v2
(
const
WeiDesc
&
wei_k_c_y_x_global_desc
,
const
InDesc
&
in_n_c_hi_wi_global_desc
,
const
OutDesc
&
out_n_k_ho_wo_global_desc
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_right_pads
)
{
const
index_t
N
=
in_n_c_hi_wi_global_desc
.
GetLength
(
0
);
const
index_t
C
=
in_n_c_hi_wi_global_desc
.
GetLength
(
1
);
const
index_t
K
=
out_n_k_ho_wo_global_desc
.
GetLength
(
1
);
const
index_t
Y
=
wei_k_c_y_x_global_desc
.
GetLength
(
2
);
const
index_t
X
=
wei_k_c_y_x_global_desc
.
GetLength
(
3
);
const
index_t
Hi
=
in_n_c_hi_wi_global_desc
.
GetLength
(
2
);
const
index_t
Wi
=
in_n_c_hi_wi_global_desc
.
GetLength
(
3
);
const
index_t
Ho
=
out_n_k_ho_wo_global_desc
.
GetLength
(
2
);
const
index_t
Wo
=
out_n_k_ho_wo_global_desc
.
GetLength
(
3
);
const
index_t
ConvStrideH
=
conv_strides
[
0
];
const
index_t
ConvStrideW
=
conv_strides
[
1
];
const
index_t
ConvDilationH
=
conv_dilations
[
0
];
const
index_t
ConvDilationW
=
conv_dilations
[
1
];
const
index_t
InLeftPadH
=
in_left_pads
[
0
];
const
index_t
InLeftPadW
=
in_left_pads
[
1
];
const
index_t
InRightPadH
=
in_right_pads
[
0
];
const
index_t
InRightPadW
=
in_right_pads
[
1
];
// input tensor
const
auto
in_n_c_hip_wip_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
transform_dynamic_tensor_descriptor_v2
(
in_n_c_hi_wi_global_desc
,
make_tuple
(
DynamicPassThrough
{
N
},
DynamicPassThrough
{
C
},
DynamicLeftPad
{
Hi
,
InLeftPadH
},
DynamicLeftPad
{
Wi
,
InLeftPadW
}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{})),
make_tuple
(
DynamicPassThrough
{
N
},
DynamicPassThrough
{
C
},
DynamicRightPad
{
Hi
+
InLeftPadH
,
InRightPadH
},
DynamicRightPad
{
Wi
+
InLeftPadW
,
InRightPadW
}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
const
index_t
Hip
=
in_n_c_hip_wip_global_desc
.
GetLength
(
2
);
const
index_t
Wip
=
in_n_c_hip_wip_global_desc
.
GetLength
(
3
);
const
auto
in_n_c_y_ho_x_wo_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
in_n_c_hip_wip_global_desc
,
make_tuple
(
DynamicPassThrough
{
N
},
DynamicPassThrough
{
C
},
DynamicEmbed
<
2
>
{{
Y
,
Ho
},
{
ConvDilationH
,
ConvStrideH
,
0
}},
DynamicEmbed
<
2
>
{{
X
,
Wo
},
{
ConvDilationW
,
ConvStrideW
,
0
}}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{},
Sequence
<
4
,
5
>
{}));
const
auto
in_gemmk_gemmn_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
in_n_c_y_ho_x_wo_global_desc
,
make_tuple
(
DynamicMerge
<
3
>
{
MultiIndex
<
3
>
{{
C
,
Y
,
X
}}},
DynamicMerge
<
3
>
{
MultiIndex
<
3
>
{{
N
,
Ho
,
Wo
}}}),
make_tuple
(
Sequence
<
1
,
2
,
4
>
{},
Sequence
<
0
,
3
,
5
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
return
make_tuple
(
in_gemmk_gemmn_global_desc
);
}
template
<
index_t
BlockSize
>
template
<
index_t
BlockSize
>
struct
DummyDynamicTransform_1
struct
DummyDynamicTransform_
v
1
{
{
template
<
typename
WeiDesc
,
typename
InDesc
,
typename
OutDesc
>
template
<
typename
WeiDesc
,
typename
InDesc
,
typename
OutDesc
>
__device__
void
Run_
0
(
index_t
*
const
__restrict__
p_wei_global
,
__device__
void
Run_
1
(
index_t
*
const
__restrict__
p_wei_global
,
float
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_out_global
,
float
*
const
__restrict__
p_out_global
,
const
WeiDesc
wei_k_c_y_x_global_desc
,
const
WeiDesc
wei_k_c_y_x_global_desc
,
...
@@ -599,7 +525,7 @@ struct DummyDynamicTransform_1
...
@@ -599,7 +525,7 @@ struct DummyDynamicTransform_1
}
}
template
<
typename
WeiDesc
,
typename
InDesc
,
typename
OutDesc
>
template
<
typename
WeiDesc
,
typename
InDesc
,
typename
OutDesc
>
__device__
void
Run_
1
(
index_t
*
const
__restrict__
p_wei_global
,
__device__
void
Run_
2
(
index_t
*
const
__restrict__
p_wei_global
,
float
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_out_global
,
float
*
const
__restrict__
p_out_global
,
const
WeiDesc
wei_k_c_y_x_global_desc
,
const
WeiDesc
wei_k_c_y_x_global_desc
,
...
@@ -610,7 +536,8 @@ struct DummyDynamicTransform_1
...
@@ -610,7 +536,8 @@ struct DummyDynamicTransform_1
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_right_pads
)
const
const
Array
<
index_t
,
2
>
in_right_pads
)
const
{
{
const
auto
transformed_tensor_descs
=
map_convolution_into_gemm
(
wei_k_c_y_x_global_desc
,
const
auto
transformed_tensor_descs
=
map_convolution_into_gemm_v1
(
wei_k_c_y_x_global_desc
,
in_n_c_hi_wi_global_desc
,
in_n_c_hi_wi_global_desc
,
out_n_k_ho_wo_global_desc
,
out_n_k_ho_wo_global_desc
,
conv_strides
,
conv_strides
,
...
@@ -662,71 +589,6 @@ struct DummyDynamicTransform_1
...
@@ -662,71 +589,6 @@ struct DummyDynamicTransform_1
}
}
}
}
template
<
typename
WeiDesc
,
typename
InDesc
,
typename
OutDesc
>
__device__
void
Run_2
(
index_t
*
const
__restrict__
p_wei_global
,
float
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_out_global
,
const
WeiDesc
wei_k_c_y_x_global_desc
,
const
InDesc
in_n_c_hi_wi_global_desc
,
const
OutDesc
out_n_k_ho_wo_global_desc
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_right_pads
)
const
{
const
auto
transformed_tensor_descs
=
map_convolution_into_gemm_v2
(
wei_k_c_y_x_global_desc
,
in_n_c_hi_wi_global_desc
,
out_n_k_ho_wo_global_desc
,
conv_strides
,
conv_dilations
,
in_left_pads
,
in_right_pads
);
const
auto
in_gemmk_gemmn_global_desc
=
transformed_tensor_descs
.
At
(
Number
<
0
>
{});
MultiIndex
<
2
>
idx
;
// initialize idx
for
(
index_t
i
=
0
;
i
<
2
;
++
i
)
{
idx
(
i
)
=
p_wei_global
[
get_thread_local_1d_id
()
+
i
];
}
const
index_t
niter
=
p_wei_global
[
10
];
auto
in_gemmk_gemmn_coord
=
make_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_global_desc
,
idx
);
const
auto
in_gemmk_gemmn_coord_step
=
make_dynamic_tensor_coordinate_step_v2
(
in_gemmk_gemmn_global_desc
,
MultiIndex
<
2
>
{{
1
,
0
}});
for
(
index_t
iter
=
0
;
iter
<
niter
;
++
iter
)
{
move_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_global_desc
,
in_gemmk_gemmn_coord
,
in_gemmk_gemmn_coord_step
);
// write
float
value
=
1
;
transfer_data
<
float
,
1
,
AddressSpace
::
Vgpr
,
AddressSpace
::
Global
,
InMemoryDataOperation
::
Set
,
1
,
1
>
(
&
value
,
0
,
true
,
1
,
p_out_global
,
in_gemmk_gemmn_coord
.
GetOffset
(),
coordinate_has_valid_offset_assuming_visible_index_is_valid
(
in_gemmk_gemmn_global_desc
,
in_gemmk_gemmn_coord
),
in_gemmk_gemmn_global_desc
.
GetElementSpaceSize
());
}
}
template
<
typename
WeiDesc
,
typename
InDesc
,
typename
OutDesc
>
template
<
typename
WeiDesc
,
typename
InDesc
,
typename
OutDesc
>
__device__
void
Run
(
index_t
*
const
__restrict__
p_wei_global
,
__device__
void
Run
(
index_t
*
const
__restrict__
p_wei_global
,
float
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_in_global
,
...
@@ -752,153 +614,5 @@ struct DummyDynamicTransform_1
...
@@ -752,153 +614,5 @@ struct DummyDynamicTransform_1
}
}
};
};
template
<
index_t
BlockSize
>
struct
DummyDynamicTransform_2
{
template
<
typename
WeiDesc
,
typename
InDesc
,
typename
OutDesc
>
__device__
void
Run
(
index_t
*
const
__restrict__
p_wei_global
,
float
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_out_global
,
const
WeiDesc
wei_k_c_y_x_global_desc
,
const
InDesc
in_n_c_hi_wi_global_desc
,
const
OutDesc
out_n_k_ho_wo_global_desc
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_right_pads
)
const
{
const
index_t
N
=
in_n_c_hi_wi_global_desc
.
GetLength
(
0
);
const
index_t
C
=
in_n_c_hi_wi_global_desc
.
GetLength
(
1
);
const
index_t
K
=
out_n_k_ho_wo_global_desc
.
GetLength
(
1
);
const
index_t
Y
=
wei_k_c_y_x_global_desc
.
GetLength
(
2
);
const
index_t
X
=
wei_k_c_y_x_global_desc
.
GetLength
(
3
);
const
index_t
Hi
=
in_n_c_hi_wi_global_desc
.
GetLength
(
2
);
const
index_t
Wi
=
in_n_c_hi_wi_global_desc
.
GetLength
(
3
);
const
index_t
Ho
=
out_n_k_ho_wo_global_desc
.
GetLength
(
2
);
const
index_t
Wo
=
out_n_k_ho_wo_global_desc
.
GetLength
(
3
);
const
index_t
ConvStrideH
=
conv_strides
[
0
];
const
index_t
ConvStrideW
=
conv_strides
[
1
];
const
index_t
ConvDilationH
=
conv_dilations
[
0
];
const
index_t
ConvDilationW
=
conv_dilations
[
1
];
const
index_t
InLeftPadH
=
in_left_pads
[
0
];
const
index_t
InLeftPadW
=
in_left_pads
[
1
];
const
index_t
InRightPadH
=
in_right_pads
[
0
];
const
index_t
InRightPadW
=
in_right_pads
[
1
];
const
auto
in_n_c_hip_wip_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
in_n_c_hi_wi_global_desc
,
make_tuple
(
DynamicPassThrough
{
N
},
DynamicPassThrough
{
C
},
DynamicPassThrough
{
Hi
},
DynamicPassThrough
{
Wi
}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
MultiIndex
<
4
>
idx
;
// initialize idx
for
(
index_t
i
=
0
;
i
<
4
;
++
i
)
{
idx
(
i
)
=
p_wei_global
[
get_thread_local_1d_id
()
+
i
];
}
const
index_t
niter
=
p_wei_global
[
10
];
auto
in_coord
=
make_dynamic_tensor_coordinate_v2
(
in_n_c_hip_wip_global_desc
,
idx
);
const
auto
in_coord_step
=
make_dynamic_tensor_coordinate_step_v2
(
in_n_c_hip_wip_global_desc
,
MultiIndex
<
4
>
{{
1
,
0
,
0
,
0
}});
for
(
index_t
iter
=
0
;
iter
<
niter
;
++
iter
)
{
move_dynamic_tensor_coordinate_v2
(
in_n_c_hip_wip_global_desc
,
in_coord
,
in_coord_step
);
// write
float
value
=
1
;
transfer_data
<
float
,
1
,
AddressSpace
::
Vgpr
,
AddressSpace
::
Global
,
InMemoryDataOperation
::
Set
,
1
,
1
>
(
&
value
,
0
,
true
,
1
,
p_out_global
,
in_coord
.
GetOffset
(),
coordinate_has_valid_offset_assuming_visible_index_is_valid
(
in_n_c_hip_wip_global_desc
,
in_coord
),
in_n_c_hip_wip_global_desc
.
GetElementSpaceSize
());
}
}
};
template
<
index_t
BlockSize
>
struct
DummyDynamicTransform_3
{
template
<
typename
WeiDesc
,
typename
InDesc
,
typename
OutDesc
,
typename
TransformInDesc
>
__device__
void
Run
(
index_t
*
const
__restrict__
p_wei_global
,
float
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_out_global
,
const
WeiDesc
wei_k_c_y_x_global_desc
,
const
InDesc
in_n_c_hi_wi_global_desc
,
const
OutDesc
out_n_k_ho_wo_global_desc
,
const
TransformInDesc
in_gemmk_gemmn_global_desc
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_right_pads
)
const
{
MultiIndex
<
2
>
idx
;
// initialize idx
for
(
index_t
i
=
0
;
i
<
2
;
++
i
)
{
idx
(
i
)
=
p_wei_global
[
get_thread_local_1d_id
()
+
i
];
}
const
index_t
niter
=
p_wei_global
[
10
];
auto
in_gemmk_gemmn_coord
=
make_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_global_desc
,
idx
);
const
auto
in_gemmk_gemmn_coord_step
=
make_dynamic_tensor_coordinate_step_v2
(
in_gemmk_gemmn_global_desc
,
MultiIndex
<
2
>
{{
1
,
0
}});
for
(
index_t
iter
=
0
;
iter
<
niter
;
++
iter
)
{
move_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_global_desc
,
in_gemmk_gemmn_coord
,
in_gemmk_gemmn_coord_step
);
// write
float
value
=
1
;
transfer_data
<
float
,
1
,
AddressSpace
::
Vgpr
,
AddressSpace
::
Global
,
InMemoryDataOperation
::
Set
,
1
,
1
>
(
&
value
,
0
,
true
,
1
,
p_out_global
,
in_gemmk_gemmn_coord
.
GetOffset
(),
coordinate_has_valid_offset_assuming_visible_index_is_valid
(
in_gemmk_gemmn_global_desc
,
in_gemmk_gemmn_coord
),
in_gemmk_gemmn_global_desc
.
GetElementSpaceSize
());
}
}
};
}
// namespace ck
}
// namespace ck
#endif
#endif
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform_v2.hpp
0 → 100644
View file @
fc7a6c85
#ifndef CK_DUMMY_DYNAMIC_TRANSFORM_V2_HPP
#define CK_DUMMY_DYNAMIC_TRANSFORM_V2_HPP
#include "common_header.hpp"
#include "dynamic_tensor_descriptor_v2.hpp"
#include "dynamic_tensor_descriptor_helper_v2.hpp"
namespace
ck
{
template
<
typename
WeiDesc
,
typename
InDesc
,
typename
OutDesc
>
__host__
__device__
constexpr
auto
map_convolution_into_gemm_v2
(
const
WeiDesc
&
wei_k_c_y_x_global_desc
,
const
InDesc
&
in_n_c_hi_wi_global_desc
,
const
OutDesc
&
out_n_k_ho_wo_global_desc
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_right_pads
)
{
const
index_t
N
=
in_n_c_hi_wi_global_desc
.
GetLength
(
0
);
const
index_t
C
=
in_n_c_hi_wi_global_desc
.
GetLength
(
1
);
const
index_t
K
=
out_n_k_ho_wo_global_desc
.
GetLength
(
1
);
const
index_t
Y
=
wei_k_c_y_x_global_desc
.
GetLength
(
2
);
const
index_t
X
=
wei_k_c_y_x_global_desc
.
GetLength
(
3
);
const
index_t
Hi
=
in_n_c_hi_wi_global_desc
.
GetLength
(
2
);
const
index_t
Wi
=
in_n_c_hi_wi_global_desc
.
GetLength
(
3
);
const
index_t
Ho
=
out_n_k_ho_wo_global_desc
.
GetLength
(
2
);
const
index_t
Wo
=
out_n_k_ho_wo_global_desc
.
GetLength
(
3
);
const
index_t
ConvStrideH
=
conv_strides
[
0
];
const
index_t
ConvStrideW
=
conv_strides
[
1
];
const
index_t
ConvDilationH
=
conv_dilations
[
0
];
const
index_t
ConvDilationW
=
conv_dilations
[
1
];
const
index_t
InLeftPadH
=
in_left_pads
[
0
];
const
index_t
InLeftPadW
=
in_left_pads
[
1
];
const
index_t
InRightPadH
=
in_right_pads
[
0
];
const
index_t
InRightPadW
=
in_right_pads
[
1
];
// input tensor
const
auto
in_n_c_hip_wip_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
transform_dynamic_tensor_descriptor_v2
(
in_n_c_hi_wi_global_desc
,
make_tuple
(
DynamicPassThrough
{
N
},
DynamicPassThrough
{
C
},
DynamicLeftPad
{
Hi
,
InLeftPadH
},
DynamicLeftPad
{
Wi
,
InLeftPadW
}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{})),
make_tuple
(
DynamicPassThrough
{
N
},
DynamicPassThrough
{
C
},
DynamicRightPad
{
Hi
+
InLeftPadH
,
InRightPadH
},
DynamicRightPad
{
Wi
+
InLeftPadW
,
InRightPadW
}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
const
index_t
Hip
=
in_n_c_hip_wip_global_desc
.
GetLength
(
2
);
const
index_t
Wip
=
in_n_c_hip_wip_global_desc
.
GetLength
(
3
);
const
auto
in_n_c_y_ho_x_wo_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
in_n_c_hip_wip_global_desc
,
make_tuple
(
DynamicPassThrough
{
N
},
DynamicPassThrough
{
C
},
DynamicEmbed
<
2
>
{{
Y
,
Ho
},
{
ConvDilationH
,
ConvStrideH
,
0
}},
DynamicEmbed
<
2
>
{{
X
,
Wo
},
{
ConvDilationW
,
ConvStrideW
,
0
}}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{},
Sequence
<
4
,
5
>
{}));
const
auto
in_gemmk_gemmn_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
in_n_c_y_ho_x_wo_global_desc
,
make_tuple
(
DynamicMerge
<
3
>
{
MultiIndex
<
3
>
{{
C
,
Y
,
X
}}},
DynamicMerge
<
3
>
{
MultiIndex
<
3
>
{{
N
,
Ho
,
Wo
}}}),
make_tuple
(
Sequence
<
1
,
2
,
4
>
{},
Sequence
<
0
,
3
,
5
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
return
make_tuple
(
in_gemmk_gemmn_global_desc
);
}
template
<
index_t
BlockSize
>
struct
DummyDynamicTransform_v2
{
template
<
typename
WeiDesc
,
typename
InDesc
,
typename
OutDesc
,
typename
TransformInDesc
>
__device__
void
Run_1
(
index_t
*
const
__restrict__
p_wei_global
,
float
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_out_global
,
const
WeiDesc
wei_k_c_y_x_global_desc
,
const
InDesc
in_n_c_hi_wi_global_desc
,
const
OutDesc
out_n_k_ho_wo_global_desc
,
const
TransformInDesc
/* in_gemmk_gemmn_global_desc */
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_right_pads
)
const
{
const
auto
transformed_tensor_descs
=
map_convolution_into_gemm_v2
(
move
(
wei_k_c_y_x_global_desc
),
move
(
in_n_c_hi_wi_global_desc
),
move
(
out_n_k_ho_wo_global_desc
),
conv_strides
,
conv_dilations
,
in_left_pads
,
in_right_pads
);
const
auto
in_gemmk_gemmn_global_desc
=
transformed_tensor_descs
.
At
(
Number
<
0
>
{});
MultiIndex
<
2
>
idx
;
// initialize idx
for
(
index_t
i
=
0
;
i
<
2
;
++
i
)
{
idx
(
i
)
=
p_wei_global
[
get_thread_local_1d_id
()
+
i
];
}
auto
in_gemmk_gemmn_coord
=
make_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_global_desc
,
idx
);
const
auto
in_gemmk_gemmn_coord_step
=
make_dynamic_tensor_coordinate_step_v2
(
in_gemmk_gemmn_global_desc
,
MultiIndex
<
2
>
{{
1
,
0
}});
#pragma unroll
for
(
index_t
i
=
0
;
i
<
10
;
++
i
)
{
move_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_global_desc
,
in_gemmk_gemmn_coord
,
in_gemmk_gemmn_coord_step
);
p_out_global
[
in_gemmk_gemmn_coord
.
GetOffset
()]
=
1
;
}
}
template
<
typename
WeiDesc
,
typename
InDesc
,
typename
OutDesc
,
typename
TransformInDesc
>
__device__
void
Run_2
(
index_t
*
const
__restrict__
p_wei_global
,
float
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_out_global
,
const
WeiDesc
wei_k_c_y_x_global_desc
,
const
InDesc
in_n_c_hi_wi_global_desc
,
const
OutDesc
out_n_k_ho_wo_global_desc
,
const
TransformInDesc
/* in_gemmk_gemmn_global_desc */
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_right_pads
)
const
{
const
index_t
N
=
in_n_c_hi_wi_global_desc
.
GetLength
(
0
);
const
index_t
C
=
in_n_c_hi_wi_global_desc
.
GetLength
(
1
);
const
index_t
K
=
out_n_k_ho_wo_global_desc
.
GetLength
(
1
);
const
index_t
Y
=
wei_k_c_y_x_global_desc
.
GetLength
(
2
);
const
index_t
X
=
wei_k_c_y_x_global_desc
.
GetLength
(
3
);
const
index_t
Hi
=
in_n_c_hi_wi_global_desc
.
GetLength
(
2
);
const
index_t
Wi
=
in_n_c_hi_wi_global_desc
.
GetLength
(
3
);
const
index_t
Ho
=
out_n_k_ho_wo_global_desc
.
GetLength
(
2
);
const
index_t
Wo
=
out_n_k_ho_wo_global_desc
.
GetLength
(
3
);
const
index_t
ConvStrideH
=
conv_strides
[
0
];
const
index_t
ConvStrideW
=
conv_strides
[
1
];
const
index_t
ConvDilationH
=
conv_dilations
[
0
];
const
index_t
ConvDilationW
=
conv_dilations
[
1
];
const
index_t
InLeftPadH
=
in_left_pads
[
0
];
const
index_t
InLeftPadW
=
in_left_pads
[
1
];
const
index_t
InRightPadH
=
in_right_pads
[
0
];
const
index_t
InRightPadW
=
in_right_pads
[
1
];
#if 0
const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor_v2(
move(in_n_c_hi_wi_global_desc),
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicPassThrough{Hi},
DynamicPassThrough{Wi}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
#elif
0
const
auto
in_n_c_hip_wip_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
move
(
in_n_c_hi_wi_global_desc
),
make_tuple
(
DynamicPassThrough
{
N
},
DynamicPassThrough
{
C
},
DynamicLeftPad
{
Hi
,
InLeftPadH
},
DynamicLeftPad
{
Wi
,
InLeftPadW
}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
#else
const
auto
in_n_c_hip_wip_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
transform_dynamic_tensor_descriptor_v2
(
move
(
in_n_c_hi_wi_global_desc
),
make_tuple
(
DynamicPassThrough
{
N
},
DynamicPassThrough
{
C
},
DynamicLeftPad
{
Hi
,
InLeftPadH
},
DynamicLeftPad
{
Wi
,
InLeftPadW
}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{})),
make_tuple
(
DynamicPassThrough
{
N
},
DynamicPassThrough
{
C
},
DynamicRightPad
{
Hi
+
InLeftPadH
,
InRightPadH
},
DynamicRightPad
{
Wi
+
InLeftPadW
,
InRightPadW
}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
#endif
MultiIndex
<
4
>
idx
;
// initialize idx
for
(
index_t
i
=
0
;
i
<
4
;
++
i
)
{
idx
(
i
)
=
p_wei_global
[
get_thread_local_1d_id
()
+
i
];
}
#if 0
const index_t niter = p_wei_global[10];
auto in_coord = make_dynamic_tensor_coordinate_v2(in_n_c_hip_wip_global_desc, idx);
const auto in_coord_step = make_dynamic_tensor_coordinate_step_v2(
in_n_c_hip_wip_global_desc, MultiIndex<4>{{1, 0, 0, 0}});
for(index_t iter = 0; iter < niter; ++iter)
{
move_dynamic_tensor_coordinate_v2(in_n_c_hip_wip_global_desc, in_coord, in_coord_step);
// write
float value = 1;
transfer_data<float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(&value,
0,
true,
1,
p_out_global,
in_coord.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
in_n_c_hip_wip_global_desc, in_coord),
in_n_c_hip_wip_global_desc.GetElementSpaceSize());
}
#else
// write
// auto in_coord = make_dynamic_tensor_coordinate_v2(in_n_c_hi_wi_global_desc, idx);
p_out_global
[
in_n_c_hip_wip_global_desc
.
CalculateOffset
(
idx
)]
=
1
;
#endif
}
template
<
typename
WeiDesc
,
typename
InDesc
,
typename
OutDesc
,
typename
TransformInDesc
>
__device__
void
Run_3
(
index_t
*
const
__restrict__
p_wei_global
,
float
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_out_global
,
const
WeiDesc
/* wei_k_c_y_x_global_desc */
,
const
InDesc
/* in_n_c_hi_wi_global_desc */
,
const
OutDesc
/* out_n_k_ho_wo_global_desc */
,
const
TransformInDesc
in_gemmk_gemmn_global_desc
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_right_pads
)
const
{
MultiIndex
<
2
>
idx
;
// initialize idx
for
(
index_t
i
=
0
;
i
<
2
;
++
i
)
{
idx
(
i
)
=
p_wei_global
[
get_thread_local_1d_id
()
+
i
];
}
#if 0
const index_t niter = p_wei_global[10];
auto in_gemmk_gemmn_coord =
make_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_global_desc, idx);
const auto in_gemmk_gemmn_coord_step = make_dynamic_tensor_coordinate_step_v2(
in_gemmk_gemmn_global_desc, MultiIndex<2>{{1, 0}});
for(index_t iter = 0; iter < niter; ++iter)
{
move_dynamic_tensor_coordinate_v2(
in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord, in_gemmk_gemmn_coord_step);
// write
float value = 1;
transfer_data<float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(&value,
0,
true,
1,
p_out_global,
in_gemmk_gemmn_coord.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord),
in_gemmk_gemmn_global_desc.GetElementSpaceSize());
}
#else
p_out_global
[
in_gemmk_gemmn_global_desc
.
CalculateOffset
(
idx
)]
=
1
;
#endif
}
template
<
typename
WeiDesc
,
typename
InDesc
,
typename
OutDesc
,
typename
TransformInDesc
>
__device__
void
Run
(
index_t
*
const
__restrict__
p_wei_global
,
float
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_out_global
,
const
WeiDesc
wei_k_c_y_x_global_desc
,
const
InDesc
in_n_c_hi_wi_global_desc
,
const
OutDesc
out_n_k_ho_wo_global_desc
,
const
TransformInDesc
in_gemmk_gemmn_global_desc
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_right_pads
)
const
{
Run_1
(
p_wei_global
,
p_in_global
,
p_out_global
,
wei_k_c_y_x_global_desc
,
in_n_c_hi_wi_global_desc
,
out_n_k_ho_wo_global_desc
,
in_gemmk_gemmn_global_desc
,
conv_strides
,
conv_dilations
,
in_left_pads
,
in_right_pads
);
}
};
}
// namespace ck
#endif
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_v2.hpp
View file @
fc7a6c85
...
@@ -125,7 +125,34 @@ struct DynamicTensorDescriptor_v2
...
@@ -125,7 +125,34 @@ struct DynamicTensorDescriptor_v2
{
{
static_assert
(
Idx
::
Size
()
==
GetNumOfDimension
(),
"wrong! inconsistent # of dimension"
);
static_assert
(
Idx
::
Size
()
==
GetNumOfDimension
(),
"wrong! inconsistent # of dimension"
);
#if 0 // debug
return make_dynamic_tensor_coordinate_v2(*this, idx).GetOffset();
return make_dynamic_tensor_coordinate_v2(*this, idx).GetOffset();
#else
constexpr
index_t
ntransform
=
GetNumOfTransform
();
constexpr
index_t
ndim_hidden
=
GetNumOfHiddenDimension
();
constexpr
index_t
ndim_visible
=
GetNumOfVisibleDimension
();
constexpr
auto
visible_dim_ids
=
GetVisibleDimensionIds
();
MultiIndex
<
ndim_hidden
>
idx_hidden
;
// initialize visible index
auto
idx_hidden_pick_visible
=
pick_array_element
(
idx_hidden
,
visible_dim_ids
);
idx_hidden_pick_visible
=
idx
;
// calculate hidden index
static_for
<
ntransform
-
1
,
-
1
,
-
1
>
{}([
this
,
&
idx_hidden
](
auto
itran
)
{
const
auto
&
tran
=
this
->
GetTransforms
().
At
(
itran
);
constexpr
auto
dims_low
=
GetLowerDimensionIdss
().
At
(
itran
);
constexpr
auto
dims_up
=
GetUpperDimensionIdss
().
At
(
itran
);
const
auto
idx_up
=
pick_array_element
(
idx_hidden
,
dims_up
);
auto
idx_low
=
pick_array_element
(
idx_hidden
,
dims_low
);
tran
.
CalculateLowerIndex
(
idx_low
,
idx_up
);
});
return
idx_hidden
[
0
];
#endif
}
}
// private:
// private:
...
...
driver/include/device_dummy_dynamic_transform.hpp
→
driver/include/device_dummy_dynamic_transform
_v1
.hpp
View file @
fc7a6c85
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
#include "device.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "dummy_dynamic_transform.hpp"
#include "dummy_dynamic_transform
_v1
.hpp"
template
<
class
T
,
template
<
class
T
,
class
InDesc
,
class
InDesc
,
...
@@ -12,7 +12,7 @@ template <class T,
...
@@ -12,7 +12,7 @@ template <class T,
class
ConvDilations
,
class
ConvDilations
,
class
InLeftPads
,
class
InLeftPads
,
class
InRightPads
>
class
InRightPads
>
void
device_dummy_dynamic_transform_1
(
InDesc
,
void
device_dummy_dynamic_transform_
v
1
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
const
Tensor
<
T
>&
in_nchw
,
WeiDesc
,
WeiDesc
,
const
Tensor
<
T
>&
wei_kcyx
,
const
Tensor
<
T
>&
wei_kcyx
,
...
@@ -41,7 +41,7 @@ void device_dummy_dynamic_transform_1(InDesc,
...
@@ -41,7 +41,7 @@ void device_dummy_dynamic_transform_1(InDesc,
const
auto
in_right_pads
=
to_array
(
InRightPads
{});
const
auto
in_right_pads
=
to_array
(
InRightPads
{});
{
{
const
auto
tensor_descs
=
map_convolution_into_gemm
(
wei_kcyx_desc
,
const
auto
tensor_descs
=
map_convolution_into_gemm
_v1
(
wei_kcyx_desc
,
in_nchw_desc
,
in_nchw_desc
,
out_nkhw_desc
,
out_nkhw_desc
,
conv_strides
,
conv_strides
,
...
@@ -97,7 +97,7 @@ void device_dummy_dynamic_transform_1(InDesc,
...
@@ -97,7 +97,7 @@ void device_dummy_dynamic_transform_1(InDesc,
printf
(
"%s: BlockSize %u, GridSize %u
\n
"
,
__func__
,
BlockSize
,
GridSize
);
printf
(
"%s: BlockSize %u, GridSize %u
\n
"
,
__func__
,
BlockSize
,
GridSize
);
using
dummy_transform
=
DummyDynamicTransform_1
<
BlockSize
>
;
using
dummy_transform
=
DummyDynamicTransform_
v
1
<
BlockSize
>
;
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
{
{
...
@@ -138,229 +138,3 @@ void device_dummy_dynamic_transform_1(InDesc,
...
@@ -138,229 +138,3 @@ void device_dummy_dynamic_transform_1(InDesc,
out_nkhw_device_buf
.
FromDevice
(
out_nkhw
.
mData
.
data
());
out_nkhw_device_buf
.
FromDevice
(
out_nkhw
.
mData
.
data
());
}
}
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
,
class
ConvStrides
,
class
ConvDilations
,
class
InLeftPads
,
class
InRightPads
>
void
device_dummy_dynamic_transform_2
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
WeiDesc
,
const
Tensor
<
T
>&
wei_kcyx
,
OutDesc
,
Tensor
<
T
>&
out_nkhw
,
ConvStrides
,
ConvDilations
,
InLeftPads
,
InRightPads
,
ck
::
index_t
nrepeat
)
{
using
namespace
ck
;
using
TDevice
=
typename
conditional
<
is_same
<
half_float
::
half
,
T
>::
value
,
half_t
,
T
>::
type
;
const
auto
in_nchw_desc
=
make_dynamic_native_tensor_descriptor_v2
(
to_array
(
InDesc
::
GetLengths
()),
to_array
(
InDesc
::
GetStrides
()));
const
auto
wei_kcyx_desc
=
make_dynamic_native_tensor_descriptor_v2
(
to_array
(
WeiDesc
::
GetLengths
()),
to_array
(
WeiDesc
::
GetStrides
()));
const
auto
out_nkhw_desc
=
make_dynamic_native_tensor_descriptor_v2
(
to_array
(
OutDesc
::
GetLengths
()),
to_array
(
OutDesc
::
GetStrides
()));
const
auto
conv_strides
=
to_array
(
ConvStrides
{});
const
auto
conv_dilations
=
to_array
(
ConvDilations
{});
const
auto
in_left_pads
=
to_array
(
InLeftPads
{});
const
auto
in_right_pads
=
to_array
(
InRightPads
{});
{
const
auto
tensor_descs
=
map_convolution_into_gemm_v2
(
wei_kcyx_desc
,
in_nchw_desc
,
out_nkhw_desc
,
conv_strides
,
conv_dilations
,
in_left_pads
,
in_right_pads
);
const
auto
in_gemmk_gemmn_global_desc
=
tensor_descs
.
At
(
Number
<
0
>
{});
auto
in_gemmk_gemmn_coord
=
make_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_global_desc
,
MultiIndex
<
2
>
{{
0
,
0
}});
const
auto
in_gemmk_gemmn_coord_step
=
make_dynamic_tensor_coordinate_step_v2
(
in_gemmk_gemmn_global_desc
,
MultiIndex
<
2
>
{{
0
,
1
}});
for
(
index_t
iter
=
0
;
iter
<
20
;
++
iter
)
{
printf
(
"iter %d
\n
"
,
iter
);
print_array_v2
(
"visible idx: "
,
in_gemmk_gemmn_coord
.
GetIndex
());
print_array_v2
(
"hidden idx: "
,
in_gemmk_gemmn_coord
.
GetHiddenIndex
());
printf
(
"offset: %d
\n
"
,
in_gemmk_gemmn_coord
.
GetOffset
());
printf
(
"
\n
"
);
move_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_global_desc
,
in_gemmk_gemmn_coord
,
in_gemmk_gemmn_coord_step
);
}
}
std
::
size_t
data_sz
=
sizeof
(
T
);
DeviceMem
in_nchw_device_buf
(
data_sz
*
in_nchw
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_kcyx_device_buf
(
data_sz
*
wei_kcyx
.
mDesc
.
GetElementSpace
());
DeviceMem
out_nkhw_device_buf
(
data_sz
*
out_nkhw
.
mDesc
.
GetElementSpace
());
in_nchw_device_buf
.
ToDevice
(
in_nchw
.
mData
.
data
());
wei_kcyx_device_buf
.
ToDevice
(
wei_kcyx
.
mData
.
data
());
out_nkhw_device_buf
.
ToDevice
(
out_nkhw
.
mData
.
data
());
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
GridSize
=
1
;
printf
(
"%s: BlockSize %u, GridSize %u
\n
"
,
__func__
,
BlockSize
,
GridSize
);
using
dummy_transform
=
DummyDynamicTransform_2
<
BlockSize
>
;
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
{
std
::
cout
<<
"Start running "
<<
nrepeat
<<
" times..."
<<
std
::
endl
;
KernelTimer
timer
;
timer
.
Start
();
for
(
index_t
j
=
0
;
j
<
nrepeat
;
++
j
)
{
launch_kernel
(
run_gridwise_operation
<
dummy_transform
,
index_t
*
const
,
float
*
const
,
float
*
const
,
const
decltype
(
wei_kcyx_desc
),
const
decltype
(
in_nchw_desc
),
const
decltype
(
out_nkhw_desc
),
const
Array
<
index_t
,
2
>
,
const
Array
<
index_t
,
2
>
,
const
Array
<
index_t
,
2
>
,
const
Array
<
index_t
,
2
>>
,
dim3
(
GridSize
),
dim3
(
BlockSize
),
0
,
0
,
static_cast
<
index_t
*>
(
wei_kcyx_device_buf
.
GetDeviceBuffer
()),
static_cast
<
float
*>
(
in_nchw_device_buf
.
GetDeviceBuffer
()),
static_cast
<
float
*>
(
out_nkhw_device_buf
.
GetDeviceBuffer
()),
wei_kcyx_desc
,
in_nchw_desc
,
out_nkhw_desc
,
conv_strides
,
conv_dilations
,
in_left_pads
,
in_right_pads
);
}
}
out_nkhw_device_buf
.
FromDevice
(
out_nkhw
.
mData
.
data
());
}
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
,
class
ConvStrides
,
class
ConvDilations
,
class
InLeftPads
,
class
InRightPads
>
void
device_dummy_dynamic_transform_3
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
WeiDesc
,
const
Tensor
<
T
>&
wei_kcyx
,
OutDesc
,
Tensor
<
T
>&
out_nkhw
,
ConvStrides
,
ConvDilations
,
InLeftPads
,
InRightPads
,
ck
::
index_t
nrepeat
)
{
using
namespace
ck
;
using
TDevice
=
typename
conditional
<
is_same
<
half_float
::
half
,
T
>::
value
,
half_t
,
T
>::
type
;
const
auto
in_nchw_desc
=
make_dynamic_native_tensor_descriptor_v2
(
to_array
(
InDesc
::
GetLengths
()),
to_array
(
InDesc
::
GetStrides
()));
const
auto
wei_kcyx_desc
=
make_dynamic_native_tensor_descriptor_v2
(
to_array
(
WeiDesc
::
GetLengths
()),
to_array
(
WeiDesc
::
GetStrides
()));
const
auto
out_nkhw_desc
=
make_dynamic_native_tensor_descriptor_v2
(
to_array
(
OutDesc
::
GetLengths
()),
to_array
(
OutDesc
::
GetStrides
()));
const
auto
conv_strides
=
to_array
(
ConvStrides
{});
const
auto
conv_dilations
=
to_array
(
ConvDilations
{});
const
auto
in_left_pads
=
to_array
(
InLeftPads
{});
const
auto
in_right_pads
=
to_array
(
InRightPads
{});
const
auto
tensor_descs
=
map_convolution_into_gemm_v2
(
wei_kcyx_desc
,
in_nchw_desc
,
out_nkhw_desc
,
conv_strides
,
conv_dilations
,
in_left_pads
,
in_right_pads
);
const
auto
in_gemmk_gemmn_global_desc
=
tensor_descs
.
At
(
Number
<
0
>
{});
std
::
size_t
data_sz
=
sizeof
(
T
);
DeviceMem
in_nchw_device_buf
(
data_sz
*
in_nchw
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_kcyx_device_buf
(
data_sz
*
wei_kcyx
.
mDesc
.
GetElementSpace
());
DeviceMem
out_nkhw_device_buf
(
data_sz
*
out_nkhw
.
mDesc
.
GetElementSpace
());
in_nchw_device_buf
.
ToDevice
(
in_nchw
.
mData
.
data
());
wei_kcyx_device_buf
.
ToDevice
(
wei_kcyx
.
mData
.
data
());
out_nkhw_device_buf
.
ToDevice
(
out_nkhw
.
mData
.
data
());
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
GridSize
=
1
;
printf
(
"%s: BlockSize %u, GridSize %u
\n
"
,
__func__
,
BlockSize
,
GridSize
);
using
dummy_transform
=
DummyDynamicTransform_3
<
BlockSize
>
;
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
{
std
::
cout
<<
"Start running "
<<
nrepeat
<<
" times..."
<<
std
::
endl
;
KernelTimer
timer
;
timer
.
Start
();
for
(
index_t
j
=
0
;
j
<
nrepeat
;
++
j
)
{
launch_kernel
(
run_gridwise_operation
<
dummy_transform
,
index_t
*
const
,
float
*
const
,
float
*
const
,
const
decltype
(
wei_kcyx_desc
),
const
decltype
(
in_nchw_desc
),
const
decltype
(
out_nkhw_desc
),
const
decltype
(
in_gemmk_gemmn_global_desc
),
const
Array
<
index_t
,
2
>
,
const
Array
<
index_t
,
2
>
,
const
Array
<
index_t
,
2
>
,
const
Array
<
index_t
,
2
>>
,
dim3
(
GridSize
),
dim3
(
BlockSize
),
0
,
0
,
static_cast
<
index_t
*>
(
wei_kcyx_device_buf
.
GetDeviceBuffer
()),
static_cast
<
float
*>
(
in_nchw_device_buf
.
GetDeviceBuffer
()),
static_cast
<
float
*>
(
out_nkhw_device_buf
.
GetDeviceBuffer
()),
wei_kcyx_desc
,
in_nchw_desc
,
out_nkhw_desc
,
in_gemmk_gemmn_global_desc
,
conv_strides
,
conv_dilations
,
in_left_pads
,
in_right_pads
);
}
}
out_nkhw_device_buf
.
FromDevice
(
out_nkhw
.
mData
.
data
());
}
driver/include/device_dummy_dynamic_transform_v2.hpp
0 → 100644
View file @
fc7a6c85
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "dummy_dynamic_transform_v2.hpp"
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
,
class
ConvStrides
,
class
ConvDilations
,
class
InLeftPads
,
class
InRightPads
>
void
device_dummy_dynamic_transform_v2
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
WeiDesc
,
const
Tensor
<
T
>&
wei_kcyx
,
OutDesc
,
Tensor
<
T
>&
out_nkhw
,
ConvStrides
,
ConvDilations
,
InLeftPads
,
InRightPads
,
ck
::
index_t
nrepeat
)
{
using
namespace
ck
;
using
TDevice
=
typename
conditional
<
is_same
<
half_float
::
half
,
T
>::
value
,
half_t
,
T
>::
type
;
const
auto
in_nchw_desc
=
make_dynamic_native_tensor_descriptor_v2
(
to_array
(
InDesc
::
GetLengths
()),
to_array
(
InDesc
::
GetStrides
()));
const
auto
wei_kcyx_desc
=
make_dynamic_native_tensor_descriptor_v2
(
to_array
(
WeiDesc
::
GetLengths
()),
to_array
(
WeiDesc
::
GetStrides
()));
const
auto
out_nkhw_desc
=
make_dynamic_native_tensor_descriptor_v2
(
to_array
(
OutDesc
::
GetLengths
()),
to_array
(
OutDesc
::
GetStrides
()));
const
auto
conv_strides
=
to_array
(
ConvStrides
{});
const
auto
conv_dilations
=
to_array
(
ConvDilations
{});
const
auto
in_left_pads
=
to_array
(
InLeftPads
{});
const
auto
in_right_pads
=
to_array
(
InRightPads
{});
const
auto
tensor_descs
=
map_convolution_into_gemm_v2
(
wei_kcyx_desc
,
in_nchw_desc
,
out_nkhw_desc
,
conv_strides
,
conv_dilations
,
in_left_pads
,
in_right_pads
);
const
auto
in_gemmk_gemmn_global_desc
=
tensor_descs
.
At
(
Number
<
0
>
{});
std
::
size_t
data_sz
=
sizeof
(
T
);
DeviceMem
in_nchw_device_buf
(
data_sz
*
in_nchw
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_kcyx_device_buf
(
data_sz
*
wei_kcyx
.
mDesc
.
GetElementSpace
());
DeviceMem
out_nkhw_device_buf
(
data_sz
*
out_nkhw
.
mDesc
.
GetElementSpace
());
in_nchw_device_buf
.
ToDevice
(
in_nchw
.
mData
.
data
());
wei_kcyx_device_buf
.
ToDevice
(
wei_kcyx
.
mData
.
data
());
out_nkhw_device_buf
.
ToDevice
(
out_nkhw
.
mData
.
data
());
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
GridSize
=
1
;
printf
(
"%s: BlockSize %u, GridSize %u
\n
"
,
__func__
,
BlockSize
,
GridSize
);
using
dummy_transform
=
DummyDynamicTransform_v2
<
BlockSize
>
;
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
{
std
::
cout
<<
"Start running "
<<
nrepeat
<<
" times..."
<<
std
::
endl
;
KernelTimer
timer
;
timer
.
Start
();
for
(
index_t
j
=
0
;
j
<
nrepeat
;
++
j
)
{
launch_kernel
(
run_gridwise_operation
<
dummy_transform
,
index_t
*
const
,
float
*
const
,
float
*
const
,
const
decltype
(
wei_kcyx_desc
),
const
decltype
(
in_nchw_desc
),
const
decltype
(
out_nkhw_desc
),
const
decltype
(
in_gemmk_gemmn_global_desc
),
const
Array
<
index_t
,
2
>
,
const
Array
<
index_t
,
2
>
,
const
Array
<
index_t
,
2
>
,
const
Array
<
index_t
,
2
>>
,
dim3
(
GridSize
),
dim3
(
BlockSize
),
0
,
0
,
static_cast
<
index_t
*>
(
wei_kcyx_device_buf
.
GetDeviceBuffer
()),
static_cast
<
float
*>
(
in_nchw_device_buf
.
GetDeviceBuffer
()),
static_cast
<
float
*>
(
out_nkhw_device_buf
.
GetDeviceBuffer
()),
wei_kcyx_desc
,
in_nchw_desc
,
out_nkhw_desc
,
in_gemmk_gemmn_global_desc
,
conv_strides
,
conv_dilations
,
in_left_pads
,
in_right_pads
);
}
}
out_nkhw_device_buf
.
FromDevice
(
out_nkhw
.
mData
.
data
());
}
driver/src/conv_driver.cpp
View file @
fc7a6c85
...
@@ -14,7 +14,8 @@
...
@@ -14,7 +14,8 @@
#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_dummy_static_transform.hpp"
#include "device_dummy_static_transform.hpp"
#include "device_dummy_dynamic_transform.hpp"
#include "device_dummy_dynamic_transform_v1.hpp"
#include "device_dummy_dynamic_transform_v2.hpp"
int
main
(
int
argc
,
char
*
argv
[])
int
main
(
int
argc
,
char
*
argv
[])
{
{
...
@@ -585,7 +586,7 @@ int main(int argc, char* argv[])
...
@@ -585,7 +586,7 @@ int main(int argc, char* argv[])
RightPads
{},
RightPads
{},
nrepeat
);
nrepeat
);
#elif 0
#elif 0
device_dummy_dynamic_transform_1
(
in_nchw_desc
,
device_dummy_dynamic_transform_
v
1
(
in_nchw_desc
,
in_nchw
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx_desc
,
wei_kcyx
,
wei_kcyx
,
...
@@ -597,19 +598,7 @@ int main(int argc, char* argv[])
...
@@ -597,19 +598,7 @@ int main(int argc, char* argv[])
RightPads
{},
RightPads
{},
nrepeat
);
nrepeat
);
#elif 1
#elif 1
device_dummy_dynamic_transform_2
(
in_nchw_desc
,
device_dummy_dynamic_transform_v2
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
ConvStrides
{},
ConvDilations
{},
LeftPads
{},
RightPads
{},
nrepeat
);
#elif 1
device_dummy_dynamic_transform_3
(
in_nchw_desc
,
in_nchw
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx_desc
,
wei_kcyx
,
wei_kcyx
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment