Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
d3405258
"configs/vscode:/vscode.git/clone" did not exist on "c78a4df9239e83aae7fbb51b9cc129a96f824cb6"
Commit
d3405258
authored
Jul 29, 2020
by
Chao Liu
Browse files
prototype dynamic descriptor
parent
834eb24c
Changes
12
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
1172 additions
and
51 deletions
+1172
-51
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp
...rnel/include/kernel_algorithm/dummy_dynamic_transform.hpp
+502
-0
composable_kernel/include/kernel_algorithm/dummy_static_transform.hpp
...ernel/include/kernel_algorithm/dummy_static_transform.hpp
+19
-24
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
...lude/tensor_description/dynamic_multi_index_transform.hpp
+223
-0
composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp
.../include/tensor_description/dynamic_tensor_descriptor.hpp
+237
-0
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper.hpp
...e/tensor_description/dynamic_tensor_descriptor_helper.hpp
+35
-0
composable_kernel/include/utility/amd_llvm_intrinsic.hpp
composable_kernel/include/utility/amd_llvm_intrinsic.hpp
+11
-0
composable_kernel/include/utility/common_header.hpp
composable_kernel/include/utility/common_header.hpp
+1
-0
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
.../device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+2
-2
driver/include/device_dummy_dynamic_transform.hpp
driver/include/device_dummy_dynamic_transform.hpp
+105
-0
driver/include/device_dummy_static_transform.hpp
driver/include/device_dummy_static_transform.hpp
+11
-11
driver/src/conv_driver.cpp
driver/src/conv_driver.cpp
+26
-13
driver/src/conv_driver.cu
driver/src/conv_driver.cu
+0
-1
No files found.
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp
0 → 100644
View file @
d3405258
#ifndef CK_DUMMY_DYNAMIC_TRANSFORM_HPP
#define CK_DUMMY_DYNAMIC_TRANSFORM_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
namespace
ck
{
template <index_t BlockSize>
struct DummyDynamicTransform
{
    // Debug/prototype kernel: exercises the dynamic (run-time) tensor descriptor.
    // Reads probe coordinates from p_in_global, computes one offset through the
    // descriptor, and writes results into p_wei_global so the compiler cannot
    // optimize the address computation away.
    __device__ void Run_(index_t* const __restrict__ p_wei_global,
                         index_t* const __restrict__ p_in_global,
                         float* const __restrict__ p_out_global,
                         const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
                         const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
                         const DynamicNativeTensorDescriptor<4> out_n_k_ho_wo_global_desc,
                         const Array<index_t, 2> conv_strides,
                         const Array<index_t, 2> conv_dilations,
                         const Array<index_t, 2> in_left_pads,
                         const Array<index_t, 2> in_right_pads,
                         index_t k_block_num,
                         index_t c_block_num,
                         index_t y_block_num,
                         index_t x_block_num) const
    {
        const index_t N  = in_n_c_hi_wi_global_desc.GetLength(0);
        const index_t C  = in_n_c_hi_wi_global_desc.GetLength(1);
        const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
        const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);

        const index_t K  = out_n_k_ho_wo_global_desc.GetLength(1);
        const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(2);
        const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(3);

        const index_t Y = wei_k_c_y_x_global_desc.GetLength(2);
        const index_t X = wei_k_c_y_x_global_desc.GetLength(3);

        const index_t ConvStrideH = conv_strides[0];
        const index_t ConvStrideW = conv_strides[1];

        const index_t ConvDilationH = conv_dilations[0];
        const index_t ConvDilationW = conv_dilations[1];

        p_wei_global[0] = wei_k_c_y_x_global_desc.GetElementSize();
        p_wei_global[1] = wei_k_c_y_x_global_desc.GetElementSpace();

        // linearized block-id decomposition strides
        const index_t k_block_num_stride = c_block_num * y_block_num * x_block_num;
        const index_t c_block_num_stride = y_block_num * x_block_num;
        const index_t y_block_num_stride = x_block_num;

        index_t tmp = get_block_1d_id();

        // readfirstlane keeps the block coordinates in scalar registers
        const index_t k_block = __llvm_amdgcn_readfirstlane_i32(tmp / k_block_num_stride);
        tmp -= k_block * k_block_num_stride;
        const index_t c_block = __llvm_amdgcn_readfirstlane_i32(tmp / c_block_num_stride);
        tmp -= c_block * c_block_num_stride;
        const index_t y_block = __llvm_amdgcn_readfirstlane_i32(tmp / y_block_num_stride);
        tmp -= y_block * y_block_num_stride;
        const index_t x_block = __llvm_amdgcn_readfirstlane_i32(tmp);

        // per-thread coordinates come from global memory (opaque to the compiler)
        const index_t k_thread = p_in_global[get_thread_local_1d_id()];
        const index_t c_thread = p_in_global[get_thread_local_1d_id() + 1];
        const index_t y_thread = p_in_global[get_thread_local_1d_id() + 2];
        const index_t x_thread = p_in_global[get_thread_local_1d_id() + 3];

        p_wei_global[3] = wei_k_c_y_x_global_desc.CalculateOffset(
            {k_block + k_thread, c_block + c_thread, y_block + y_thread, x_block + x_thread});
    }

    // Prototype of the "lower index diff" propagation that a transformed
    // descriptor would perform: implements pass-through / pad / embed /
    // unmerge / merge transforms as local lambdas and chains them the way
    // the implicit-GEMM input-tensor transformation does.
    __device__ void Run(index_t* const __restrict__ p_wei_global,
                        index_t* const __restrict__ p_in_global,
                        float* const __restrict__ p_out_global,
                        const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
                        const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
                        const DynamicNativeTensorDescriptor<4> out_n_k_ho_wo_global_desc,
                        const Array<index_t, 2> conv_strides,
                        const Array<index_t, 2> conv_dilations,
                        const Array<index_t, 2> in_left_pads,
                        const Array<index_t, 2> in_right_pads,
                        index_t,
                        index_t,
                        index_t,
                        index_t) const
    {
        const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
        const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);
        const index_t K = out_n_k_ho_wo_global_desc.GetLength(1);

        const index_t Y = wei_k_c_y_x_global_desc.GetLength(2);
        const index_t X = wei_k_c_y_x_global_desc.GetLength(3);

        const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
        const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);

        const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(2);
        const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(3);

        const index_t ConvStrideH = conv_strides[0];
        const index_t ConvStrideW = conv_strides[1];

        const index_t ConvDilationH = conv_dilations[0];
        const index_t ConvDilationW = conv_dilations[1];

        const index_t InLeftPadH = in_left_pads[0];
        const index_t InLeftPadW = in_left_pads[1];

        const index_t InRightPadH = in_right_pads[0];
        const index_t InRightPadW = in_right_pads[1];

        // --- transform lambdas -------------------------------------------------

        // pass through: lower diff equals upper diff
        auto f_lower_idx_diff_passthrough = [](index_t& idx_low_diff,
                                               const index_t& idx_up_diff) {
            idx_low_diff = idx_up_diff;
        };

        // pad: same index arithmetic as pass-through (bound check happens later)
        auto f_lower_idx_diff_pad = [](index_t& idx_low_diff, const index_t& idx_up_diff) {
            idx_low_diff = idx_up_diff;
        };

        // embed: linear combination of the two upper diffs
        auto f_lower_idx_diff_embed = [](index_t& idx_low_diff,
                                         const index_t& idx_up_diff_0,
                                         const index_t& idx_up_diff_1,
                                         const index_t coeff0,
                                         const index_t coeff1) {
            idx_low_diff = coeff0 * idx_up_diff_0 + coeff1 * idx_up_diff_1;
        };

        // unmerge: [d0, d1] -> d0 * up_length_1 + d1
        auto f_lower_idx_diff_unmerge = [](index_t& idx_low_diff,
                                           const index_t& idx_up_diff_0,
                                           const index_t& idx_up_diff_1,
                                           const index_t up_length_1) {
            idx_low_diff = up_length_1 * idx_up_diff_0 + idx_up_diff_1;
        };

        // merge, variant 1 (unused in the active path; kept from the prototype):
        // carry/borrow arithmetic on the wrapped value itself
        auto f_lower_idx_diff_merge_v1 = [](index_t& idx_low_diff_0,
                                            index_t& idx_low_diff_1,
                                            index_t& idx_low_diff_2,
                                            const index_t& idx_up_diff,
                                            const index_t& idx_low_old_0,
                                            const index_t& idx_low_old_1,
                                            const index_t& idx_low_old_2,
                                            const index_t& idx_low_diff_const_0,
                                            const index_t& idx_low_diff_const_1,
                                            const index_t& idx_low_diff_const_2,
                                            const index_t& idx_low_bound_0,
                                            const index_t& idx_low_bound_1,
                                            const index_t& idx_low_bound_2) {
            auto f_carry_arithmetic = [](index_t& idx_low_diff,
                                         index_t& carry,
                                         const index_t& idx_low_old,
                                         const index_t& idx_low_diff_const,
                                         const index_t& idx_low_bound) {
                index_t idx_low_tmp = idx_low_old + carry + idx_low_diff_const;

                // negative step: borrow instead of carry
                bool do_borrow      = idx_low_tmp < 0;
                index_t idx_low_new = do_borrow ? idx_low_tmp + idx_low_bound : idx_low_tmp;
                carry               = do_borrow ? -1 : 0;

                idx_low_diff = idx_low_new - idx_low_old;
            };

            index_t carry = 0;
            f_carry_arithmetic(
                idx_low_diff_2, carry, idx_low_old_2, idx_low_diff_const_2, idx_low_bound_2);
            f_carry_arithmetic(
                idx_low_diff_1, carry, idx_low_old_1, idx_low_diff_const_1, idx_low_bound_1);
            idx_low_diff_0 = idx_low_diff_const_0 + carry;
        };

        // merge, variant 2 (active): compares against bound-minus-const so the
        // diff itself is produced directly
        auto f_lower_idx_diff_merge_v2 = [](index_t& idx_low_diff_0,
                                            index_t& idx_low_diff_1,
                                            index_t& idx_low_diff_2,
                                            const index_t& idx_up_diff,
                                            const index_t& idx_low_old_0,
                                            const index_t& idx_low_old_1,
                                            const index_t& idx_low_old_2,
                                            const index_t& idx_low_diff_const_0,
                                            const index_t& idx_low_diff_const_1,
                                            const index_t& idx_low_diff_const_2,
                                            const index_t& idx_low_bound_0,
                                            const index_t& idx_low_bound_1,
                                            const index_t& idx_low_bound_2) {
            auto f_carry_arithmetic = [](index_t& idx_low_diff,
                                         index_t& carry,
                                         const index_t& idx_low_old,
                                         const index_t& idx_low_diff_const,
                                         const index_t& idx_low_bound) {
                index_t idx_low_tmp = idx_low_old + carry;
                index_t idx_low_bound_minus_idx_low_diff_const =
                    idx_low_bound - idx_low_diff_const;

                // positive step
                bool do_carry = idx_low_tmp >= idx_low_bound_minus_idx_low_diff_const;
                idx_low_diff =
                    do_carry ? -idx_low_bound_minus_idx_low_diff_const : idx_low_diff_const;
                idx_low_diff += carry;
                carry = do_carry ? 1 : 0;
            };

            index_t carry = 0;
            f_carry_arithmetic(
                idx_low_diff_2, carry, idx_low_old_2, idx_low_diff_const_2, idx_low_bound_2);
            f_carry_arithmetic(
                idx_low_diff_1, carry, idx_low_old_1, idx_low_diff_const_1, idx_low_bound_1);
            idx_low_diff_0 = idx_low_diff_const_0 + carry;
        };

        // merge, variant 3 (unused in the active path; kept from the prototype):
        // tracks a negative carry
        auto f_lower_idx_diff_merge_v3 = [](index_t& idx_low_diff_0,
                                            index_t& idx_low_diff_1,
                                            index_t& idx_low_diff_2,
                                            const index_t& idx_up_diff,
                                            const index_t& idx_low_old_0,
                                            const index_t& idx_low_old_1,
                                            const index_t& idx_low_old_2,
                                            const index_t& idx_low_diff_const_0,
                                            const index_t& idx_low_diff_const_1,
                                            const index_t& idx_low_diff_const_2,
                                            const index_t& idx_low_bound_0,
                                            const index_t& idx_low_bound_1,
                                            const index_t& idx_low_bound_2) {
            auto f_carry_arithmetic = [](index_t& idx_low_diff,
                                         index_t& negative_carry,
                                         const index_t& idx_low_old,
                                         const index_t& idx_low_diff_const,
                                         const index_t& idx_low_bound) {
                index_t neg_idx_low_tmp = negative_carry - idx_low_old;
                index_t idx_low_diff_const_minus_idx_low_bound =
                    idx_low_diff_const - idx_low_bound;

                bool do_carry = neg_idx_low_tmp <= idx_low_diff_const_minus_idx_low_bound;
                idx_low_diff =
                    do_carry ? idx_low_diff_const_minus_idx_low_bound : idx_low_diff_const;
                idx_low_diff -= negative_carry;
                negative_carry = do_carry ? -1 : 0;
            };

            index_t negative_carry = 0;
            f_carry_arithmetic(idx_low_diff_2,
                               negative_carry,
                               idx_low_old_2,
                               idx_low_diff_const_2,
                               idx_low_bound_2);
            f_carry_arithmetic(idx_low_diff_1,
                               negative_carry,
                               idx_low_old_1,
                               idx_low_diff_const_1,
                               idx_low_bound_1);
            idx_low_diff_0 = idx_low_diff_const_0 - negative_carry;
        };

        // --- state -------------------------------------------------------------

        index_t idx[20];      // multi-index across all transform stages; idx[0] = offset
        index_t idx_diff[20]; // per-iteration diff for each entry of idx
        index_t const_tmp[6]; // per-step constant diffs, loaded from global memory

        // populate constants from global memory so they stay run-time values
        const index_t GemmKPack = p_wei_global[0];

        for(index_t i = 0; i < 6; ++i)
        {
            const_tmp[i] = p_wei_global[i + 1];
        }

        // initialize the multi-index from global memory
        for(index_t i = 0; i < 20; ++i)
        {
            idx[i] = p_wei_global[get_thread_local_1d_id() + i];
        }

        // idx[0] caches the linear offset of the (N, C, Hi, Wi) coordinates
        idx[0] = idx[1] * in_n_c_hi_wi_global_desc.GetStride(0) +
                 idx[2] * in_n_c_hi_wi_global_desc.GetStride(1) +
                 idx[3] * in_n_c_hi_wi_global_desc.GetStride(2) +
                 idx[4] * in_n_c_hi_wi_global_desc.GetStride(3);

        // --- lower the diff through all stages, repeatedly ---------------------
#pragma unroll 1
        for(index_t iter = 0; iter < 100; ++iter)
        {
            for(index_t j = 0; j < 20; ++j)
            {
                idx_diff[j] = 0;
            }

            // step by 8 along GemmKTotal each iteration
            idx_diff[17] = 8;

            // stage 4: Unmerge(GemmKTotal) => GemmK, GemmKPack
            f_lower_idx_diff_unmerge(idx_diff[15], idx_diff[17], idx_diff[18], GemmKPack);

            // stage 4: PassThrough GemmN => GemmN
            f_lower_idx_diff_passthrough(idx_diff[16], idx_diff[19]);

            // stage 3: Merge(C, Y, X) => GemmKTotal
            f_lower_idx_diff_merge_v2(idx_diff[10],
                                      idx_diff[11],
                                      idx_diff[13],
                                      idx_diff[15],
                                      idx[10],
                                      idx[11],
                                      idx[13],
                                      const_tmp[0],
                                      const_tmp[1],
                                      const_tmp[2],
                                      C,
                                      Y,
                                      X);

            // stage 2
            f_lower_idx_diff_passthrough(idx_diff[5], idx_diff[9]);  // PassThrough(N)
            f_lower_idx_diff_passthrough(idx_diff[6], idx_diff[10]); // PassThrough(C)
            // Embed(Hip) => Y, Ho
            f_lower_idx_diff_embed(
                idx_diff[7], idx_diff[11], idx_diff[12], ConvDilationH, ConvStrideH);
            // Embed(Wip) => X, Wo
            f_lower_idx_diff_embed(
                idx_diff[8], idx_diff[13], idx_diff[14], ConvDilationW, ConvStrideW);

            // stage 1
            f_lower_idx_diff_passthrough(idx_diff[1], idx_diff[5]); // PassThrough(N)
            f_lower_idx_diff_passthrough(idx_diff[2], idx_diff[6]); // PassThrough(C)
            f_lower_idx_diff_pad(idx_diff[3], idx_diff[7]);         // Pad(Hi) => Hip
            f_lower_idx_diff_pad(idx_diff[4], idx_diff[8]);         // Pad(Wi) => Wip

            // stage 0: offset diff
            idx_diff[0] = idx_diff[1] * in_n_c_hi_wi_global_desc.GetStride(0) +
                          idx_diff[2] * in_n_c_hi_wi_global_desc.GetStride(1) +
                          idx_diff[3] * in_n_c_hi_wi_global_desc.GetStride(2) +
                          idx_diff[4] * in_n_c_hi_wi_global_desc.GetStride(3);

            // padded path: only the offset, C/Y/X and Hi/Wi entries need updating
            idx[0] += idx_diff[0];

            idx[10] += idx_diff[10];
            idx[11] += idx_diff[11];
            idx[13] += idx_diff[13];

            idx[3] += idx_diff[3];
            idx[4] += idx_diff[4];

            // padding check
            bool is_in_bound = idx[3] >= 0 && idx[3] < Hi && idx[4] >= 0 && idx[4] < Wi;

            float value = 1;

            transfer_data<float,
                          1,
                          AddressSpace::Vgpr,
                          AddressSpace::Global,
                          InMemoryDataOperation::Set,
                          1,
                          1>(&value,
                             0,
                             true,
                             1,
                             p_out_global,
                             idx[0],
                             is_in_bound,
                             out_n_k_ho_wo_global_desc.GetElementSpace());
        }
    }
};
}
// namespace ck
#endif
composable_kernel/include/kernel_algorithm/dummy_static_transform.hpp
View file @
d3405258
...
...
@@ -96,31 +96,26 @@ struct DummyStaticTransform
auto
coord
=
typename
TensorCoordinate
<
decltype
(
in_gemmk_gemmn_global_desc
)
>::
type
(
k0
,
n0
);
if
(
get_block_1d_id
()
<
coord
.
GetOffset
())
#pragma unroll 1
for
(
index_t
k
=
0
;
k
<
100
;
++
k
)
{
for
(
index_t
k
=
0
;
k
<
1
;
++
k
)
{
for
(
index_t
n
=
0
;
n
<
4
;
++
n
)
{
auto
tmp
=
coord
+
Array
<
index_t
,
2
>
{
k
,
n
};
Float
value
=
1
;
transfer_data
<
Float
,
1
,
AddressSpace
::
Vgpr
,
AddressSpace
::
Global
,
InMemoryDataOperation
::
Set
,
1
,
1
>
(
&
value
,
0
,
true
,
1
,
p_in_global
,
tmp
.
GetOffset
(),
tmp
.
IsOffsetValidAssumingUpperIndexIsValid
(),
in_gemmk_gemmn_global_desc
.
GetElementSpace
());
}
}
coord
+=
Array
<
index_t
,
2
>
{
8
,
0
};
Float
value
=
1
;
transfer_data
<
Float
,
1
,
AddressSpace
::
Vgpr
,
AddressSpace
::
Global
,
InMemoryDataOperation
::
Set
,
1
,
1
>
(
&
value
,
0
,
true
,
1
,
p_in_global
,
coord
.
GetOffset
(),
coord
.
IsOffsetValidAssumingUpperIndexIsValid
(),
in_gemmk_gemmn_global_desc
.
GetElementSpace
());
}
}
};
...
...
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
0 → 100644
View file @
d3405258
#ifndef CK_DYNAMIC_MULTI_INDEX_TRANSFORM_HPP
#define CK_DYNAMIC_MULTI_INDEX_TRANSFORM_HPP
#include "common_header.hpp"
namespace
ck
{
// Run-time pass-through transform: a single upper dimension maps 1:1 onto a
// single lower dimension of the same (run-time) length.
struct DynamicPassThrough
{
    using LowerIndex = MultiIndex<1>;
    using UpperIndex = MultiIndex<1>;

    index_t low_length_;

    __host__ __device__ constexpr DynamicPassThrough(index_t low_length)
        : low_length_(low_length)
    {
    }

    __host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<1>{}; }

    __host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<1>{}; }

    // FIX: the original returned Sequence<Length>{} where "Length" is undefined
    // (leftover from the compile-time PassThrough). For a dynamic transform the
    // upper length is the run-time lower length, so this cannot be static.
    __host__ __device__ constexpr auto GetUpperLengths() const
    {
        return Array<index_t, 1>{low_length_};
    }

    // identity mapping
    __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
    {
        return idx_up;
    }

    // identity mapping on diffs; old indices are irrelevant
    __host__ __device__ static constexpr auto
    CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
                            const UpperIndex& /* idx_up_old */,
                            const LowerIndex& /* idx_low_old */)
    {
        return idx_up_diff;
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }

    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
    {
        return true;
    }
};
// Run-time merge transform: NDimLow lower dimensions are merged into one upper
// dimension (row-major), with all lengths known only at run time.
template <index_t NDimLow>
struct DynamicMerge
{
    // FIX: original was missing the ';' after "= NDimLow"
    static constexpr index_t ndim_low_ = NDimLow;
    static constexpr index_t ndim_up_  = 1;

    using LowerIndex = MultiIndex<ndim_low_>;
    // FIX: original had typo "ndum_up_"
    using UpperIndex = MultiIndex<ndim_up_>;

    Array<index_t, NDimLow> low_lengths_;
    index_t up_length_;

    __host__ __device__ static constexpr auto GetNumOfLowerDimension()
    {
        return Number<ndim_low_>{};
    }

    __host__ __device__ static constexpr auto GetNumOfUpperDimension()
    {
        return Number<ndim_up_>{};
    }

    // FIX: original "return Array<index_t, 1> up_length_;" is ill-formed;
    // also cannot be static since up_length_ is a run-time member.
    __host__ __device__ constexpr auto GetUpperLengths() const
    {
        return Array<index_t, 1>{up_length_};
    }

    // Decompose the single upper index into the lower multi-index.
    // FIX: the original was static and used the compile-time "LowerLengths"/
    // "nDimLow" (leftovers from the static Merge); a dynamic transform must
    // read the run-time low_lengths_ member, so the pseudo strides are
    // computed with a run-time scan instead of reverse_inclusive_scan_sequence.
    __host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) const
    {
        LowerIndex idx_low;

        index_t itmp = idx_up[0];

        // pseudo strides: right-to-left product scan of low_lengths_,
        // e.g. lengths [L0, L1, L2] -> strides [L1*L2, L2, 1]
        Array<index_t, NDimLow> pseudo_low_strides;
        pseudo_low_strides(NDimLow - 1) = 1;
        for(index_t i = NDimLow - 1; i > 0; --i)
        {
            pseudo_low_strides(i - 1) = pseudo_low_strides[i] * low_lengths_[i];
        }

        for(index_t i = 0; i < NDimLow - 1; ++i)
        {
            idx_low(i) = itmp / pseudo_low_strides[i];
            itmp -= idx_low[i] * pseudo_low_strides[i];
        }

        idx_low(NDimLow - 1) = itmp / pseudo_low_strides[NDimLow - 1];

        return idx_low;
    }

    // idx_low_diff depends on idx_low_old, so idx_low need to be up-to-date.
    // If idx_up_diff is known at compile-time, many calculations can be
    // optimized away by the compiler.
    // This function assumes idx_low_old is not out-of-bound.
    // FIX: non-static (needs low_lengths_); "nTransform"/"LowerLengths::At"
    // replaced by run-time loops over low_lengths_.
    __host__ __device__ constexpr auto
    CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
                            const UpperIndex& /* idx_up_old */,
                            const LowerIndex& idx_low_old) const
    {
        if(idx_up_diff[0] == 0)
        {
            return make_zero_array<index_t, NDimLow>();
        }
        else
        {
            // CalculateLowerIndex(idx_up_diff) has multiple integer divisions;
            // expensive when idx_up_diff is only known at run-time.
            LowerIndex idx_low_diff_tmp = CalculateLowerIndex(idx_up_diff);

            // find out the last low dimension that changed
            index_t last_changed_low_dim = 0;
            for(index_t i = 0; i < NDimLow; ++i)
            {
                if(idx_low_diff_tmp[i] != 0)
                {
                    last_changed_low_dim = i;
                }
            }

            LowerIndex idx_low_new = idx_low_old + idx_low_diff_tmp;

            if(idx_up_diff[0] > 0)
            {
                // carry check in reversed order, starting from the last digit
                // that changed; the highest dimension is not bound-checked
                bool carry = false;
                for(index_t i = NDimLow - 1; i > 0; --i)
                {
                    if(i <= last_changed_low_dim)
                    {
                        if(carry)
                        {
                            ++idx_low_new(i);
                        }

                        carry = false;

                        if(idx_low_new[i] >= low_lengths_[i])
                        {
                            idx_low_new(i) -= low_lengths_[i];
                            carry = true;
                        }
                    }
                }

                if(carry)
                {
                    ++idx_low_new(0);
                }
            }
            else
            {
                // borrow check in reversed order, starting from the last digit
                // that changed; the highest dimension is not bound-checked
                bool borrow = false;
                for(index_t i = NDimLow - 1; i > 0; --i)
                {
                    if(i <= last_changed_low_dim)
                    {
                        if(borrow)
                        {
                            --idx_low_new(i);
                        }

                        borrow = false;

                        if(idx_low_new[i] < 0)
                        {
                            idx_low_new(i) += low_lengths_[i];
                            borrow = true;
                        }
                    }
                }

                if(borrow)
                {
                    --idx_low_new(0);
                }
            }

            return idx_low_new - idx_low_old;
        }
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return false; }

    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
    {
        return true;
    }
};
}
// namespace ck
#endif
composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp
0 → 100644
View file @
d3405258
#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_HPP
#define CK_DYNAMIC_TENSOR_DESCRIPTOR_HPP
#include "common_header.hpp"
namespace
ck
{
// Descriptor of a "native" (strided) tensor whose lengths and strides are
// only known at run time.
template <index_t NDim>
struct DynamicNativeTensorDescriptor
{
    using Index = MultiIndex<NDim>;

    Array<index_t, NDim> lengths_;
    Array<index_t, NDim> strides_;

    index_t element_size_;  // product of all lengths
    index_t element_space_; // 1 + sum((length-1) * stride)

    template <typename Lengths, typename Strides>
    __host__ __device__ constexpr DynamicNativeTensorDescriptor(const Lengths& lengths,
                                                                const Strides& strides)
        : lengths_(lengths), strides_(strides)
    {
        element_size_ = 1;
        for(index_t idim = 0; idim < NDim; ++idim)
        {
            element_size_ *= lengths_[idim];
        }

        element_space_ = 1;
        for(index_t idim = 0; idim < NDim; ++idim)
        {
            element_space_ += (lengths_[idim] - 1) * strides_[idim];
        }
    }

    __host__ __device__ static constexpr auto GetNumOfDimension() { return NDim; }

    __host__ __device__ constexpr auto GetLength(const index_t& i) const { return lengths_[i]; }

    __host__ __device__ constexpr auto GetStride(const index_t& i) const { return strides_[i]; }

    __host__ __device__ constexpr auto GetLengths() const { return lengths_; }

    __host__ __device__ constexpr auto GetStrides() const { return strides_; }

    __host__ __device__ constexpr auto GetElementSize() const { return element_size_; }

    __host__ __device__ constexpr auto GetElementSpace() const { return element_space_; }

    // linear offset = dot(idx, strides)
    __host__ __device__ constexpr auto CalculateOffset(const Index& idx) const
    {
        index_t offset = 0;

#pragma unroll
        for(index_t idim = 0; idim < NDim; ++idim)
        {
            offset += idx[idim] * strides_[idim];
        }

        return offset;
    }

    // offset delta = dot(idx_diff, strides)
    __host__ __device__ constexpr auto CalculateOffsetDiff(const Index& idx_diff) const
    {
        index_t offset_diff = 0;

#pragma unroll
        for(index_t idim = 0; idim < NDim; ++idim)
        {
            offset_diff += idx_diff[idim] * strides_[idim];
        }

        return offset_diff;
    }

    // true iff every coordinate is within [0, length)
    __host__ __device__ constexpr bool IsUpperIndexValid(const Index& idx) const
    {
        bool is_valid = true;

#pragma unroll
        for(index_t idim = 0; idim < NDim; ++idim)
        {
            is_valid = is_valid && idx[idim] >= 0 && idx[idim] < lengths_[idim];
        }

        return is_valid;
    }
};
#if 0
// Tensor descriptor for "transformed tensor"
// Tensor descriptor for "transformed tensor"
template <typename LowTensorDescriptor,
          typename Transforms,     // Tuple<DynamicMultIndexTransforms,...>
          typename LowDimensions,  // Tuple<Sequence<...>,...>
          typename UpDimensions>   // Tuple<Sequence<...>,...>
struct DynamicTransformedTensorDescriptor
{
    using Type = DynamicTransformedTensorDescriptor;
    __host__ __device__ static constexpr auto GetNumOfLowerDimension()
    {
        // Here, we assume all lower-dimensions are active
        // TODO: sanity-check all lower-dimension are indeed active
        using duplicated_low_active_dims =
            decltype(unpack(lambda_merge_sequences{}, LowDimensions{}));
        using low_active_dims = typename sequence_unique_sort<duplicated_low_active_dims,
                                                              math::less<index_t>,
                                                              math::equal<index_t>>::type;
        return low_active_dims::Size();
    }
    __host__ __device__ static constexpr auto GetNumOfUpperDimension()
    {
        using duplicated_up_active_dims =
            decltype(unpack(lambda_merge_sequences{}, UpDimensions{}));
        using up_active_dims = typename sequence_unique_sort<duplicated_up_active_dims,
                                                             math::less<index_t>,
                                                             math::equal<index_t>>::type;
        return up_active_dims::Size();
    }
    static constexpr index_t ndim_up_ = GetNumOfUpperDimension();
    static constexpr index_t ndim_low_ = GetNumOfLowerDimension();
    static constexpr index_t num_transform_ = Transforms::Size();
    using UpperIndex = MultiIndex<ndim_up_>;
    using LowerIndex = MultiIndex<ndim_low_>;
    const LowTensorDescriptor low_tensor_desc_;
    const Transforms transforms_;
    const LowDimensions low_dims_;
    const UpDimensions up_dims_;
    // FIX: constructor was misnamed "TransformedTensorDescriptor" (the name of
    // the static descriptor this prototype was copied from).
    __host__ __device__ constexpr DynamicTransformedTensorDescriptor(const LowTensorDescriptor& low_tensor_desc,
                                                                     const Transforms& transforms)
        : low_tensor_desc_(low_tensor_desc),
          transforms_(transforms)
    {
    }
    __host__ __device__ static constexpr auto GetNumOfDimension()
    {
        return GetNumOfUpperDimension();
    }
    // FIX: originally returned low_dims_ (the dimension mapping), not the
    // lower tensor descriptor its name and callers expect.
    __host__ __device__ constexpr auto GetLowerTensorDescriptor() const
    {
        return low_tensor_desc_;
    }
    // FIX: "cons" typo corrected to "const".
    // TODO(review): body was empty in the prototype — presumably should gather
    // the upper lengths from transforms_; confirm before enabling this block.
    __host__ __device__ constexpr auto GetUpperLengths() const
    {
    }
    __host__ __device__ constexpr auto GetLengths() const { return GetUpperLengths(); }
    __host__ __device__ constexpr auto GetLength(index_t i) const
    {
        return GetLengths()[i];
    }
    __host__ __device__ constexpr auto GetElementSize() const
    {
        index_t element_size = 1;
        for(index_t i = 0; i < ndim_up_; ++i)
        {
            element_size *= GetLength(i);
        }
        return element_size;
    }
    // FIX: was "lower_tensor_desc_" — the member is named low_tensor_desc_.
    __host__ __device__ constexpr auto GetElementSpace() const
    {
        return low_tensor_desc_.GetElementSpace();
    }
    // TODO: right now return value is not constexpr because use of non-constexpr lambda
    __host__ __device__ constexpr LowerIndex CalculateLowerIndex(const UpperIndex& idx_up) const
    {
        LowerIndex idx_low;
        static_for<0, num_transform_, 1>{}([&](auto itran) {
            constexpr auto tran = Transforms{}.At(itran);
            const auto idx_up_part = pick_array_element(idx_up, UpDimensions{}.At(itran));
            auto idx_low_part = pick_array_element(idx_low, LowDimensions{}.At(itran));
            // this assumes each lower (single) index is only associated with one transformation,
            // which is required for index transformation, and has been checked during constructor
            // of TransformedTensorDescriptor
            idx_low_part = tran.CalculateLowerIndex(to_array(idx_up_part));
        });
        return idx_low;
    }
    // TODO: right now return value is not constexpr because use of non-constepxr lambda
    // FIX: loop bound was "nTransform" (undeclared); use num_transform_.
    __host__ __device__ static constexpr LowerIndex CalculateLowerIndexDiff(
        const UpperIndex& idx_up_diff, const UpperIndex& idx_up_old, const LowerIndex& idx_low_old)
    {
        LowerIndex idx_low_diff;
        static_for<0, num_transform_, 1>{}([&](auto itran) {
            constexpr auto tran = Transforms{}.At(itran);
            const auto idx_up_diff_part =
                pick_array_element(idx_up_diff, UpDimensions{}.At(itran));
            const auto idx_up_old_part = pick_array_element(idx_up_old, UpDimensions{}.At(itran));
            const auto idx_low_old_part =
                pick_array_element(idx_low_old, LowDimensions{}.At(itran));
            auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensions{}.At(itran));
            // this assumes each lower (single) index is associated with only one transformation,
            // which is required for index transformation, and has been checked during constructor
            // of TransformedTensorDescriptor
            idx_low_diff_part = tran.CalculateLowerIndexDiff(
                to_array(idx_up_diff_part), to_array(idx_up_old_part), to_array(idx_low_old_part));
        });
        return idx_low_diff;
    }
    // FIX: was static, but calls the non-static GetLowerTensorDescriptor().
    __host__ __device__ constexpr index_t CalculateOffset(const UpperIndex& idx_up) const
    {
        return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
    }
};
#endif
}
// namespace ck
#endif
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper.hpp
0 → 100644
View file @
d3405258
#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_HPP
#define CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_HPP
#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
namespace
ck
{
// Build a DynamicNativeTensorDescriptor from compile-time Lengths/Strides
// sequence types; the dimension count is taken from Lengths.
template <typename Lengths, typename Strides>
__host__ __device__ constexpr auto
make_dynamic_native_tensor_descriptor(const Lengths& lengths, const Strides& strides)
{
    static_assert(Lengths::GetSize() == Strides::GetSize(), "wrong! Size not the same");

    constexpr index_t ndim = Lengths::GetSize();

    return DynamicNativeTensorDescriptor<ndim>(lengths, strides);
}
// Wrap a lower descriptor with a set of index transforms, producing a
// DynamicTransformedTensorDescriptor. LowDimensions/UpDimensions are
// tag-only parameters carrying the dimension mappings.
// NOTE(review): DynamicTransformedTensorDescriptor must be declared
// elsewhere for this template to instantiate — confirm it is enabled.
template <typename LowTensorDescriptor,
          typename Transforms,
          typename LowDimensions,
          typename UpDimensions>
__host__ __device__ constexpr auto
transform_dynamic_tensor_descriptor(const LowTensorDescriptor& low_tensor_desc,
                                    const Transforms& transforms,
                                    LowDimensions,
                                    UpDimensions)
{
    using TransformedDesc = DynamicTransformedTensorDescriptor<LowTensorDescriptor,
                                                               Transforms,
                                                               LowDimensions,
                                                               UpDimensions>;

    return TransformedDesc(low_tensor_desc, transforms);
}
}
// namespace ck
#endif
composable_kernel/include/utility/amd_llvm_intrinsic.hpp
0 → 100644
View file @
d3405258
#ifndef CK_AMD_LLVM_INTRINSIC_HPP
#define CK_AMD_LLVM_INTRINSIC_HPP
#include "float_type.hpp"
namespace
ck
{
// Maps directly onto the LLVM AMDGPU intrinsic: returns the value of `i`
// held by the first active lane of the wavefront, placing the result in a
// scalar register (useful to keep uniform values out of VGPRs).
__device__ int32_t __llvm_amdgcn_readfirstlane_i32(int32_t i)
    __asm("llvm.amdgcn.readfirstlane");
}
// namespace ck
#endif
composable_kernel/include/utility/common_header.hpp
View file @
d3405258
...
...
@@ -20,6 +20,7 @@
#if CK_USE_AMD_INLINE_ASM
#include "amd_inline_asm.hpp"
#include "amd_llvm_intrinsic.hpp"
#endif
#if CK_USE_AMD_XDLOPS
...
...
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
View file @
d3405258
...
...
@@ -172,7 +172,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
constexpr
index_t
WeiBlockCopySrcDataPerRead_E
=
4
;
constexpr
index_t
WeiBlockCopyDstDataPerWrite_K
=
1
;
#elif
0
#elif
1
// cdata = 64, BlockSize = 256, 128x128x16
constexpr
index_t
BlockSize
=
256
;
...
...
@@ -290,7 +290,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
constexpr
index_t
WeiBlockCopySrcDataPerRead_E
=
2
;
constexpr
index_t
WeiBlockCopyDstDataPerWrite_K
=
1
;
#elif
1
#elif
0
// cdata = 64, BlockSize = 128, 64x128x8
constexpr
index_t
BlockSize
=
128
;
...
...
driver/include/device_dummy_dynamic_transform.hpp
0 → 100644
View file @
d3405258
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "dummy_dynamic_transform.hpp"
// Host-side driver for the DummyDynamicTransform prototype kernel.
//
// Converts the compile-time tensor descriptors (InDesc/WeiDesc/OutDesc tag
// types) into run-time ("dynamic") descriptors, copies the host tensors to
// device memory, launches the kernel nrepeat times per timing round, and
// copies the output tensor back.
//
// The descriptor/stride/dilation/pad parameters are compile-time tag types
// passed by value; only their static Lengths/Strides contents are read here.
template <class T,
          class InDesc,
          class WeiDesc,
          class OutDesc,
          class ConvStrides,
          class ConvDilations,
          class InLeftPads,
          class InRightPads>
void device_dummy_dynamic_transform(InDesc,
                                    const Tensor<T>& in_nchw,
                                    WeiDesc,
                                    const Tensor<T>& wei_kcyx,
                                    OutDesc,
                                    Tensor<T>& out_nkhw,
                                    ConvStrides,
                                    ConvDilations,
                                    InLeftPads,
                                    InRightPads,
                                    ck::index_t nrepeat)
{
    using namespace ck;

    // Map the host half type to the device half type; all other types pass
    // through unchanged.
    // NOTE(review): TDevice is computed but never used in this function.
    using TDevice = typename conditional<is_same<half_float::half, T>::value, half_t, T>::type;

    // Turn each static descriptor into a dynamic one whose lengths/strides
    // live in run-time Arrays.
    const auto in_nchw_desc = make_dynamic_native_tensor_descriptor(to_array(InDesc::GetLengths()),
                                                                    to_array(InDesc::GetStrides()));

    const auto wei_kcyx_desc = make_dynamic_native_tensor_descriptor(to_array(WeiDesc::GetLengths()),
                                                                     to_array(WeiDesc::GetStrides()));

    const auto out_nkhw_desc = make_dynamic_native_tensor_descriptor(to_array(OutDesc::GetLengths()),
                                                                     to_array(OutDesc::GetStrides()));

    const auto conv_strides   = to_array(ConvStrides{});
    const auto conv_dilations = to_array(ConvDilations{});
    const auto in_left_pads   = to_array(InLeftPads{});
    const auto in_right_pads  = to_array(InRightPads{});

    // Allocate device buffers sized to each tensor's element space and copy
    // all three tensors to the device (the output is uploaded too, so the
    // kernel may read-modify-write it).
    std::size_t data_sz = sizeof(T);
    DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
    DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
    DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());

    in_nchw_device_buf.ToDevice(in_nchw.mData.data());
    wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

    // Single workgroup: this exercises the descriptor transforms, not a real
    // convolution workload.
    constexpr index_t BlockSize = 256;
    constexpr index_t GridSize  = 1;

    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

    using dummy_transform = DummyDynamicTransform<BlockSize>;

    // NOTE(review): 5 timing rounds is hard-coded — consider a parameter.
    for(index_t i = 0; i < 5; ++i)
    {
        std::cout << "Start running " << nrepeat << " times..." << std::endl;

        KernelTimer timer;
        // NOTE(review): the timer is started but its result is never read or
        // printed — either report elapsed time or drop the timer.
        timer.Start();

        for(index_t j = 0; j < nrepeat; ++j)
        {
            // Explicitly-instantiated gridwise wrapper; the template argument
            // list must match the runtime argument list below one-for-one.
            // NOTE(review): descriptors are passed in wei/in/out order while
            // the buffers are in in/wei/out order — all three descriptor
            // types are identical here, so verify against the kernel's
            // parameter order.
            launch_kernel(run_gridwise_operation<dummy_transform,
                                                 index_t* const,
                                                 index_t* const,
                                                 float* const,
                                                 const DynamicNativeTensorDescriptor<4>,
                                                 const DynamicNativeTensorDescriptor<4>,
                                                 const DynamicNativeTensorDescriptor<4>,
                                                 const Array<index_t, 2>,
                                                 const Array<index_t, 2>,
                                                 const Array<index_t, 2>,
                                                 const Array<index_t, 2>,
                                                 index_t,
                                                 index_t,
                                                 index_t,
                                                 index_t>,
                          dim3(GridSize),
                          dim3(BlockSize),
                          0,  // dynamic shared memory bytes
                          0,  // stream
                          static_cast<index_t*>(in_nchw_device_buf.GetDeviceBuffer()),
                          static_cast<index_t*>(wei_kcyx_device_buf.GetDeviceBuffer()),
                          static_cast<float*>(out_nkhw_device_buf.GetDeviceBuffer()),
                          wei_kcyx_desc,
                          in_nchw_desc,
                          out_nkhw_desc,
                          conv_strides,
                          conv_dilations,
                          in_left_pads,
                          in_right_pads,
                          // NOTE(review): four magic scalar arguments —
                          // presumably dummy coordinates for the prototype;
                          // document or name them.
                          10,
                          10,
                          10,
                          10);
        }
    }

    out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
driver/include/device_dummy_transform.hpp
→
driver/include/device_dummy_
static_
transform.hpp
View file @
d3405258
...
...
@@ -12,17 +12,17 @@ template <class T,
class
ConvDilations
,
class
InLeftPads
,
class
InRightPads
>
void
device_dummy_transform
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
WeiDesc
,
const
Tensor
<
T
>&
wei_kcyx
,
OutDesc
,
Tensor
<
T
>&
out_nkhw
,
ConvStrides
,
ConvDilations
,
InLeftPads
,
InRightPads
,
ck
::
index_t
nrepeat
)
void
device_dummy_
static_
transform
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
WeiDesc
,
const
Tensor
<
T
>&
wei_kcyx
,
OutDesc
,
Tensor
<
T
>&
out_nkhw
,
ConvStrides
,
ConvDilations
,
InLeftPads
,
InRightPads
,
ck
::
index_t
nrepeat
)
{
using
namespace
ck
;
...
...
driver/src/conv_driver.cpp
View file @
d3405258
...
...
@@ -14,7 +14,8 @@
#include "device_tensor.hpp"
#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_dummy_transform.hpp"
#include "device_dummy_static_transform.hpp"
#include "device_dummy_dynamic_transform.hpp"
int
main
(
int
argc
,
char
*
argv
[])
{
...
...
@@ -200,7 +201,7 @@ int main(int argc, char* argv[])
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif
0
#elif
1
// 3x3, 35x35, stride 2
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
288
;
...
...
@@ -572,18 +573,30 @@ int main(int argc, char* argv[])
LeftPads
{},
RightPads
{},
nrepeat
);
#elif 0
device_dummy_static_transform
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
ConvStrides
{},
ConvDilations
{},
LeftPads
{},
RightPads
{},
nrepeat
);
#elif 1
device_dummy_transform
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
ConvStrides
{},
ConvDilations
{},
LeftPads
{},
RightPads
{},
nrepeat
);
device_dummy_
dynamic_
transform
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
ConvStrides
{},
ConvDilations
{},
LeftPads
{},
RightPads
{},
nrepeat
);
#endif
if
(
do_verification
)
...
...
driver/src/conv_driver.cu
deleted
120000 → 0
View file @
834eb24c
conv_driver
.
cpp
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment