Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
e8d6434d
"vscode:/vscode.git/clone" did not exist on "6c60e430eeb050535aee8854c6ad649b162b62a9"
Commit
e8d6434d
authored
Jun 10, 2022
by
wangshaojie6
Browse files
1. remove comments. 2. add checkvalidity. 3. add gridsize computation
parent
f26fb605
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
37 additions
and
51 deletions
+37
-51
example/20_convnd_bwd_weight_xdl/convnd_bwd_weight_xdl_bf16_splitk.cpp
...nvnd_bwd_weight_xdl/convnd_bwd_weight_xdl_bf16_splitk.cpp
+2
-39
include/ck/tensor_operation/gpu/device/device_convnd_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
...e_convnd_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+11
-3
include/ck/tensor_operation/gpu/device/device_unary_elementwise.hpp
.../tensor_operation/gpu/device/device_unary_elementwise.hpp
+11
-9
include/ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp
...nsor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp
+13
-0
No files found.
example/20_convnd_bwd_weight_xdl/convnd_bwd_weight_xdl_bf16_splitk.cpp
View file @
e8d6434d
...
@@ -325,7 +325,6 @@ int main(int argc, char* argv[])
...
@@ -325,7 +325,6 @@ int main(int argc, char* argv[])
// alloc work space
// alloc work space
size_t
bwd_weight_workspace_size
=
conv
->
GetWorkSpaceSize
(
argument
.
get
());
size_t
bwd_weight_workspace_size
=
conv
->
GetWorkSpaceSize
(
argument
.
get
());
float
conv_ave_time
=
0.
f
;
float
conv_ave_time
=
0.
f
;
float
type_convert_ave_time
=
0.
f
;
DeviceMem
wei_work_space_device_buf
(
bwd_weight_workspace_size
);
DeviceMem
wei_work_space_device_buf
(
bwd_weight_workspace_size
);
wei_work_space_device_buf
.
SetZero
();
wei_work_space_device_buf
.
SetZero
();
...
@@ -341,42 +340,6 @@ int main(int argc, char* argv[])
...
@@ -341,42 +340,6 @@ int main(int argc, char* argv[])
conv_ave_time
=
invoker
->
Run
(
argument
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
conv_ave_time
=
invoker
->
Run
(
argument
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
#if 0
// do type convert
auto type_convert = DeviceUnaryElementwiseTypeConvertInstance{};
auto type_convert_invoker = type_convert.MakeInvokerPointer();
int tensor_size =
std::accumulate(filter_dims.begin(), filter_dims.end(), 1, std::multiplies<int>{});
auto type_convert_argument =
type_convert.MakeArgumentPointer(wei_work_space_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
{tensor_size},
{1},
{1},
UnaryTypeConvert{});
if(!type_convert.IsSupportedArgument(type_convert_argument.get()))
{
std::cout << "wrong! device_type_convert with the specified compilation parameters does "
"not support this convert problem"
<< std::endl;
return 1;
}
type_convert_ave_time =
type_convert_invoker->Run(type_convert_argument.get(), StreamConfig{nullptr, time_kernel});
// type_convert_invoker->Run(type_convert_argument.get(), StreamConfig{nullptr, time_kernel});
#endif
// host code to check if conv give me a right result
// Tensor<AccDataType> wei_k_c_y_x_device_result_fp32(
// ck::utils::conv::get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
// wei_work_space_device_buf.FromDevice(wei_k_c_y_x_device_result_fp32.mData.data());
// const auto type_cvt_functor = [&](AccDataType a) {
// return ck::type_convert<WeiDataType, AccDataType>(a);
// };
// host_elementwise<Tensor<WeiDataType>, Tensor<AccDataType>, decltype(type_cvt_functor)>(
// wei_k_c_y_x_device_result, wei_k_c_y_x_device_result_fp32, filter_dims,
// type_cvt_functor);
std
::
size_t
flop
=
ck
::
utils
::
conv
::
get_flops
(
std
::
size_t
flop
=
ck
::
utils
::
conv
::
get_flops
(
params
.
N_
,
params
.
C_
,
params
.
K_
,
params
.
filter_spatial_lengths_
,
output_spatial_lengths
);
params
.
N_
,
params
.
C_
,
params
.
K_
,
params
.
filter_spatial_lengths_
,
output_spatial_lengths
);
std
::
size_t
num_btype
=
ck
::
utils
::
conv
::
get_btype
<
InDataType
,
WeiDataType
,
OutDataType
>
(
std
::
size_t
num_btype
=
ck
::
utils
::
conv
::
get_btype
<
InDataType
,
WeiDataType
,
OutDataType
>
(
...
@@ -391,8 +354,8 @@ int main(int argc, char* argv[])
...
@@ -391,8 +354,8 @@ int main(int argc, char* argv[])
float
gb_per_sec
=
num_btype
/
1.E6
/
conv_ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
conv_ave_time
;
std
::
cout
<<
"Perf: conv: "
<<
conv_ave_time
<<
" ms,
type_convert: "
<<
type_convert_ave_time
std
::
cout
<<
"Perf: conv: "
<<
conv_ave_time
<<
" ms,
"
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
"
ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
"
GB/s"
<<
std
::
endl
;
<<
" GB/s"
<<
std
::
endl
;
if
(
do_verification
)
if
(
do_verification
)
{
{
...
...
include/ck/tensor_operation/gpu/device/device_convnd_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
View file @
e8d6434d
...
@@ -1051,8 +1051,16 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
...
@@ -1051,8 +1051,16 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
int
tensor_size
=
int
tensor_size
=
std
::
accumulate
(
filter_dims
.
begin
(),
filter_dims
.
end
(),
1
,
std
::
multiplies
<
int
>
{});
std
::
accumulate
(
filter_dims
.
begin
(),
filter_dims
.
end
(),
1
,
std
::
multiplies
<
int
>
{});
GridDesc_M0
a_grid_desc_m0_
=
MakeDescriptor_M0
<
1
>
({
tensor_size
},
{
1
},
240
,
256
);
const
index_t
type_convert_grid_size
=
GridwiseUEltwise
::
CalculateGridSize
(
tensor_size
);
GridDesc_M0
b_grid_desc_m0_
=
MakeDescriptor_M0
<
1
>
({
tensor_size
},
{
1
},
240
,
256
);
GridDesc_M0
a_grid_desc_m0_
=
MakeDescriptor_M0
<
1
>
({
tensor_size
},
{
1
},
type_convert_grid_size
,
256
);
GridDesc_M0
b_grid_desc_m0_
=
MakeDescriptor_M0
<
1
>
({
tensor_size
},
{
1
},
type_convert_grid_size
,
256
);
if
(
!
GridwiseUEltwise
::
CheckValidity
(
a_grid_desc_m0_
,
b_grid_desc_m0_
))
{
throw
std
::
runtime_error
(
"wrong! GridwiseUnaryElementwise_1D has invalid setting"
);
}
// run kernel for type conversion
// run kernel for type conversion
void
*
p_c_grid_tmp_
=
static_cast
<
void
*>
(
arg
.
p_c_grid_
);
void
*
p_c_grid_tmp_
=
static_cast
<
void
*>
(
arg
.
p_c_grid_
);
...
@@ -1061,7 +1069,7 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
...
@@ -1061,7 +1069,7 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
float
elapsed_time
=
float
elapsed_time
=
launch_and_time_kernel
(
stream_config
,
launch_and_time_kernel
(
stream_config
,
kernel
,
kernel
,
dim3
(
240
),
dim3
(
type_convert_grid_size
),
dim3
(
256
),
dim3
(
256
),
0
,
0
,
static_cast
<
AccDataType
*>
(
arg
.
p_c_workspace_grid_
),
static_cast
<
AccDataType
*>
(
arg
.
p_c_workspace_grid_
),
...
...
include/ck/tensor_operation/gpu/device/device_unary_elementwise.hpp
View file @
e8d6434d
...
@@ -59,12 +59,12 @@ struct DeviceUnaryElementwise : public BaseOperator
...
@@ -59,12 +59,12 @@ struct DeviceUnaryElementwise : public BaseOperator
return
PadDescriptor_M0_1d
(
desc
,
gridSize
,
blockSize
);
return
PadDescriptor_M0_1d
(
desc
,
gridSize
,
blockSize
);
}
}
using
GridDesc_M0
=
decltype
(
MakeDescriptor_M0
({
1
,
1
},
{
1
,
1
},
1
,
1
));
using
GridDesc_M0
=
decltype
(
MakeDescriptor_M0
({
1
,
1
},
{
1
,
1
},
1
,
1
));
using
Gridwise
Bin
Eltwise
=
GridwiseUnaryElementwise_1D
<
ADataType
,
using
Gridwise
U
Eltwise
=
GridwiseUnaryElementwise_1D
<
ADataType
,
BDataType
,
BDataType
,
GridDesc_M0
,
GridDesc_M0
,
ElementwiseFunctor
,
ElementwiseFunctor
,
ScalarPerVector
>
;
ScalarPerVector
>
;
struct
Argument
:
public
BaseArgument
struct
Argument
:
public
BaseArgument
{
{
...
@@ -78,9 +78,11 @@ struct DeviceUnaryElementwise : public BaseOperator
...
@@ -78,9 +78,11 @@ struct DeviceUnaryElementwise : public BaseOperator
p_b_
(
p_b
),
p_b_
(
p_b
),
shape_
(
shape
),
shape_
(
shape
),
functor_
(
functor
),
functor_
(
functor
),
blockSize_
(
256
),
blockSize_
(
256
)
// FIXME - Calculate the grid size by number of CU in the future
gridSize_
(
240
)
// FIXME - Calculate the grid size by number of CU in the future
{
{
index_t
tensor_size
=
std
::
accumulate
(
shape
.
begin
(),
shape
.
end
(),
1
,
std
::
multiplies
<
int
>
{});
gridSize_
=
GridwiseUEltwise
::
CalculateGridSize
(
tensor_size
);
a_grid_desc_m0_
=
MakeDescriptor_M0
(
shape
,
stride_a
,
gridSize_
,
blockSize_
);
a_grid_desc_m0_
=
MakeDescriptor_M0
(
shape
,
stride_a
,
gridSize_
,
blockSize_
);
b_grid_desc_m0_
=
MakeDescriptor_M0
(
shape
,
stride_b
,
gridSize_
,
blockSize_
);
b_grid_desc_m0_
=
MakeDescriptor_M0
(
shape
,
stride_b
,
gridSize_
,
blockSize_
);
}
}
...
@@ -99,7 +101,7 @@ struct DeviceUnaryElementwise : public BaseOperator
...
@@ -99,7 +101,7 @@ struct DeviceUnaryElementwise : public BaseOperator
{
{
float
Run
(
const
Argument
&
arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
float
Run
(
const
Argument
&
arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
{
{
const
auto
kernel
=
kernel_unary_elementwise_1d
<
Gridwise
Bin
Eltwise
,
const
auto
kernel
=
kernel_unary_elementwise_1d
<
Gridwise
U
Eltwise
,
ADataType
,
ADataType
,
BDataType
,
BDataType
,
GridDesc_M0
,
GridDesc_M0
,
...
...
include/ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp
View file @
e8d6434d
...
@@ -40,6 +40,19 @@ struct GridwiseUnaryElementwise_1D
...
@@ -40,6 +40,19 @@ struct GridwiseUnaryElementwise_1D
return
make_multi_index
(
global_thread_id
*
ScalarPerVector
);
return
make_multi_index
(
global_thread_id
*
ScalarPerVector
);
}
}
__host__
__device__
static
constexpr
bool
CheckValidity
(
const
GridDesc_M0
a_grid_desc_m0
,
const
GridDesc_M0
b_grid_desc_m0
)
{
return
a_grid_desc_m0
.
GetLength
(
I0
)
==
b_grid_desc_m0
.
GetLength
(
I0
);
}
__host__
__device__
static
constexpr
index_t
CalculateGridSize
(
const
index_t
tensor_size
)
{
const
index_t
grid_size
=
math
::
integer_divide_ceil
(
tensor_size
,
256
*
ScalarPerVector
);
return
grid_size
;
}
__device__
static
void
Run
(
const
ADataType
*
__restrict__
p_a_global
,
__device__
static
void
Run
(
const
ADataType
*
__restrict__
p_a_global
,
BDataType
*
__restrict__
p_b_global
,
BDataType
*
__restrict__
p_b_global
,
const
GridDesc_M0
a_grid_desc_m0
,
const
GridDesc_M0
a_grid_desc_m0
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment