Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
ef326c73
Commit
ef326c73
authored
Nov 19, 2024
by
Alan Turner
Browse files
Merge remote-tracking branch 'origin/develop' into migraphx-update
parents
b7775add
e4dfe4d8
Changes
513
Show whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
1798 additions
and
0 deletions
+1798
-0
example/62_convnd_activ/convscale/run_convnd_fwd_convscale_example.inc
...nvnd_activ/convscale/run_convnd_fwd_convscale_example.inc
+104
-0
example/62_convnd_activ/convscale_add/CMakeLists.txt
example/62_convnd_activ/convscale_add/CMakeLists.txt
+11
-0
example/62_convnd_activ/convscale_add/convnd_fwd_convscale_add_common.hpp
...d_activ/convscale_add/convnd_fwd_convscale_add_common.hpp
+315
-0
example/62_convnd_activ/convscale_add/convnd_fwd_xdl_convscale_add_fp8.cpp
..._activ/convscale_add/convnd_fwd_xdl_convscale_add_fp8.cpp
+87
-0
example/62_convnd_activ/convscale_add/run_convnd_fwd_convscale_add_example.inc
...iv/convscale_add/run_convnd_fwd_convscale_add_example.inc
+104
-0
example/62_convnd_activ/convscale_reduce/CMakeLists.txt
example/62_convnd_activ/convscale_reduce/CMakeLists.txt
+14
-0
example/62_convnd_activ/convscale_reduce/convnd_fwd_convscale_reduce_common.hpp
...v/convscale_reduce/convnd_fwd_convscale_reduce_common.hpp
+502
-0
example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_amax_fp8.cpp
...iv/convscale_reduce/convnd_fwd_xdl_convscale_amax_fp8.cpp
+82
-0
example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_relu_amax_fp8.cpp
...nvscale_reduce/convnd_fwd_xdl_convscale_relu_amax_fp8.cpp
+82
-0
example/62_convnd_activ/convscale_reduce/run_convnd_fwd_example.inc
..._convnd_activ/convscale_reduce/run_convnd_fwd_example.inc
+98
-0
example/62_convnd_activ/convscale_relu/CMakeLists.txt
example/62_convnd_activ/convscale_relu/CMakeLists.txt
+11
-0
example/62_convnd_activ/convscale_relu/convnd_fwd_convscale_relu_common.hpp
...activ/convscale_relu/convnd_fwd_convscale_relu_common.hpp
+302
-0
example/62_convnd_activ/convscale_relu/convnd_fwd_xdl_convscale_relu_fp8.cpp
...ctiv/convscale_relu/convnd_fwd_xdl_convscale_relu_fp8.cpp
+86
-0
No files found.
Too many changes to show.
To preserve performance only
513 of 513+
files are displayed.
Plain diff
Email patch
example/62_convnd_activ/convscale/run_convnd_fwd_convscale_example.inc
0 → 100644
View file @
ef326c73
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
bool
run_convnd_fwd_example
(
int
argc
,
char
*
argv
[])
{
print_helper_msg
();
bool
do_verification
=
true
;
int
init_method
=
1
;
bool
time_kernel
=
false
;
ck
::
utils
::
conv
::
ConvParam
conv_param
{
2
,
1
,
128
,
256
,
192
,
{
3
,
3
},
{
71
,
71
},
{
2
,
2
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
}};
if
(
argc
==
1
)
{
// use default
}
else
if
(
argc
==
4
)
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
}
else
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
const
ck
::
index_t
num_dim_spatial
=
std
::
stoi
(
argv
[
4
]);
conv_param
=
ck
::
utils
::
conv
::
parse_conv_param
(
num_dim_spatial
,
5
,
argv
);
}
// instantiate in and wei element ops, will
// instantiate out_element_op below for every iteration
const
auto
in_element_op
=
InElementOp
{};
const
auto
wei_element_op
=
WeiElementOp
{};
const
auto
run
=
[
&
](
auto
ndim_spatial
,
auto
in_layout
,
auto
wei_layout
,
auto
ds_layout
,
auto
out_layout
)
{
constexpr
ck
::
index_t
ndim_spatial_value
=
ndim_spatial
.
value
;
using
InLayout
=
decltype
(
in_layout
);
using
WeiLayout
=
decltype
(
wei_layout
);
using
DsLayout
=
decltype
(
ds_layout
);
using
OutLayout
=
decltype
(
out_layout
);
const
auto
in_g_n_c_wis_desc
=
ck
::
utils
::
conv
::
make_input_host_tensor_descriptor_g_n_c_wis_packed
<
InLayout
>
(
conv_param
);
const
auto
wei_g_k_c_xs_desc
=
ck
::
utils
::
conv
::
make_weight_host_tensor_descriptor_g_k_c_xs_packed
<
WeiLayout
>
(
conv_param
);
const
auto
out_g_n_k_wos_desc
=
ck
::
utils
::
conv
::
make_output_host_tensor_descriptor_g_n_k_wos_packed
<
OutLayout
>
(
conv_param
);
return
run_grouped_conv_fwd
<
ndim_spatial_value
,
InDataType
,
WeiDataType
,
CShuffleDataType
,
DsDataType
,
OutDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
DeviceGroupedConvNDFwdInstance
<
ndim_spatial_value
,
InLayout
,
WeiLayout
,
DsLayout
,
OutLayout
>>
(
do_verification
,
init_method
,
time_kernel
,
conv_param
,
in_g_n_c_wis_desc
,
wei_g_k_c_xs_desc
,
out_g_n_k_wos_desc
,
in_element_op
,
wei_element_op
);
};
namespace
ctc
=
ck
::
tensor_layout
::
convolution
;
if
(
conv_param
.
num_dim_spatial_
==
1
)
{
return
run
(
ck
::
Number
<
1
>
{},
ctc
::
GNWC
{},
ctc
::
GKXC
{},
ck
::
Tuple
<>
{},
ctc
::
GNWK
{});
}
else
if
(
conv_param
.
num_dim_spatial_
==
2
)
{
return
run
(
ck
::
Number
<
2
>
{},
ctc
::
GNHWC
{},
ctc
::
GKYXC
{},
ck
::
Tuple
<>
{},
ctc
::
GNHWK
{});
}
else
if
(
conv_param
.
num_dim_spatial_
==
3
)
{
return
run
(
ck
::
Number
<
3
>
{},
ctc
::
GNDHWC
{},
ctc
::
GKZYXC
{},
ck
::
Tuple
<>
{},
ctc
::
GNDHWK
{});
}
return
true
;
}
example/62_convnd_activ/convscale_add/CMakeLists.txt
0 → 100644
View file @
ef326c73
list
(
APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942
)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
if
(
gpu IN_LIST gpu_list AND target EQUAL 0
)
add_custom_target
(
example_convnd_activ_xdl_convscale_add
)
add_example_executable
(
example_convnd_fwd_xdl_convscale_add_fp8 convnd_fwd_xdl_convscale_add_fp8.cpp
)
add_example_dependencies
(
example_convnd_activ_xdl_convscale_add example_convnd_fwd_xdl_convscale_add_fp8
)
set
(
target 1
)
endif
()
endforeach
()
example/62_convnd_activ/convscale_add/convnd_fwd_convscale_add_common.hpp
0 → 100644
View file @
ef326c73
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
ConvScaleAdd
=
ck
::
tensor_operation
::
element_wise
::
ConvScaleAdd
;
void
print_helper_msg
()
{
std
::
cout
<<
"arg1: verification (0=no, 1=yes)
\n
"
<<
"arg2: initialization (0=no init, 1=integer value, 2=decimal value)
\n
"
<<
"arg3: time kernel (0=no, 1=yes)
\n
"
<<
ck
::
utils
::
conv
::
get_conv_param_parser_helper_msg
()
<<
std
::
endl
;
}
template
<
typename
DataType
>
inline
__host__
__device__
constexpr
double
get_rtol
()
{
if
constexpr
(
std
::
is_same_v
<
DataType
,
float
>
)
{
return
1e-3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
double
>
)
{
return
1e-6
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
half_t
>
)
{
return
1e-3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bhalf_t
>
)
{
return
5e-2
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int32_t
>
)
{
return
1e-1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int8_t
>
)
{
return
1e-1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
f8_t
>
)
{
return
1e-1
;
// 240 and 224 are acceptable
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bf8_t
>
)
{
return
1.5e-1
;
// 57344 and 49152 are acceptable
}
else
{
return
1e-3
;
}
}
template
<
typename
DataType
>
inline
__host__
__device__
constexpr
double
get_atol
()
{
if
constexpr
(
std
::
is_same_v
<
DataType
,
float
>
)
{
return
1e-3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
double
>
)
{
return
1e-6
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
half_t
>
)
{
return
1e-3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bhalf_t
>
)
{
return
5e-2
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int32_t
>
)
{
return
1e-1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int8_t
>
)
{
return
1e-1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
f8_t
>
)
{
return
16.1
;
// 240 and 224 are acceptable
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bf8_t
>
)
{
return
8192.1
;
// 57344 and 49152 are acceptable
}
else
{
return
1e-3
;
}
}
template
<
ck
::
index_t
NumDimSpatial
,
ck
::
index_t
NumNonSpatialDim
=
3
>
std
::
size_t
GetFlops
(
const
std
::
array
<
ck
::
index_t
,
NumDimSpatial
+
NumNonSpatialDim
>&
output_lengths
,
const
std
::
array
<
ck
::
index_t
,
NumDimSpatial
+
NumNonSpatialDim
>&
weights_lengths
,
const
std
::
size_t
&
ds_size
)
{
// G * N * C * <output spatial lengths product> * (2 * K * <filter spatial lengths product> +
// <number of scale factors>)
ck
::
index_t
G
=
weights_lengths
[
0
];
ck
::
index_t
N
=
output_lengths
[
1
];
ck
::
index_t
K
=
weights_lengths
[
1
];
ck
::
index_t
C
=
weights_lengths
[
2
];
return
G
*
N
*
C
*
std
::
accumulate
(
std
::
next
(
std
::
begin
(
output_lengths
),
NumNonSpatialDim
),
std
::
end
(
output_lengths
),
static_cast
<
std
::
size_t
>
(
1
),
std
::
multiplies
<>
())
*
(
static_cast
<
std
::
size_t
>
(
2
)
*
K
*
std
::
accumulate
(
std
::
next
(
std
::
begin
(
weights_lengths
),
NumNonSpatialDim
),
std
::
end
(
weights_lengths
),
static_cast
<
std
::
size_t
>
(
1
),
std
::
multiplies
<>
())
+
ds_size
);
}
template
<
ck
::
index_t
NDimSpatial
,
typename
InDataType
,
typename
WeiDataType
,
typename
CShuffleDataType
,
typename
DsDataType
,
typename
OutDataType
,
typename
InElementOp
,
typename
WeiElementOp
,
typename
OutElementOp
,
typename
DeviceConvNDFwdInstance
>
bool
run_grouped_conv_fwd
(
bool
do_verification
,
int
init_method
,
bool
time_kernel
,
const
ck
::
utils
::
conv
::
ConvParam
&
conv_param
,
const
HostTensorDescriptor
&
in_g_n_c_wis_desc
,
const
HostTensorDescriptor
&
wei_g_k_c_xs_desc
,
const
HostTensorDescriptor
&
out_g_n_k_wos_desc
,
const
InElementOp
&
in_element_op
,
const
WeiElementOp
&
wei_element_op
)
{
Tensor
<
InDataType
>
in
(
in_g_n_c_wis_desc
);
Tensor
<
WeiDataType
>
wei
(
wei_g_k_c_xs_desc
);
Tensor
<
DsDataType
>
bias
(
out_g_n_k_wos_desc
);
Tensor
<
CShuffleDataType
>
c
(
out_g_n_k_wos_desc
);
Tensor
<
OutDataType
>
out_host
(
out_g_n_k_wos_desc
);
Tensor
<
OutDataType
>
out_device
(
out_g_n_k_wos_desc
);
std
::
cout
<<
"in: "
<<
in
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"wei: "
<<
wei
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"bias: "
<<
bias
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"out: "
<<
out_host
.
mDesc
<<
std
::
endl
;
switch
(
init_method
)
{
case
0
:
break
;
case
1
:
in
.
GenerateTensorValue
(
GeneratorTensor_2
<
InDataType
>
{
-
5
,
5
});
wei
.
GenerateTensorValue
(
GeneratorTensor_2
<
WeiDataType
>
{
-
1
,
1
});
bias
.
GenerateTensorValue
(
GeneratorTensor_2
<
DsDataType
>
{
-
3
,
3
});
break
;
default:
in
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
-
5.0
,
5.0
});
wei
.
GenerateTensorValue
(
GeneratorTensor_3
<
WeiDataType
>
{
-
1.0
,
1.0
});
bias
.
GenerateTensorValue
(
GeneratorTensor_3
<
DsDataType
>
{
-
3.0
,
3.0
});
break
;
}
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
wei_device_buf
(
sizeof
(
WeiDataType
)
*
wei
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
bias_device_buf
(
sizeof
(
DsDataType
)
*
bias
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
out_device
.
mDesc
.
GetElementSpaceSize
());
in_device_buf
.
ToDevice
(
in
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
bias_device_buf
.
ToDevice
(
bias
.
mData
.
data
());
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
d_g_n_k_wos_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
d_g_n_k_wos_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_dilations
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_left_pads
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_right_pads
{};
auto
copy
=
[](
const
auto
&
x
,
auto
&
y
)
{
ck
::
ranges
::
copy
(
x
,
y
.
begin
());
};
copy
(
in_g_n_c_wis_desc
.
GetLengths
(),
a_g_n_c_wis_lengths
);
copy
(
in_g_n_c_wis_desc
.
GetStrides
(),
a_g_n_c_wis_strides
);
copy
(
wei_g_k_c_xs_desc
.
GetLengths
(),
b_g_k_c_xs_lengths
);
copy
(
wei_g_k_c_xs_desc
.
GetStrides
(),
b_g_k_c_xs_strides
);
copy
(
out_g_n_k_wos_desc
.
GetLengths
(),
d_g_n_k_wos_lengths
);
copy
(
out_g_n_k_wos_desc
.
GetStrides
(),
d_g_n_k_wos_strides
);
copy
(
out_g_n_k_wos_desc
.
GetLengths
(),
e_g_n_k_wos_lengths
);
copy
(
out_g_n_k_wos_desc
.
GetStrides
(),
e_g_n_k_wos_strides
);
copy
(
conv_param
.
conv_filter_strides_
,
conv_filter_strides
);
copy
(
conv_param
.
conv_filter_dilations_
,
conv_filter_dilations
);
copy
(
conv_param
.
input_left_pads_
,
input_left_pads
);
copy
(
conv_param
.
input_right_pads_
,
input_right_pads
);
// random scale values
float
scale_in
=
float
(
std
::
rand
())
/
float
(
RAND_MAX
);
float
scale_wei
=
float
(
std
::
rand
())
/
float
(
RAND_MAX
);
float
scale_out
=
float
(
std
::
rand
())
/
float
(
RAND_MAX
);
std
::
cout
<<
std
::
endl
;
std
::
cout
<<
"scale_in: "
<<
scale_in
<<
std
::
endl
;
std
::
cout
<<
"scale_wei: "
<<
scale_wei
<<
std
::
endl
;
std
::
cout
<<
"scale_out: "
<<
scale_out
<<
std
::
endl
;
// initialize out_element_op for each iteration
const
auto
out_element_op
=
OutElementOp
{
scale_in
,
scale_wei
,
scale_out
};
// do Conv
auto
conv
=
DeviceConvNDFwdInstance
{};
auto
invoker
=
conv
.
MakeInvoker
();
auto
argument
=
conv
.
MakeArgument
(
in_device_buf
.
GetDeviceBuffer
(),
wei_device_buf
.
GetDeviceBuffer
(),
std
::
array
<
const
void
*
,
1
>
{
bias_device_buf
.
GetDeviceBuffer
()},
out_device_buf
.
GetDeviceBuffer
(),
a_g_n_c_wis_lengths
,
a_g_n_c_wis_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
std
::
array
<
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
,
1
>
{{
d_g_n_k_wos_lengths
}},
std
::
array
<
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
,
1
>
{{
d_g_n_k_wos_strides
}},
e_g_n_k_wos_lengths
,
e_g_n_k_wos_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
in_element_op
,
wei_element_op
,
out_element_op
);
if
(
!
conv
.
IsSupportedArgument
(
argument
))
{
throw
std
::
runtime_error
(
"wrong! device_conv with the specified compilation parameters does "
"not support this Conv problem"
);
}
float
avg_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
time_kernel
});
std
::
size_t
ds_size
=
3
+
1
;
// 3 element-wise scale multipliers + 1 element-wise add
std
::
size_t
flop
=
GetFlops
<
NDimSpatial
>
(
e_g_n_k_wos_lengths
,
b_g_k_c_xs_lengths
,
ds_size
);
std
::
size_t
num_btype
=
conv_param
.
GetInputByte
<
InDataType
>
()
+
conv_param
.
GetWeightByte
<
WeiDataType
>
()
+
sizeof
(
float
)
+
sizeof
(
float
)
+
sizeof
(
float
)
+
conv_param
.
GetOutputByte
<
OutDataType
>
()
+
conv_param
.
GetOutputByte
<
DsDataType
>
();
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
avg_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
avg_time
;
std
::
cout
<<
"Perf: "
<<
avg_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
conv
.
GetTypeString
()
<<
std
::
endl
;
if
(
do_verification
)
{
auto
ref_conv
=
ck
::
tensor_operation
::
host
::
ReferenceConvFwd
<
NDimSpatial
,
InDataType
,
WeiDataType
,
CShuffleDataType
,
InElementOp
,
WeiElementOp
,
PassThrough
>
();
auto
ref_invoker
=
ref_conv
.
MakeInvoker
();
auto
ref_argument
=
ref_conv
.
MakeArgument
(
in
,
wei
,
c
,
conv_param
.
conv_filter_strides_
,
conv_param
.
conv_filter_dilations_
,
conv_param
.
input_left_pads_
,
conv_param
.
input_right_pads_
,
in_element_op
,
wei_element_op
,
PassThrough
{});
ref_invoker
.
Run
(
ref_argument
);
out_host
.
ForEach
(
[
&
](
auto
&
,
auto
idx
)
{
out_element_op
(
out_host
(
idx
),
c
(
idx
),
bias
(
idx
));
});
out_device_buf
.
FromDevice
(
out_device
.
mData
.
data
());
return
ck
::
utils
::
check_err
(
out_device
,
out_host
,
"Error: incorrect results!"
,
get_rtol
<
OutDataType
>
(),
get_atol
<
OutDataType
>
());
}
return
true
;
}
example/62_convnd_activ/convscale_add/convnd_fwd_xdl_convscale_add_fp8.cpp
0 → 100644
View file @
ef326c73
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/tuple.hpp"
#include "convnd_fwd_convscale_add_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
using
InDataType
=
ck
::
f8_t
;
using
WeiDataType
=
ck
::
f8_t
;
using
AccDataType
=
float
;
using
CShuffleDataType
=
float
;
using
DsDataType
=
float
;
using
OutDataType
=
ck
::
f8_t
;
using
AComputeDataType
=
ck
::
f8_t
;
using
BComputeDataType
=
ck
::
f8_t
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
InElementOp
=
PassThrough
;
using
WeiElementOp
=
PassThrough
;
using
OutElementOp
=
ConvScaleAdd
;
static
constexpr
auto
ConvSpec
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
Default
;
static
constexpr
auto
GemmSpec
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNKPadding
;
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
,
typename
WeiLayout
,
typename
DsLayout
,
typename
OutLayout
>
using
DeviceGroupedConvNDFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
<
NDimSpatial
,
InLayout
,
WeiLayout
,
ck
::
Tuple
<
DsLayout
>
,
OutLayout
,
InDataType
,
WeiDataType
,
AccDataType
,
CShuffleDataType
,
ck
::
Tuple
<
DsDataType
>
,
OutDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
ConvSpec
,
// ConvForwardSpecialization
GemmSpec
,
// GemmSpecialization
1
,
//
256
,
// BlockSize
128
,
// MPerBlock
256
,
// NPerBlock
32
,
// KPerBlock
8
,
// AK1
8
,
// BK1
32
,
// MPerXdl
32
,
// NPerXdl
2
,
// MXdlPerWave
4
,
// NXdlPerWave
S
<
4
,
64
,
1
>
,
// ABlockTransferThreadClusterLengths_AK0_M_AK1
S
<
1
,
0
,
2
>
,
// ABlockTransferThreadClusterArrangeOrder
S
<
1
,
0
,
2
>
,
// ABlockTransferSrcAccessOrder
2
,
// ABlockTransferSrcVectorDim
8
,
// ABlockTransferSrcScalarPerVector
8
,
// ABlockTransferDstScalarPerVector_AK1
1
,
// ABlockLdsExtraM
S
<
4
,
64
,
1
>
,
// BBlockTransferThreadClusterLengths_BK0_N_BK1
S
<
1
,
0
,
2
>
,
// BBlockTransferThreadClusterArrangeOrder
S
<
1
,
0
,
2
>
,
// BBlockTransferSrcAccessOrder
2
,
// BBlockTransferSrcVectorDim
8
,
// BBlockTransferSrcScalarPerVector
8
,
// BBlockTransferDstScalarPerVector_BK1
1
,
// BBlockLdsExtraN
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
,
AComputeDataType
,
BComputeDataType
>
;
#include "run_convnd_fwd_convscale_add_example.inc"
int
main
(
int
argc
,
char
*
argv
[])
{
return
run_convnd_fwd_example
(
argc
,
argv
)
?
0
:
1
;
}
example/62_convnd_activ/convscale_add/run_convnd_fwd_convscale_add_example.inc
0 → 100644
View file @
ef326c73
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
bool
run_convnd_fwd_example
(
int
argc
,
char
*
argv
[])
{
print_helper_msg
();
bool
do_verification
=
true
;
int
init_method
=
1
;
bool
time_kernel
=
false
;
ck
::
utils
::
conv
::
ConvParam
conv_param
{
2
,
1
,
128
,
256
,
192
,
{
3
,
3
},
{
71
,
71
},
{
2
,
2
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
}};
if
(
argc
==
1
)
{
// use default
}
else
if
(
argc
==
4
)
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
}
else
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
const
ck
::
index_t
num_dim_spatial
=
std
::
stoi
(
argv
[
4
]);
conv_param
=
ck
::
utils
::
conv
::
parse_conv_param
(
num_dim_spatial
,
5
,
argv
);
}
// instantiate in and wei element ops, will
// instantiate out_element_op below for every iteration
const
auto
in_element_op
=
InElementOp
{};
const
auto
wei_element_op
=
WeiElementOp
{};
const
auto
run
=
[
&
](
auto
ndim_spatial
,
auto
in_layout
,
auto
wei_layout
,
auto
ds_layout
,
auto
out_layout
)
{
constexpr
ck
::
index_t
ndim_spatial_value
=
ndim_spatial
.
value
;
using
InLayout
=
decltype
(
in_layout
);
using
WeiLayout
=
decltype
(
wei_layout
);
using
DsLayout
=
decltype
(
ds_layout
);
using
OutLayout
=
decltype
(
out_layout
);
const
auto
in_g_n_c_wis_desc
=
ck
::
utils
::
conv
::
make_input_host_tensor_descriptor_g_n_c_wis_packed
<
InLayout
>
(
conv_param
);
const
auto
wei_g_k_c_xs_desc
=
ck
::
utils
::
conv
::
make_weight_host_tensor_descriptor_g_k_c_xs_packed
<
WeiLayout
>
(
conv_param
);
const
auto
out_g_n_k_wos_desc
=
ck
::
utils
::
conv
::
make_output_host_tensor_descriptor_g_n_k_wos_packed
<
OutLayout
>
(
conv_param
);
return
run_grouped_conv_fwd
<
ndim_spatial_value
,
InDataType
,
WeiDataType
,
CShuffleDataType
,
DsDataType
,
OutDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
DeviceGroupedConvNDFwdInstance
<
ndim_spatial_value
,
InLayout
,
WeiLayout
,
DsLayout
,
OutLayout
>>
(
do_verification
,
init_method
,
time_kernel
,
conv_param
,
in_g_n_c_wis_desc
,
wei_g_k_c_xs_desc
,
out_g_n_k_wos_desc
,
in_element_op
,
wei_element_op
);
};
namespace
ctc
=
ck
::
tensor_layout
::
convolution
;
if
(
conv_param
.
num_dim_spatial_
==
1
)
{
return
run
(
ck
::
Number
<
1
>
{},
ctc
::
GNWC
{},
ctc
::
GKXC
{},
ctc
::
GNWK
{},
ctc
::
GNWK
{});
}
else
if
(
conv_param
.
num_dim_spatial_
==
2
)
{
return
run
(
ck
::
Number
<
2
>
{},
ctc
::
GNHWC
{},
ctc
::
GKYXC
{},
ctc
::
GNHWK
{},
ctc
::
GNHWK
{});
}
else
if
(
conv_param
.
num_dim_spatial_
==
3
)
{
return
run
(
ck
::
Number
<
3
>
{},
ctc
::
GNDHWC
{},
ctc
::
GKZYXC
{},
ctc
::
GNDHWK
{},
ctc
::
GNDHWK
{});
}
return
true
;
}
example/62_convnd_activ/convscale_reduce/CMakeLists.txt
0 → 100644
View file @
ef326c73
list
(
APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942
)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
if
(
gpu IN_LIST gpu_list AND target EQUAL 0
)
add_custom_target
(
example_convnd_activ_xdl_convscale_reduce
)
add_example_executable
(
example_convnd_fwd_xdl_convscale_relu_amax_fp8 convnd_fwd_xdl_convscale_relu_amax_fp8.cpp
)
add_example_dependencies
(
example_convnd_activ_xdl_convscale_reduce example_convnd_fwd_xdl_convscale_relu_amax_fp8
)
add_example_executable
(
example_convnd_fwd_xdl_convscale_amax_fp8 convnd_fwd_xdl_convscale_amax_fp8.cpp
)
add_example_dependencies
(
example_convnd_activ_xdl_convscale_reduce example_convnd_fwd_xdl_convscale_amax_fp8
)
set
(
target 1
)
endif
()
endforeach
()
example/62_convnd_activ/convscale_reduce/convnd_fwd_convscale_reduce_common.hpp
0 → 100644
View file @
ef326c73
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp"
#include "ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/utility/type.hpp"
namespace
ew
=
ck
::
tensor_operation
::
element_wise
;
using
PassThrough
=
ew
::
PassThrough
;
using
ConvScaleRelu
=
ew
::
UnaryCombinedOp
<
ew
::
Scale
,
ew
::
Scale
,
ew
::
Relu
>
;
using
ConvScale
=
ew
::
UnaryCombinedOp
<
ew
::
Scale
,
ew
::
Scale
,
PassThrough
>
;
using
UnaryScaleConvert
=
ew
::
Scale
;
void
print_helper_msg
()
{
std
::
cout
<<
"arg1: verification (0=no, 1=yes)
\n
"
<<
"arg2: initialization (0=no init, 1=integer value, 2=decimal value)
\n
"
<<
"arg3: time kernel (0=no, 1=yes)
\n
"
<<
ck
::
utils
::
conv
::
get_conv_param_parser_helper_msg
()
<<
std
::
endl
;
}
template
<
typename
DataType
>
inline
__host__
__device__
constexpr
double
get_rtol
()
{
if
constexpr
(
std
::
is_same_v
<
DataType
,
float
>
)
{
return
1e-3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
double
>
)
{
return
1e-6
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
half_t
>
)
{
return
1e-3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bhalf_t
>
)
{
return
5e-2
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int32_t
>
)
{
return
1e-1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int8_t
>
)
{
return
1e-1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
f8_t
>
)
{
return
1e-1
;
// 240 and 224 are acceptable
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bf8_t
>
)
{
return
1.5e-1
;
// 57344 and 49152 are acceptable
}
else
{
return
1e-3
;
}
}
template
<
typename
DataType
>
inline
__host__
__device__
constexpr
double
get_atol
()
{
if
constexpr
(
std
::
is_same_v
<
DataType
,
float
>
)
{
return
1e-3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
double
>
)
{
return
1e-6
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
half_t
>
)
{
return
1e-3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bhalf_t
>
)
{
return
5e-2
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int32_t
>
)
{
return
1e-1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int8_t
>
)
{
return
1e-1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
f8_t
>
)
{
return
16.1
;
// 240 and 224 are acceptable
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bf8_t
>
)
{
return
8192.1
;
// 57344 and 49152 are acceptable
}
else
{
return
1e-3
;
}
}
template
<
ck
::
index_t
NDimSpatial
,
typename
InDataType
,
typename
WeiDataType
,
typename
ConvOutDataType
,
typename
OutDataType
,
typename
InElementOp
,
typename
WeiElementOp
,
typename
ConvElementOp
,
typename
DeviceConvNDFwdInstance
>
bool
run_grouped_conv_fwd
(
bool
do_verification
,
int
init_method
,
bool
time_kernel
,
const
ck
::
utils
::
conv
::
ConvParam
&
conv_param
,
const
HostTensorDescriptor
&
in_g_n_c_wis_desc
,
const
HostTensorDescriptor
&
wei_g_k_c_xs_desc
,
const
HostTensorDescriptor
&
out_g_n_k_wos_desc
,
const
InElementOp
&
in_element_op
,
const
WeiElementOp
&
wei_element_op
)
{
Tensor
<
InDataType
>
in
(
in_g_n_c_wis_desc
);
Tensor
<
WeiDataType
>
wei
(
wei_g_k_c_xs_desc
);
Tensor
<
ConvOutDataType
>
host_conv
(
out_g_n_k_wos_desc
);
Tensor
<
ConvOutDataType
>
device_conv
(
out_g_n_k_wos_desc
);
Tensor
<
OutDataType
>
out_host
(
out_g_n_k_wos_desc
);
Tensor
<
OutDataType
>
out_device
(
out_g_n_k_wos_desc
);
std
::
cout
<<
"in: "
<<
in
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"wei: "
<<
wei
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"out: "
<<
out_host
.
mDesc
<<
std
::
endl
;
switch
(
init_method
)
{
case
0
:
break
;
case
1
:
in
.
GenerateTensorValue
(
GeneratorTensor_2
<
InDataType
>
{
-
5
,
5
});
wei
.
GenerateTensorValue
(
GeneratorTensor_2
<
WeiDataType
>
{
-
5
,
5
});
break
;
case
11
:
// used for debugging
in
.
GenerateTensorValue
(
GeneratorTensor_1
<
InDataType
>
{
1
});
wei
.
GenerateTensorValue
(
GeneratorTensor_1
<
WeiDataType
>
{
1
});
break
;
default:
in
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
-
1.0
,
1.0
});
wei
.
GenerateTensorValue
(
GeneratorTensor_3
<
WeiDataType
>
{
-
0.5
,
0.5
});
}
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
wei_device_buf
(
sizeof
(
WeiDataType
)
*
wei
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
conv_device_buf
(
conv_param
.
GetOutputByte
<
ConvOutDataType
>
());
DeviceMem
out_device_buf
(
conv_param
.
GetOutputByte
<
OutDataType
>
());
in_device_buf
.
ToDevice
(
in
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_dilations
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_left_pads
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_right_pads
{};
auto
copy
=
[](
const
auto
&
x
,
auto
&
y
)
{
ck
::
ranges
::
copy
(
x
,
y
.
begin
());
};
copy
(
in_g_n_c_wis_desc
.
GetLengths
(),
a_g_n_c_wis_lengths
);
copy
(
in_g_n_c_wis_desc
.
GetStrides
(),
a_g_n_c_wis_strides
);
copy
(
wei_g_k_c_xs_desc
.
GetLengths
(),
b_g_k_c_xs_lengths
);
copy
(
wei_g_k_c_xs_desc
.
GetStrides
(),
b_g_k_c_xs_strides
);
copy
(
out_g_n_k_wos_desc
.
GetLengths
(),
e_g_n_k_wos_lengths
);
copy
(
out_g_n_k_wos_desc
.
GetStrides
(),
e_g_n_k_wos_strides
);
copy
(
conv_param
.
conv_filter_strides_
,
conv_filter_strides
);
copy
(
conv_param
.
conv_filter_dilations_
,
conv_filter_dilations
);
copy
(
conv_param
.
input_left_pads_
,
input_left_pads
);
copy
(
conv_param
.
input_right_pads_
,
input_right_pads
);
// random scale values
float
scale_in
=
float
(
std
::
rand
())
/
float
(
RAND_MAX
);
float
scale_wei
=
float
(
std
::
rand
())
/
float
(
RAND_MAX
);
float
scale_out
=
float
(
std
::
rand
())
/
float
(
RAND_MAX
);
std
::
cout
<<
std
::
endl
;
std
::
cout
<<
"scale_in: "
<<
scale_in
<<
std
::
endl
;
std
::
cout
<<
"scale_wei: "
<<
scale_wei
<<
std
::
endl
;
std
::
cout
<<
"scale_out: "
<<
scale_out
<<
std
::
endl
;
// convolution elementwise operation
auto
conv_element_op
=
ConvElementOp
{
ew
::
Scale
{
scale_in
},
ew
::
Scale
{
scale_wei
},
{}};
auto
scale_convert
=
UnaryScaleConvert
{
scale_out
};
// elementwise scale and type cast
// do Conv
auto
conv
=
DeviceConvNDFwdInstance
{};
auto
conv_invoker
=
conv
.
MakeInvoker
();
auto
conv_argument
=
conv
.
MakeArgument
(
in_device_buf
.
GetDeviceBuffer
(),
wei_device_buf
.
GetDeviceBuffer
(),
std
::
array
<
const
void
*
,
0
>
{},
conv_device_buf
.
GetDeviceBuffer
(),
a_g_n_c_wis_lengths
,
a_g_n_c_wis_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
std
::
array
<
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
,
0
>
{},
std
::
array
<
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
,
0
>
{},
e_g_n_k_wos_lengths
,
e_g_n_k_wos_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
in_element_op
,
wei_element_op
,
conv_element_op
);
if
(
!
conv
.
IsSupportedArgument
(
conv_argument
))
{
throw
std
::
runtime_error
(
"wrong! device_conv with the specified compilation parameters does "
"not support this Conv problem"
);
}
std
::
string
kernels
=
conv
.
GetTypeString
();
float
avg_time
=
conv_invoker
.
Run
(
conv_argument
,
StreamConfig
{
nullptr
,
time_kernel
});
using
DeviceElementwiseScale
=
ck
::
tensor_operation
::
device
::
DeviceElementwiseImpl
<
ck
::
Tuple
<
ConvOutDataType
>
,
// InDataTypeTuple
ck
::
Tuple
<
OutDataType
>
,
// OutDataTypeTuple
UnaryScaleConvert
,
// UnaryScaleConvert
NDimSpatial
+
3
,
// NumDim
256
,
// BlockSize
128
,
// M0PerBlock
128
,
// M1PerBlock
8
,
// M0PerThread
8
,
// M1PerThread
ck
::
Sequence
<
1
,
0
>
,
// ThreadClusterArrangeOrder
ck
::
Sequence
<
8
>
,
// InScalarPerVectorSeq
ck
::
Sequence
<
8
>>
;
// OutScalarPerVectorSeq
auto
device_ew_scale
=
DeviceElementwiseScale
{};
auto
scale_invoker
=
device_ew_scale
.
MakeInvoker
();
auto
scale_argument
=
device_ew_scale
.
MakeArgument
(
e_g_n_k_wos_lengths
,
{
e_g_n_k_wos_strides
},
{
e_g_n_k_wos_strides
},
{
conv_device_buf
.
GetDeviceBuffer
()},
{
out_device_buf
.
GetDeviceBuffer
()},
scale_convert
);
if
(
!
device_ew_scale
.
IsSupportedArgument
(
scale_argument
))
{
throw
std
::
runtime_error
(
"wrong! DeviceElementwiseScale with the specified compilation parameters does "
"not support this problem"
);
}
kernels
+=
std
::
string
(
"
\n\t\t
"
)
+
device_ew_scale
.
GetTypeString
();
avg_time
+=
scale_invoker
.
Run
(
scale_argument
,
StreamConfig
{
nullptr
,
time_kernel
});
constexpr
auto
ReduceOpId
=
ck
::
ReduceTensorOp
::
AMAX
;
using
ReduceOperation
=
typename
ck
::
reduce_binary_operator
<
ReduceOpId
>::
opType
;
using
InElementwiseOperation
=
typename
ck
::
reduce_unary_operator
<
ReduceOpId
,
true
,
true
>::
InElementwiseOperation
;
using
AccElementwiseOperation
=
typename
ck
::
reduce_unary_operator
<
ReduceOpId
,
true
,
true
>::
AccElementwiseOperation
;
using
DeviceReduceInstance
=
ck
::
tensor_operation
::
device
::
DeviceReduceMultiBlock
<
ConvOutDataType
,
ConvOutDataType
,
ConvOutDataType
,
NDimSpatial
+
3
,
NDimSpatial
+
3
,
ReduceOperation
,
InElementwiseOperation
,
AccElementwiseOperation
,
ck
::
InMemoryDataOperationEnum
::
Set
,
true
,
// PropagateNan
false
,
// OutputIndex
false
,
// HaveIndexInputIfOutputIndex
256
,
// BlockSize
4
,
// MThreadClusterSize
64
,
// KThreadClusterSize
1
,
// MThreadSliceSize
1
,
// KThreadSliceSize
1
,
// InSrcVectorDim
1
,
// InSrceVectorSize
1
>
;
// OutDstVectorSize
std
::
vector
<
size_t
>
outLengths
=
{
1
};
Tensor
<
ConvOutDataType
>
amax_host
(
outLengths
);
Tensor
<
ConvOutDataType
>
amax_from_device
(
outLengths
);
auto
amax_host_strides
=
amax_host
.
mDesc
.
GetStrides
();
std
::
array
<
int
,
NDimSpatial
+
3
>
reduce_dims
;
std
::
iota
(
reduce_dims
.
begin
(),
reduce_dims
.
end
(),
0
);
// 0,..., NDimSpatial+3-1
std
::
array
<
ck
::
index_t
,
1
>
reduce_out_lengths
{
1
};
std
::
array
<
ck
::
index_t
,
1
>
reduce_out_strides
{
static_cast
<
ck
::
index_t
>
(
amax_host_strides
[
0
])};
DeviceMem
amax_device
(
sizeof
(
ConvOutDataType
)
*
amax_host
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
index_device
;
InElementwiseOperation
in_elementwise_op
;
AccElementwiseOperation
acc_elementwise_op
;
std
::
tie
(
in_elementwise_op
,
acc_elementwise_op
)
=
ck
::
reduce_unary_operator
<
ReduceOpId
,
true
,
true
>::
GetElementwiseOperator
(
static_cast
<
int32_t
>
(
host_conv
.
mDesc
.
GetElementSize
()));
// Hack convolution output strides for reduction as kernel expects stride 1 for the last
// dimension. It only works because the reduction is done on the whole tensor and result is
// independent of the order of elements.
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
reduction_strides
{};
copy
(
HostTensorDescriptor
(
e_g_n_k_wos_lengths
).
GetStrides
(),
reduction_strides
);
auto
device_reduce
=
DeviceReduceInstance
{};
auto
reduce_invoker
=
device_reduce
.
MakeInvokerPointer
();
auto
reduce_argument
=
device_reduce
.
MakeArgumentPointer
(
e_g_n_k_wos_lengths
,
reduction_strides
,
reduce_out_lengths
,
reduce_out_strides
,
reduce_dims
,
1.0
,
0.0
,
conv_device_buf
.
GetDeviceBuffer
(),
nullptr
,
amax_device
.
GetDeviceBuffer
(),
nullptr
,
in_elementwise_op
,
acc_elementwise_op
);
if
(
!
device_reduce
.
IsSupportedArgument
(
reduce_argument
.
get
()))
{
throw
std
::
runtime_error
(
"wrong! DeviceReduceInstance with the specified compilation parameters does "
"not support this runtime parameters!"
);
};
kernels
+=
std
::
string
(
"
\n\t\t
"
)
+
device_reduce
.
GetTypeString
();
float
reduce_time
=
reduce_invoker
->
Run
(
reduce_argument
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
if
(
time_kernel
)
std
::
cout
<<
"
\n
Reduce time: "
<<
reduce_time
<<
" ms"
<<
std
::
endl
;
avg_time
+=
reduce_time
;
std
::
size_t
flop
=
conv_param
.
GetFlops
();
// convolution FLOPs
auto
conv_out_elems
=
host_conv
.
GetElementSize
();
// number of elements in conv result tensor
// 3 element-wise scale multipliers + 1 AMAX
std
::
size_t
elementwise_ops
=
3
+
1
;
if
constexpr
(
ck
::
is_same_v
<
ConvElementOp
,
ConvScaleRelu
>
)
{
elementwise_ops
+=
1
;
// +1 element-wise relu
}
flop
+=
elementwise_ops
*
conv_out_elems
;
// convolution + elementwise scaling (in + wei + output byte count)
std
::
size_t
num_btype
=
conv_param
.
GetByte
<
InDataType
,
WeiDataType
,
ConvOutDataType
>
();
num_btype
+=
sizeof
(
float
)
+
sizeof
(
float
);
// + 2 scales
// elementwise scaling + F8 conversion
num_btype
+=
conv_param
.
GetOutputByte
<
ConvOutDataType
>
()
+
sizeof
(
float
)
+
conv_param
.
GetOutputByte
<
OutDataType
>
();
// AMAX
num_btype
+=
conv_param
.
GetOutputByte
<
ConvOutDataType
>
()
+
sizeof
(
float
);
if
(
time_kernel
)
{
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
avg_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
avg_time
;
std
::
cout
<<
"Perf: "
<<
avg_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
std
::
endl
;
}
std
::
cout
<<
"
\n
Kernels: "
<<
kernels
<<
std
::
endl
;
if
(
do_verification
)
{
auto
ref_conv
=
ck
::
tensor_operation
::
host
::
ReferenceConvFwd
<
NDimSpatial
,
InDataType
,
WeiDataType
,
ConvOutDataType
,
InElementOp
,
WeiElementOp
,
ConvElementOp
>
();
auto
ref_invoker
=
ref_conv
.
MakeInvoker
();
auto
ref_argument
=
ref_conv
.
MakeArgument
(
in
,
wei
,
host_conv
,
conv_param
.
conv_filter_strides_
,
conv_param
.
conv_filter_dilations_
,
conv_param
.
input_left_pads_
,
conv_param
.
input_right_pads_
,
in_element_op
,
wei_element_op
,
conv_element_op
);
ref_invoker
.
Run
(
ref_argument
);
conv_device_buf
.
FromDevice
(
device_conv
.
mData
.
data
());
out_device_buf
.
FromDevice
(
out_device
.
mData
.
data
());
out_host
.
ForEach
([
&
](
auto
&
,
auto
idx
)
{
scale_convert
(
out_host
(
idx
),
host_conv
(
idx
));
});
std
::
cout
<<
"
\n
Comparing output to reference: "
<<
std
::
endl
;
auto
tight_tol_check
=
ck
::
utils
::
check_err
(
out_device
,
out_host
,
"Error: "
);
if
(
!
tight_tol_check
)
{
std
::
cout
<<
"
\n\t
Recompare applying tolerances...
\n
"
;
std
::
cout
<<
"
\t\t
rtol = "
<<
get_rtol
<
OutDataType
>
()
<<
std
::
endl
;
std
::
cout
<<
"
\t\t
atol = "
<<
get_atol
<
OutDataType
>
()
<<
std
::
endl
;
auto
loose_tol_check
=
ck
::
utils
::
check_err
(
out_device
,
out_host
,
"Error: incorrect convolution results!"
,
get_rtol
<
OutDataType
>
(),
get_atol
<
OutDataType
>
());
if
(
!
loose_tol_check
)
{
return
false
;
}
}
std
::
cout
<<
"Success!"
<<
std
::
endl
;
/// Verify AMAX
using
RefReduceInstance
=
ck
::
tensor_operation
::
host
::
ReferenceReduce
<
ConvOutDataType
,
ConvOutDataType
,
ConvOutDataType
,
NDimSpatial
+
3
,
NDimSpatial
+
3
,
ReduceOperation
,
InElementwiseOperation
,
AccElementwiseOperation
,
true
,
false
>
;
auto
ref_reduce
=
RefReduceInstance
{};
auto
ref_reduce_invoker
=
ref_reduce
.
MakeInvokerPointer
();
auto
ref_reduce_argument
=
ref_reduce
.
MakeArgumentPointer
(
e_g_n_k_wos_lengths
,
e_g_n_k_wos_strides
,
reduce_out_lengths
,
reduce_out_strides
,
reduce_dims
,
1.0
,
0.0
,
host_conv
.
mData
.
data
(),
nullptr
,
amax_host
.
mData
.
data
(),
nullptr
,
in_elementwise_op
,
acc_elementwise_op
);
if
(
!
ref_reduce
.
IsSupportedArgument
(
ref_reduce_argument
.
get
()))
{
throw
std
::
runtime_error
(
"wrong! RefReduceInstance with the specified compilation parameters does "
"not support this runtime parameters!"
);
};
ref_reduce_invoker
->
Run
(
ref_reduce_argument
.
get
());
amax_device
.
FromDevice
(
amax_from_device
.
mData
.
data
());
std
::
cout
<<
"
\n
amax: "
<<
amax_from_device
.
mData
[
0
]
<<
std
::
endl
;
std
::
cout
<<
"amax_ref: "
<<
amax_host
.
mData
[
0
]
<<
std
::
endl
;
return
ck
::
utils
::
check_err
(
amax_from_device
,
amax_host
,
"Error: incorrect AMAX results!"
);
}
return
true
;
}
example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_amax_fp8.cpp
0 → 100644
View file @
ef326c73
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "convnd_fwd_convscale_reduce_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
using
InDataType
=
ck
::
f8_t
;
using
WeiDataType
=
ck
::
f8_t
;
using
AccDataType
=
float
;
using
CShuffleDataType
=
float
;
using
ConvOutDataType
=
float
;
// data type of convolution result
using
OutDataType
=
ck
::
f8_t
;
// data type of final result
using
AComputeDataType
=
ck
::
f8_t
;
using
BComputeDataType
=
ck
::
f8_t
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
InElementOp
=
PassThrough
;
using
WeiElementOp
=
PassThrough
;
using
OutElementOp
=
ConvScale
;
static
constexpr
auto
ConvSpec
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
Default
;
static
constexpr
auto
GemmSpec
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNKPadding
;
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
,
typename
WeiLayout
,
typename
OutLayout
>
using
DeviceGroupedConvNDFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
<
NDimSpatial
,
InLayout
,
WeiLayout
,
ck
::
Tuple
<>
,
OutLayout
,
InDataType
,
WeiDataType
,
AccDataType
,
CShuffleDataType
,
ck
::
Tuple
<>
,
ConvOutDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
ConvSpec
,
// ConvForwardSpecialization
GemmSpec
,
// GemmSpecialization
1
,
//
256
,
// BlockSize
128
,
// MPerBlock
256
,
// NPerBlock
32
,
// KPerBlock
8
,
// AK1
8
,
// BK1
32
,
// MPerXdl
32
,
// NPerXdl
2
,
// MXdlPerWave
4
,
// NXdlPerWave
S
<
4
,
64
,
1
>
,
// ABlockTransferThreadClusterLengths_AK0_M_AK1
S
<
1
,
0
,
2
>
,
// ABlockTransferThreadClusterArrangeOrder
S
<
1
,
0
,
2
>
,
// ABlockTransferSrcAccessOrder
2
,
// ABlockTransferSrcVectorDim
8
,
// ABlockTransferSrcScalarPerVector
8
,
// ABlockTransferDstScalarPerVector_AK1
1
,
// ABlockLdsExtraM
S
<
4
,
64
,
1
>
,
// BBlockTransferThreadClusterLengths_BK0_N_BK1
S
<
1
,
0
,
2
>
,
// BBlockTransferThreadClusterArrangeOrder
S
<
1
,
0
,
2
>
,
// BBlockTransferSrcAccessOrder
2
,
// BBlockTransferSrcVectorDim
8
,
// BBlockTransferSrcScalarPerVector
8
,
// BBlockTransferDstScalarPerVector_BK1
1
,
// BBlockLdsExtraN
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
,
AComputeDataType
,
BComputeDataType
>
;
#include "run_convnd_fwd_example.inc"
int
main
(
int
argc
,
char
*
argv
[])
{
return
run_convnd_fwd_example
(
argc
,
argv
)
?
0
:
1
;
}
example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_relu_amax_fp8.cpp
0 → 100644
View file @
ef326c73
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "convnd_fwd_convscale_reduce_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
using
InDataType
=
ck
::
f8_t
;
using
WeiDataType
=
ck
::
f8_t
;
using
AccDataType
=
float
;
using
CShuffleDataType
=
float
;
using
ConvOutDataType
=
float
;
// data type of convolution result
using
OutDataType
=
ck
::
f8_t
;
// data type of final result
using
AComputeDataType
=
ck
::
f8_t
;
using
BComputeDataType
=
ck
::
f8_t
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
InElementOp
=
PassThrough
;
using
WeiElementOp
=
PassThrough
;
using
OutElementOp
=
ConvScaleRelu
;
static
constexpr
auto
ConvSpec
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
Default
;
static
constexpr
auto
GemmSpec
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNKPadding
;
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
,
typename
WeiLayout
,
typename
OutLayout
>
using
DeviceGroupedConvNDFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
<
NDimSpatial
,
InLayout
,
WeiLayout
,
ck
::
Tuple
<>
,
OutLayout
,
InDataType
,
WeiDataType
,
AccDataType
,
CShuffleDataType
,
ck
::
Tuple
<>
,
ConvOutDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
ConvSpec
,
// ConvForwardSpecialization
GemmSpec
,
// GemmSpecialization
1
,
//
256
,
// BlockSize
128
,
// MPerBlock
256
,
// NPerBlock
32
,
// KPerBlock
8
,
// AK1
8
,
// BK1
32
,
// MPerXdl
32
,
// NPerXdl
2
,
// MXdlPerWave
4
,
// NXdlPerWave
S
<
4
,
64
,
1
>
,
// ABlockTransferThreadClusterLengths_AK0_M_AK1
S
<
1
,
0
,
2
>
,
// ABlockTransferThreadClusterArrangeOrder
S
<
1
,
0
,
2
>
,
// ABlockTransferSrcAccessOrder
2
,
// ABlockTransferSrcVectorDim
8
,
// ABlockTransferSrcScalarPerVector
8
,
// ABlockTransferDstScalarPerVector_AK1
1
,
// ABlockLdsExtraM
S
<
4
,
64
,
1
>
,
// BBlockTransferThreadClusterLengths_BK0_N_BK1
S
<
1
,
0
,
2
>
,
// BBlockTransferThreadClusterArrangeOrder
S
<
1
,
0
,
2
>
,
// BBlockTransferSrcAccessOrder
2
,
// BBlockTransferSrcVectorDim
8
,
// BBlockTransferSrcScalarPerVector
8
,
// BBlockTransferDstScalarPerVector_BK1
1
,
// BBlockLdsExtraN
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
,
AComputeDataType
,
BComputeDataType
>
;
#include "run_convnd_fwd_example.inc"
int
main
(
int
argc
,
char
*
argv
[])
{
return
run_convnd_fwd_example
(
argc
,
argv
)
?
0
:
1
;
}
example/62_convnd_activ/convscale_reduce/run_convnd_fwd_example.inc
0 → 100644
View file @
ef326c73
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
bool
run_convnd_fwd_example
(
int
argc
,
char
*
argv
[])
{
print_helper_msg
();
bool
do_verification
=
true
;
int
init_method
=
1
;
bool
time_kernel
=
false
;
ck
::
utils
::
conv
::
ConvParam
conv_param
{
2
,
1
,
128
,
256
,
192
,
{
3
,
3
},
{
71
,
71
},
{
2
,
2
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
}};
if
(
argc
==
1
)
{
// use default
}
else
if
(
argc
==
4
)
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
}
else
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
const
ck
::
index_t
num_dim_spatial
=
std
::
stoi
(
argv
[
4
]);
conv_param
=
ck
::
utils
::
conv
::
parse_conv_param
(
num_dim_spatial
,
5
,
argv
);
}
// instantiate in and wei element ops, will
// instantiate out_element_op below for every iteration
const
auto
in_element_op
=
InElementOp
{};
const
auto
wei_element_op
=
WeiElementOp
{};
const
auto
run
=
[
&
](
auto
ndim_spatial
,
auto
in_layout
,
auto
wei_layout
,
auto
out_layout
)
{
constexpr
ck
::
index_t
ndim_spatial_value
=
ndim_spatial
.
value
;
using
InLayout
=
decltype
(
in_layout
);
using
WeiLayout
=
decltype
(
wei_layout
);
using
OutLayout
=
decltype
(
out_layout
);
const
auto
in_g_n_c_wis_desc
=
ck
::
utils
::
conv
::
make_input_host_tensor_descriptor_g_n_c_wis_packed
<
InLayout
>
(
conv_param
);
const
auto
wei_g_k_c_xs_desc
=
ck
::
utils
::
conv
::
make_weight_host_tensor_descriptor_g_k_c_xs_packed
<
WeiLayout
>
(
conv_param
);
const
auto
out_g_n_k_wos_desc
=
ck
::
utils
::
conv
::
make_output_host_tensor_descriptor_g_n_k_wos_packed
<
OutLayout
>
(
conv_param
);
return
run_grouped_conv_fwd
<
ndim_spatial_value
,
InDataType
,
WeiDataType
,
ConvOutDataType
,
OutDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
DeviceGroupedConvNDFwdInstance
<
ndim_spatial_value
,
InLayout
,
WeiLayout
,
OutLayout
>>
(
do_verification
,
init_method
,
time_kernel
,
conv_param
,
in_g_n_c_wis_desc
,
wei_g_k_c_xs_desc
,
out_g_n_k_wos_desc
,
in_element_op
,
wei_element_op
);
};
namespace
ctc
=
ck
::
tensor_layout
::
convolution
;
if
(
conv_param
.
num_dim_spatial_
==
1
)
{
return
run
(
ck
::
Number
<
1
>
{},
ctc
::
GNWC
{},
ctc
::
GKXC
{},
ctc
::
GNWK
{});
}
else
if
(
conv_param
.
num_dim_spatial_
==
2
)
{
return
run
(
ck
::
Number
<
2
>
{},
ctc
::
GNHWC
{},
ctc
::
GKYXC
{},
ctc
::
GNHWK
{});
}
else
if
(
conv_param
.
num_dim_spatial_
==
3
)
{
return
run
(
ck
::
Number
<
3
>
{},
ctc
::
GNDHWC
{},
ctc
::
GKZYXC
{},
ctc
::
GNDHWK
{});
}
return
true
;
}
example/62_convnd_activ/convscale_relu/CMakeLists.txt
0 → 100644
View file @
ef326c73
list
(
APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942
)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
if
(
gpu IN_LIST gpu_list AND target EQUAL 0
)
add_custom_target
(
example_convnd_activ_xdl_convscale_relu
)
add_example_executable
(
example_convnd_fwd_xdl_convscale_relu_fp8 convnd_fwd_xdl_convscale_relu_fp8.cpp
)
add_example_dependencies
(
example_convnd_activ_xdl_convscale_relu example_convnd_fwd_xdl_convscale_relu_fp8
)
set
(
target 1
)
endif
()
endforeach
()
example/62_convnd_activ/convscale_relu/convnd_fwd_convscale_relu_common.hpp
0 → 100644
View file @
ef326c73
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
ConvScaleRelu
=
ck
::
tensor_operation
::
element_wise
::
ConvScaleRelu
;
void
print_helper_msg
()
{
std
::
cout
<<
"arg1: verification (0=no, 1=yes)
\n
"
<<
"arg2: initialization (0=no init, 1=integer value, 2=decimal value)
\n
"
<<
"arg3: time kernel (0=no, 1=yes)
\n
"
<<
ck
::
utils
::
conv
::
get_conv_param_parser_helper_msg
()
<<
std
::
endl
;
}
template
<
typename
DataType
>
inline
__host__
__device__
constexpr
double
get_rtol
()
{
if
constexpr
(
std
::
is_same_v
<
DataType
,
float
>
)
{
return
1e-3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
double
>
)
{
return
1e-6
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
half_t
>
)
{
return
1e-3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bhalf_t
>
)
{
return
5e-2
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int32_t
>
)
{
return
1e-1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int8_t
>
)
{
return
1e-1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
f8_t
>
)
{
return
1e-1
;
// 240 and 224 are acceptable
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bf8_t
>
)
{
return
1.5e-1
;
// 57344 and 49152 are acceptable
}
else
{
return
1e-3
;
}
}
template
<
typename
DataType
>
inline
__host__
__device__
constexpr
double
get_atol
()
{
if
constexpr
(
std
::
is_same_v
<
DataType
,
float
>
)
{
return
1e-3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
double
>
)
{
return
1e-6
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
half_t
>
)
{
return
1e-3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bhalf_t
>
)
{
return
5e-2
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int32_t
>
)
{
return
1e-1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int8_t
>
)
{
return
1e-1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
f8_t
>
)
{
return
16.1
;
// 240 and 224 are acceptable
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bf8_t
>
)
{
return
8192.1
;
// 57344 and 49152 are acceptable
}
else
{
return
1e-3
;
}
}
template
<
ck
::
index_t
NumDimSpatial
,
ck
::
index_t
NumNonSpatialDim
=
3
>
std
::
size_t
GetFlops
(
const
std
::
array
<
ck
::
index_t
,
NumDimSpatial
+
NumNonSpatialDim
>&
output_lengths
,
const
std
::
array
<
ck
::
index_t
,
NumDimSpatial
+
NumNonSpatialDim
>&
weights_lengths
,
const
std
::
size_t
&
ds_size
)
{
// G * N * C * <output spatial lengths product> * (2 * K * <filter spatial lengths product> +
// <number of scale factors>)
ck
::
index_t
G
=
weights_lengths
[
0
];
ck
::
index_t
N
=
output_lengths
[
1
];
ck
::
index_t
K
=
weights_lengths
[
1
];
ck
::
index_t
C
=
weights_lengths
[
2
];
return
G
*
N
*
C
*
std
::
accumulate
(
std
::
next
(
std
::
begin
(
output_lengths
),
NumNonSpatialDim
),
std
::
end
(
output_lengths
),
static_cast
<
std
::
size_t
>
(
1
),
std
::
multiplies
<>
())
*
(
static_cast
<
std
::
size_t
>
(
2
)
*
K
*
std
::
accumulate
(
std
::
next
(
std
::
begin
(
weights_lengths
),
NumNonSpatialDim
),
std
::
end
(
weights_lengths
),
static_cast
<
std
::
size_t
>
(
1
),
std
::
multiplies
<>
())
+
ds_size
);
}
template
<
ck
::
index_t
NDimSpatial
,
typename
InDataType
,
typename
WeiDataType
,
typename
CShuffleDataType
,
typename
DsDataType
,
typename
OutDataType
,
typename
InElementOp
,
typename
WeiElementOp
,
typename
OutElementOp
,
typename
DeviceConvNDFwdInstance
>
bool
run_grouped_conv_fwd
(
bool
do_verification
,
int
init_method
,
bool
time_kernel
,
const
ck
::
utils
::
conv
::
ConvParam
&
conv_param
,
const
HostTensorDescriptor
&
in_g_n_c_wis_desc
,
const
HostTensorDescriptor
&
wei_g_k_c_xs_desc
,
const
HostTensorDescriptor
&
out_g_n_k_wos_desc
,
const
InElementOp
&
in_element_op
,
const
WeiElementOp
&
wei_element_op
)
{
Tensor
<
InDataType
>
in
(
in_g_n_c_wis_desc
);
Tensor
<
WeiDataType
>
wei
(
wei_g_k_c_xs_desc
);
Tensor
<
CShuffleDataType
>
c
(
out_g_n_k_wos_desc
);
Tensor
<
OutDataType
>
out_host
(
out_g_n_k_wos_desc
);
Tensor
<
OutDataType
>
out_device
(
out_g_n_k_wos_desc
);
std
::
cout
<<
"in: "
<<
in
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"wei: "
<<
wei
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"out: "
<<
out_host
.
mDesc
<<
std
::
endl
;
switch
(
init_method
)
{
case
0
:
break
;
case
1
:
in
.
GenerateTensorValue
(
GeneratorTensor_2
<
InDataType
>
{
-
5
,
5
});
wei
.
GenerateTensorValue
(
GeneratorTensor_2
<
WeiDataType
>
{
-
5
,
5
});
break
;
default:
in
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
-
1.0
,
1.0
});
wei
.
GenerateTensorValue
(
GeneratorTensor_3
<
WeiDataType
>
{
-
0.5
,
0.5
});
}
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
wei_device_buf
(
sizeof
(
WeiDataType
)
*
wei
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
out_device
.
mDesc
.
GetElementSpaceSize
());
in_device_buf
.
ToDevice
(
in
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_dilations
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_left_pads
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_right_pads
{};
auto
copy
=
[](
const
auto
&
x
,
auto
&
y
)
{
ck
::
ranges
::
copy
(
x
,
y
.
begin
());
};
copy
(
in_g_n_c_wis_desc
.
GetLengths
(),
a_g_n_c_wis_lengths
);
copy
(
in_g_n_c_wis_desc
.
GetStrides
(),
a_g_n_c_wis_strides
);
copy
(
wei_g_k_c_xs_desc
.
GetLengths
(),
b_g_k_c_xs_lengths
);
copy
(
wei_g_k_c_xs_desc
.
GetStrides
(),
b_g_k_c_xs_strides
);
copy
(
out_g_n_k_wos_desc
.
GetLengths
(),
e_g_n_k_wos_lengths
);
copy
(
out_g_n_k_wos_desc
.
GetStrides
(),
e_g_n_k_wos_strides
);
copy
(
conv_param
.
conv_filter_strides_
,
conv_filter_strides
);
copy
(
conv_param
.
conv_filter_dilations_
,
conv_filter_dilations
);
copy
(
conv_param
.
input_left_pads_
,
input_left_pads
);
copy
(
conv_param
.
input_right_pads_
,
input_right_pads
);
// random scale values
float
scale_in
=
float
(
std
::
rand
())
/
float
(
RAND_MAX
);
float
scale_wei
=
float
(
std
::
rand
())
/
float
(
RAND_MAX
);
float
scale_out
=
float
(
std
::
rand
())
/
float
(
RAND_MAX
);
std
::
cout
<<
std
::
endl
;
std
::
cout
<<
"scale_in: "
<<
scale_in
<<
std
::
endl
;
std
::
cout
<<
"scale_wei: "
<<
scale_wei
<<
std
::
endl
;
std
::
cout
<<
"scale_out: "
<<
scale_out
<<
std
::
endl
;
// initialize out_element_op for each iteration
const
auto
out_element_op
=
OutElementOp
{
scale_in
,
scale_wei
,
scale_out
};
// do Conv
auto
conv
=
DeviceConvNDFwdInstance
{};
auto
invoker
=
conv
.
MakeInvoker
();
auto
argument
=
conv
.
MakeArgument
(
in_device_buf
.
GetDeviceBuffer
(),
wei_device_buf
.
GetDeviceBuffer
(),
std
::
array
<
const
void
*
,
0
>
{},
out_device_buf
.
GetDeviceBuffer
(),
a_g_n_c_wis_lengths
,
a_g_n_c_wis_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
std
::
array
<
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
,
0
>
{},
std
::
array
<
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
,
0
>
{},
e_g_n_k_wos_lengths
,
e_g_n_k_wos_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
in_element_op
,
wei_element_op
,
out_element_op
);
if
(
!
conv
.
IsSupportedArgument
(
argument
))
{
throw
std
::
runtime_error
(
"wrong! device_conv with the specified compilation parameters does "
"not support this Conv problem"
);
}
float
avg_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
time_kernel
});
std
::
size_t
ds_size
=
3
+
1
;
// 3 element-wise scale multipliers + 1 element-wise relu
std
::
size_t
flop
=
GetFlops
<
NDimSpatial
>
(
e_g_n_k_wos_lengths
,
b_g_k_c_xs_lengths
,
ds_size
);
std
::
size_t
num_btype
=
conv_param
.
GetInputByte
<
InDataType
>
()
+
conv_param
.
GetWeightByte
<
WeiDataType
>
()
+
sizeof
(
float
)
+
sizeof
(
float
)
+
sizeof
(
float
)
+
conv_param
.
GetOutputByte
<
OutDataType
>
();
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
avg_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
avg_time
;
std
::
cout
<<
"Perf: "
<<
avg_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
conv
.
GetTypeString
()
<<
std
::
endl
;
if
(
do_verification
)
{
auto
ref_conv
=
ck
::
tensor_operation
::
host
::
ReferenceConvFwd
<
NDimSpatial
,
InDataType
,
WeiDataType
,
CShuffleDataType
,
InElementOp
,
WeiElementOp
,
PassThrough
>
();
auto
ref_invoker
=
ref_conv
.
MakeInvoker
();
auto
ref_argument
=
ref_conv
.
MakeArgument
(
in
,
wei
,
c
,
conv_param
.
conv_filter_strides_
,
conv_param
.
conv_filter_dilations_
,
conv_param
.
input_left_pads_
,
conv_param
.
input_right_pads_
,
in_element_op
,
wei_element_op
,
PassThrough
{});
ref_invoker
.
Run
(
ref_argument
);
out_host
.
ForEach
([
&
](
auto
&
,
auto
idx
)
{
out_element_op
(
out_host
(
idx
),
c
(
idx
));
});
out_device_buf
.
FromDevice
(
out_device
.
mData
.
data
());
return
ck
::
utils
::
check_err
(
out_device
,
out_host
,
"Error: incorrect results!"
,
get_rtol
<
OutDataType
>
(),
get_atol
<
OutDataType
>
());
}
return
true
;
}
example/62_convnd_activ/convscale_relu/convnd_fwd_xdl_convscale_relu_fp8.cpp
0 → 100644
View file @
ef326c73
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "convnd_fwd_convscale_relu_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
using
InDataType
=
ck
::
f8_t
;
using
WeiDataType
=
ck
::
f8_t
;
using
AccDataType
=
float
;
using
CShuffleDataType
=
float
;
using
DsDataType
=
ck
::
Tuple
<>
;
using
OutDataType
=
ck
::
f8_t
;
using
AComputeDataType
=
ck
::
f8_t
;
using
BComputeDataType
=
ck
::
f8_t
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
InElementOp
=
PassThrough
;
using
WeiElementOp
=
PassThrough
;
using
OutElementOp
=
ConvScaleRelu
;
static
constexpr
auto
ConvSpec
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
Default
;
static
constexpr
auto
GemmSpec
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNKPadding
;
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
,
typename
WeiLayout
,
typename
DsLayout
,
typename
OutLayout
>
using
DeviceGroupedConvNDFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
<
NDimSpatial
,
InLayout
,
WeiLayout
,
DsLayout
,
OutLayout
,
InDataType
,
WeiDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
OutDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
ConvSpec
,
// ConvForwardSpecialization
GemmSpec
,
// GemmSpecialization
1
,
//
256
,
// BlockSize
128
,
// MPerBlock
256
,
// NPerBlock
32
,
// KPerBlock
8
,
// AK1
8
,
// BK1
32
,
// MPerXdl
32
,
// NPerXdl
2
,
// MXdlPerWave
4
,
// NXdlPerWave
S
<
4
,
64
,
1
>
,
// ABlockTransferThreadClusterLengths_AK0_M_AK1
S
<
1
,
0
,
2
>
,
// ABlockTransferThreadClusterArrangeOrder
S
<
1
,
0
,
2
>
,
// ABlockTransferSrcAccessOrder
2
,
// ABlockTransferSrcVectorDim
8
,
// ABlockTransferSrcScalarPerVector
8
,
// ABlockTransferDstScalarPerVector_AK1
1
,
// ABlockLdsExtraM
S
<
4
,
64
,
1
>
,
// BBlockTransferThreadClusterLengths_BK0_N_BK1
S
<
1
,
0
,
2
>
,
// BBlockTransferThreadClusterArrangeOrder
S
<
1
,
0
,
2
>
,
// BBlockTransferSrcAccessOrder
2
,
// BBlockTransferSrcVectorDim
8
,
// BBlockTransferSrcScalarPerVector
8
,
// BBlockTransferDstScalarPerVector_BK1
1
,
// BBlockLdsExtraN
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
,
AComputeDataType
,
BComputeDataType
>
;
#include "run_convnd_fwd_convscale_relu_example.inc"
int
main
(
int
argc
,
char
*
argv
[])
{
return
run_convnd_fwd_example
(
argc
,
argv
)
?
0
:
1
;
}
Prev
1
…
22
23
24
25
26
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment