Unverified Commit 29deceb6 authored by Illia Silin's avatar Illia Silin Committed by GitHub
Browse files

Merge pull request #18 from ROCmSoftwarePlatform/merge-from-public

Merge from public
parents 91c1d147 c997bbf6
...@@ -8,18 +8,18 @@ namespace tensor_operation { ...@@ -8,18 +8,18 @@ namespace tensor_operation {
namespace device { namespace device {
namespace instance { namespace instance {
void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances( void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial, std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC, NHWGC,
GKYXC, GKYXC,
GK_GK_Tuple, GK_GK_Tuple,
NHWGK, NHWGK,
int8_t, int8_t,
int8_t, int8_t,
I32_F32_Tuple, I32_F32_Tuple,
int8_t, int8_t,
PassThrough, PassThrough,
PassThrough, PassThrough,
Add_Mul2_Clamp>>>& instances) Add_Mul2_Clamp>>>& instances)
{ {
add_device_operation_instances(instances, add_device_operation_instances(instances,
device_grouped_conv2d_xdl_int8_instances<NHWGC, device_grouped_conv2d_xdl_int8_instances<NHWGC,
...@@ -51,18 +51,18 @@ void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances( ...@@ -51,18 +51,18 @@ void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(
} }
void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances( void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial, std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC, NHWGC,
GKYXC, GKYXC,
GK_GK_Tuple, GK_GK_Tuple,
NHWGK, NHWGK,
int8_t, int8_t,
int8_t, int8_t,
I32_F32_Tuple, I32_F32_Tuple,
int8_t, int8_t,
PassThrough, PassThrough,
PassThrough, PassThrough,
Add_Relu_Mul2_Clamp>>>& instances) Add_Relu_Mul2_Clamp>>>& instances)
{ {
add_device_operation_instances(instances, add_device_operation_instances(instances,
device_grouped_conv2d_xdl_int8_instances<NHWGC, device_grouped_conv2d_xdl_int8_instances<NHWGC,
...@@ -94,18 +94,19 @@ void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances( ...@@ -94,18 +94,19 @@ void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
} }
void add_device_conv2d_xdl_bias_tanh_perchannel_quantization_int8_instances( void add_device_conv2d_xdl_bias_tanh_perchannel_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial, std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC, NHWGC,
GKYXC, GKYXC,
GK_GK_Tuple, GK_GK_Tuple,
NHWGK, NHWGK,
int8_t, int8_t,
int8_t, int8_t,
I32_F32_Tuple, I32_F32_Tuple,
int8_t, int8_t,
PassThrough, PassThrough,
PassThrough, PassThrough,
Add_Mul2_TanH_Mul_Clamp>>>& instances) Add_Mul2_TanH_Mul_Clamp>>>&
instances)
{ {
add_device_operation_instances(instances, add_device_operation_instances(instances,
device_grouped_conv2d_xdl_int8_instances<NHWGC, device_grouped_conv2d_xdl_int8_instances<NHWGC,
......
...@@ -8,18 +8,18 @@ namespace tensor_operation { ...@@ -8,18 +8,18 @@ namespace tensor_operation {
namespace device { namespace device {
namespace instance { namespace instance {
void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances( void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial, std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC, NHWGC,
GKYXC, GKYXC,
GK_Tuple, GK_Tuple,
NHWGK, NHWGK,
int8_t, int8_t,
int8_t, int8_t,
I32_Tuple, I32_Tuple,
int8_t, int8_t,
PassThrough, PassThrough,
PassThrough, PassThrough,
Add_Mul_Clamp>>>& instances) Add_Mul_Clamp>>>& instances)
{ {
add_device_operation_instances(instances, add_device_operation_instances(instances,
device_grouped_conv2d_xdl_int8_instances<NHWGC, device_grouped_conv2d_xdl_int8_instances<NHWGC,
...@@ -51,18 +51,18 @@ void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances( ...@@ -51,18 +51,18 @@ void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(
} }
void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances( void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial, std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC, NHWGC,
GKYXC, GKYXC,
GK_Tuple, GK_Tuple,
NHWGK, NHWGK,
int8_t, int8_t,
int8_t, int8_t,
I32_Tuple, I32_Tuple,
int8_t, int8_t,
PassThrough, PassThrough,
PassThrough, PassThrough,
Add_Relu_Mul_Clamp>>>& instances) Add_Relu_Mul_Clamp>>>& instances)
{ {
add_device_operation_instances(instances, add_device_operation_instances(instances,
device_grouped_conv2d_xdl_int8_instances<NHWGC, device_grouped_conv2d_xdl_int8_instances<NHWGC,
...@@ -96,18 +96,19 @@ void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances( ...@@ -96,18 +96,19 @@ void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(
} }
void add_device_conv2d_xdl_bias_tanh_perlayer_quantization_int8_instances( void add_device_conv2d_xdl_bias_tanh_perlayer_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial, std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC, NHWGC,
GKYXC, GKYXC,
GK_Tuple, GK_Tuple,
NHWGK, NHWGK,
int8_t, int8_t,
int8_t, int8_t,
I32_Tuple, I32_Tuple,
int8_t, int8_t,
PassThrough, PassThrough,
PassThrough, PassThrough,
Add_Mul_TanH_Mul_Clamp>>>& instances) Add_Mul_TanH_Mul_Clamp>>>&
instances)
{ {
add_device_operation_instances(instances, add_device_operation_instances(instances,
device_grouped_conv2d_xdl_int8_instances<NHWGC, device_grouped_conv2d_xdl_int8_instances<NHWGC,
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
#pragma once #pragma once
#include "conv2d_quantization_common.hpp" #include "conv2d_quantization_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -26,19 +26,19 @@ using device_grouped_conv2d_xdl_int8_instances = ...@@ -26,19 +26,19 @@ using device_grouped_conv2d_xdl_int8_instances =
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 256, 64, 16, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 256, 64, 16, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, DstScalarPerVector>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 2>, DstScalarPerVector>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 2>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 64, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, DstScalarPerVector>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 64, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 64, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, DstScalarPerVector>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 64, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 64, 64, 16, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 64, 64, 16, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 64, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 64, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 32, 64, 16, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 2>, DstScalarPerVector>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 32, 64, 16, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 2>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 32, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, DstScalarPerVector>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 32, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 64, 32, 64, 16, 16, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, DstScalarPerVector>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 64, 32, 64, 16, 16, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 32, 64, 64, 16, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, DstScalarPerVector> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 32, 64, 64, 16, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, DstScalarPerVector>
>; >;
// clang-format on // clang-format on
......
...@@ -8,18 +8,18 @@ namespace tensor_operation { ...@@ -8,18 +8,18 @@ namespace tensor_operation {
namespace device { namespace device {
namespace instance { namespace instance {
void add_device_conv2d_xdl_perchannel_quantization_int8_instances( void add_device_conv2d_xdl_perchannel_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial, std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC, NHWGC,
GKYXC, GKYXC,
GK_Tuple, GK_Tuple,
NHWGK, NHWGK,
int8_t, int8_t,
int8_t, int8_t,
F32_Tuple, F32_Tuple,
int8_t, int8_t,
PassThrough, PassThrough,
PassThrough, PassThrough,
Mul2_Clamp>>>& instances) Mul2_Clamp>>>& instances)
{ {
add_device_operation_instances(instances, add_device_operation_instances(instances,
device_grouped_conv2d_xdl_int8_instances<NHWGC, device_grouped_conv2d_xdl_int8_instances<NHWGC,
...@@ -51,18 +51,18 @@ void add_device_conv2d_xdl_perchannel_quantization_int8_instances( ...@@ -51,18 +51,18 @@ void add_device_conv2d_xdl_perchannel_quantization_int8_instances(
} }
void add_device_conv2d_xdl_relu_perchannel_quantization_int8_instances( void add_device_conv2d_xdl_relu_perchannel_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial, std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC, NHWGC,
GKYXC, GKYXC,
GK_Tuple, GK_Tuple,
NHWGK, NHWGK,
int8_t, int8_t,
int8_t, int8_t,
F32_Tuple, F32_Tuple,
int8_t, int8_t,
PassThrough, PassThrough,
PassThrough, PassThrough,
Relu_Mul2_Clamp>>>& instances) Relu_Mul2_Clamp>>>& instances)
{ {
add_device_operation_instances(instances, add_device_operation_instances(instances,
device_grouped_conv2d_xdl_int8_instances<NHWGC, device_grouped_conv2d_xdl_int8_instances<NHWGC,
......
...@@ -8,18 +8,18 @@ namespace tensor_operation { ...@@ -8,18 +8,18 @@ namespace tensor_operation {
namespace device { namespace device {
namespace instance { namespace instance {
void add_device_conv2d_xdl_perlayer_quantization_int8_instances( void add_device_conv2d_xdl_perlayer_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial, std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC, NHWGC,
GKYXC, GKYXC,
Empty_Tuple, Empty_Tuple,
NHWGK, NHWGK,
int8_t, int8_t,
int8_t, int8_t,
Empty_Tuple, Empty_Tuple,
int8_t, int8_t,
PassThrough, PassThrough,
PassThrough, PassThrough,
Mul_Clamp>>>& instances) Mul_Clamp>>>& instances)
{ {
add_device_operation_instances(instances, add_device_operation_instances(instances,
device_grouped_conv2d_xdl_int8_instances<NHWGC, device_grouped_conv2d_xdl_int8_instances<NHWGC,
...@@ -51,18 +51,18 @@ void add_device_conv2d_xdl_perlayer_quantization_int8_instances( ...@@ -51,18 +51,18 @@ void add_device_conv2d_xdl_perlayer_quantization_int8_instances(
} }
void add_device_conv2d_xdl_relu_perlayer_quantization_int8_instances( void add_device_conv2d_xdl_relu_perlayer_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial, std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC, NHWGC,
GKYXC, GKYXC,
Empty_Tuple, Empty_Tuple,
NHWGK, NHWGK,
int8_t, int8_t,
int8_t, int8_t,
Empty_Tuple, Empty_Tuple,
int8_t, int8_t,
PassThrough, PassThrough,
PassThrough, PassThrough,
Relu_Mul_Clamp>>>& instances) Relu_Mul_Clamp>>>& instances)
{ {
add_device_operation_instances(instances, add_device_operation_instances(instances,
device_grouped_conv2d_xdl_int8_instances<NHWGC, device_grouped_conv2d_xdl_int8_instances<NHWGC,
......
add_instance_library(device_transpose_instance
device_transpose_instances_3d.cpp
)
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/transpose/device_transpose_instance.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using F16 = ck::half_t;
using F32 = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
void add_device_transpose_f16_instances(
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 5>>>&
instances)
{
#ifdef CK_ENABLE_FP16
add_device_operation_instances(instances, device_transpose_f16_instances{});
#else
ignore = instances;
#endif
}
void add_device_transpose_f32_instances(
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 5>>>&
instances)
{
#ifdef CK_ENABLE_FP32
add_device_operation_instances(instances, device_transpose_f32_instances{});
#else
ignore = instances;
#endif
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
...@@ -50,21 +50,23 @@ Best Perf: 1.42509 ms, 102.988 TFlops, 234.086 GB/s ...@@ -50,21 +50,23 @@ Best Perf: 1.42509 ms, 102.988 TFlops, 234.086 GB/s
## Profile contraction kernels ## Profile contraction kernels
```bash ```bash
#arg1: tensor operation (contraction_bilinear=CONTRACTION+Bilinear) #arg1: tensor operation (contraction_bilinear=CONTRACTION+Bilinear)
#arg2: data type (0: fp32; 1: f64)\n" #arg2: data type (0: fp32; 1: f64; 2: f16; 3: bf16)
#arg3: matrix layout (0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]; #arg3: compute data type (0: fp32; 1: f64; 2: f16; 3: bf16)
#arg4: matrix layout (0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1];
# 1: A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]; # 1: A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1];
# 2: A[k0, k1, m0, m1] * B[k0, k1, n0, n1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]; # 2: A[k0, k1, m0, m1] * B[k0, k1, n0, n1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1];
# 3: A[k0, k1, m0, m1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]) # 3: A[k0, k1, m0, m1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1])
#arg4: verification (0: no; 1: yes) #arg5: verification (0: no; 1: yes)
#arg5: initialization (0: no init; 1: integer value; 2: decimal value) #arg6: initialization (0: no init; 1: integer value; 2: decimal value)
#arg6: print tensor value (0: no; 1: yes) #arg7: print tensor value (0: no; 1: yes)
#arg7: time kernel (0: no, 1: yes) #arg8: time kernel (0: no, 1: yes)
#arg8 and arg9: alpha and beta #arg9: alpha
#arg10 to 15: M0, M1, N0, N1, K0, K1 #arg10: beta
#arg16 to 31: Strides for A, B, D and E (skip for default) #arg11 to 16: M0, M1, N0, N1, K0, K1
#arg17 to 32: Strides for A, B, D and E (skip for default)
################ op datatype layout verify init log time alpha beta M0 M1 N0 N1 K0 K1
./bin/ckProfiler contraction_bilinear 0 1 0 0 0 1 1.0 1.0 128 128 128 128 128 128 ################ op datatype compute_datatype layout verify init log time alpha beta M0 M1 N0 N1 K0 K1
./bin/ckProfiler contraction_bilinear 0 0 1 0 0 0 1 1.0 1.0 128 128 128 128 128 128
``` ```
Result (MI100) Result (MI100)
......
...@@ -31,10 +31,14 @@ namespace profiler { ...@@ -31,10 +31,14 @@ namespace profiler {
using Bilinear = ck::tensor_operation::element_wise::Bilinear; using Bilinear = ck::tensor_operation::element_wise::Bilinear;
using Scale = ck::tensor_operation::element_wise::Scale; using Scale = ck::tensor_operation::element_wise::Scale;
using F32 = float;
using F64 = double;
template <typename ALayout, template <typename ALayout,
typename BLayout, typename BLayout,
typename CDELayout, typename CDELayout,
typename DataType, typename DataType,
typename ComputeDataType,
typename DTupleDataType, typename DTupleDataType,
typename CDElementOp> typename CDElementOp>
int profile_contraction_impl(ck::index_t do_verification, int profile_contraction_impl(ck::index_t do_verification,
...@@ -45,10 +49,10 @@ int profile_contraction_impl(ck::index_t do_verification, ...@@ -45,10 +49,10 @@ int profile_contraction_impl(ck::index_t do_verification,
const std::vector<ck::index_t>& M, const std::vector<ck::index_t>& M,
const std::vector<ck::index_t>& N, const std::vector<ck::index_t>& N,
const std::vector<ck::index_t>& K, const std::vector<ck::index_t>& K,
const std::vector<ck::index_t>& StridesA, const std::vector<ck::index_t>& StridesA, // [M0, M1, K0, K1]
const std::vector<ck::index_t>& StridesB, const std::vector<ck::index_t>& StridesB, // [N0, N1, K0, K1]
const std::vector<ck::index_t>& StridesE, const std::vector<ck::index_t>& StridesE, // [M0, M1, N0, N1]
const std::vector<ck::index_t>& StridesD) const std::vector<ck::index_t>& StridesD) // [M0, M1, N0, N1]
{ {
bool pass = true; bool pass = true;
...@@ -63,13 +67,13 @@ int profile_contraction_impl(ck::index_t do_verification, ...@@ -63,13 +67,13 @@ int profile_contraction_impl(ck::index_t do_verification,
}; };
Tensor<DataType> a_m_k(f_host_tensor_descriptor(M, K, StridesA)); Tensor<DataType> a_m_k(f_host_tensor_descriptor(M, K, StridesA));
Tensor<DataType> b_k_n(f_host_tensor_descriptor(K, N, StridesB)); Tensor<DataType> b_n_k(f_host_tensor_descriptor(N, K, StridesB));
Tensor<DataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StridesE)); Tensor<DataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StridesE));
Tensor<DataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StridesE)); Tensor<DataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StridesE));
Tensor<DataType> d_m_n(f_host_tensor_descriptor(M, N, StridesD)); Tensor<DataType> d_m_n(f_host_tensor_descriptor(M, N, StridesD));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; std::cout << "b_n_k: " << b_n_k.mDesc << std::endl;
std::cout << "d_m_n: " << d_m_n.mDesc << std::endl; std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_device_result.mDesc << std::endl; std::cout << "e_m_n: " << e_m_n_device_result.mDesc << std::endl;
...@@ -78,12 +82,12 @@ int profile_contraction_impl(ck::index_t do_verification, ...@@ -78,12 +82,12 @@ int profile_contraction_impl(ck::index_t do_verification,
case 0: break; case 0: break;
case 1: case 1:
a_m_k.GenerateTensorValue(GeneratorTensor_2<DataType>{-5, 5}); a_m_k.GenerateTensorValue(GeneratorTensor_2<DataType>{-5, 5});
b_k_n.GenerateTensorValue(GeneratorTensor_2<DataType>{-5, 5}); b_n_k.GenerateTensorValue(GeneratorTensor_2<DataType>{-5, 5});
d_m_n.GenerateTensorValue(GeneratorTensor_2<DataType>{-5, 5}); d_m_n.GenerateTensorValue(GeneratorTensor_2<DataType>{-5, 5});
break; break;
default: default:
a_m_k.GenerateTensorValue(GeneratorTensor_3<DataType>{0.0, 1.0}); a_m_k.GenerateTensorValue(GeneratorTensor_3<DataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<DataType>{-0.5, 0.5}); b_n_k.GenerateTensorValue(GeneratorTensor_3<DataType>{-0.5, 0.5});
d_m_n.GenerateTensorValue(GeneratorTensor_3<DataType>{-0.5, 0.5}); d_m_n.GenerateTensorValue(GeneratorTensor_3<DataType>{-0.5, 0.5});
} }
...@@ -91,12 +95,12 @@ int profile_contraction_impl(ck::index_t do_verification, ...@@ -91,12 +95,12 @@ int profile_contraction_impl(ck::index_t do_verification,
using BElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough;
DeviceMem a_device_buf(sizeof(DataType) * a_m_k.mDesc.GetElementSpaceSize()); DeviceMem a_device_buf(sizeof(DataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(DataType) * b_k_n.mDesc.GetElementSpaceSize()); DeviceMem b_device_buf(sizeof(DataType) * b_n_k.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(DataType) * e_m_n_device_result.mDesc.GetElementSpaceSize()); DeviceMem e_device_buf(sizeof(DataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
DeviceMem d_device_buf(sizeof(DataType) * d_m_n.mDesc.GetElementSpaceSize()); DeviceMem d_device_buf(sizeof(DataType) * d_m_n.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_m_k.mData.data()); a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data()); b_device_buf.ToDevice(b_n_k.mData.data());
e_device_buf.SetZero(); e_device_buf.SetZero();
d_device_buf.ToDevice(d_m_n.mData.data()); d_device_buf.ToDevice(d_m_n.mData.data());
...@@ -118,7 +122,8 @@ int profile_contraction_impl(ck::index_t do_verification, ...@@ -118,7 +122,8 @@ int profile_contraction_impl(ck::index_t do_verification,
DataType, DataType,
AElementOp, AElementOp,
BElementOp, BElementOp,
CDElementOp>; CDElementOp,
ComputeDataType>;
// get device op instances // get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
...@@ -126,6 +131,9 @@ int profile_contraction_impl(ck::index_t do_verification, ...@@ -126,6 +131,9 @@ int profile_contraction_impl(ck::index_t do_verification,
std::cout << "found " << op_ptrs.size() << " instances" << std::endl; std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
using AccDataType =
typename std::conditional<std::is_same<ComputeDataType, F64>::value, F64, F32>::type;
// Run reference op // Run reference op
if(do_verification) if(do_verification)
{ {
...@@ -136,7 +144,8 @@ int profile_contraction_impl(ck::index_t do_verification, ...@@ -136,7 +144,8 @@ int profile_contraction_impl(ck::index_t do_verification,
DataType, DataType,
DataType, DataType,
DataType, DataType,
DataType, AccDataType,
ComputeDataType,
AElementOp, AElementOp,
BElementOp>; BElementOp>;
...@@ -146,7 +155,7 @@ int profile_contraction_impl(ck::index_t do_verification, ...@@ -146,7 +155,7 @@ int profile_contraction_impl(ck::index_t do_verification,
Tensor<DataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StridesE)); Tensor<DataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StridesE));
auto ref_argument = auto ref_argument =
ref_op.MakeArgument(a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op); ref_op.MakeArgument(a_m_k, b_n_k, c_m_n_host_result, a_element_op, b_element_op);
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
...@@ -272,8 +281,29 @@ int profile_contraction_impl(ck::index_t do_verification, ...@@ -272,8 +281,29 @@ int profile_contraction_impl(ck::index_t do_verification,
{ {
e_device_buf.FromDevice(e_m_n_device_result.mData.data()); e_device_buf.FromDevice(e_m_n_device_result.mData.data());
float threshold = // Both the kernel and the reference use `AccDataType`, so an absolute error of both
static_cast<DataType>(nelems_k) * std::numeric_limits<DataType>::epsilon(); // of them is bounded by `nelems_k * std::numeric_limits<AccDataType>::epsilon()`.
// Comparing one to another can result in an absolute error as high as twice that
// value.
double threshold = 2 * nelems_k * std::numeric_limits<AccDataType>::epsilon();
// Handle the possible casting error of either AccDataType -> DataType or
// DataType -> ComputeDataType.
// TODO: Add a generic solution for calculating thresholds in CK.
if constexpr(ck::is_same_v<DataType, ck::bhalf_t> ||
ck::is_same_v<ComputeDataType, ck::bhalf_t>)
{
const double epsilon = std::pow(2, -7);
// Maximum relative casting error when rounding to zero.
threshold += epsilon * 2;
}
else if constexpr(ck::is_same_v<DataType, ck::half_t> ||
ck::is_same_v<ComputeDataType, ck::half_t>)
{
const double epsilon = std::pow(2, -10);
// Maximum relative casting error when rounding to zero.
threshold += epsilon * 2;
}
pass = pass & ck::utils::check_err(e_m_n_device_result, pass = pass & ck::utils::check_err(e_m_n_device_result,
e_m_n_host_result, e_m_n_host_result,
"Error: incorrect results!", "Error: incorrect results!",
...@@ -283,7 +313,7 @@ int profile_contraction_impl(ck::index_t do_verification, ...@@ -283,7 +313,7 @@ int profile_contraction_impl(ck::index_t do_verification,
if(do_log) if(do_log)
{ {
LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "b: ", b_n_k.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "c_host : ", e_m_n_host_result.mData, ",") LogRangeAsType<float>(std::cout << "c_host : ", e_m_n_host_result.mData, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>(std::cout << "c_device: ", e_m_n_device_result.mData, ",") LogRangeAsType<float>(std::cout << "c_device: ", e_m_n_device_result.mData, ",")
......
...@@ -23,8 +23,18 @@ enum struct ContractionMatrixLayout ...@@ -23,8 +23,18 @@ enum struct ContractionMatrixLayout
enum struct ContractionDataType enum struct ContractionDataType
{ {
F32_F32_F32_F32, // 0 F32_F32_F32_F32, // 0
F64_F64_F64_F64, // 1 F64_F64_F64_F64, // 1
F16_F16_F16_F16, // 2
BF16_BF16_BF16_BF16, // 3
};
enum struct ContractionComputeDataType
{
F32 = 0,
F64,
F16,
BF16,
}; };
inline void collect_index_params(char* argv[], inline void collect_index_params(char* argv[],
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#include <iomanip> #include <iomanip>
#include <iostream> #include <iostream>
#include <typeinfo> #include <typeinfo>
#include <unistd.h>
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
...@@ -20,6 +21,7 @@ ...@@ -20,6 +21,7 @@
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp" #include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/fill.hpp"
namespace ck { namespace ck {
namespace profiler { namespace profiler {
...@@ -69,14 +71,17 @@ int profile_gemm_impl(int do_verification, ...@@ -69,14 +71,17 @@ int profile_gemm_impl(int do_verification,
switch(init_method) switch(init_method)
{ {
case 0: break; case 0:
ck::utils::FillConstant<ADataType>{static_cast<ADataType>(1.f)}(a_m_k);
ck::utils::FillConstant<BDataType>{static_cast<BDataType>(1.f)}(b_k_n);
break;
case 1: case 1:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5}); ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}); ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
break; break;
default: default:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 0.1}); ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.05, 0.05}); ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
} }
using AElementOp = ck::tensor_operation::element_wise::PassThrough; using AElementOp = ck::tensor_operation::element_wise::PassThrough;
...@@ -130,11 +135,10 @@ int profile_gemm_impl(int do_verification, ...@@ -130,11 +135,10 @@ int profile_gemm_impl(int do_verification,
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
} }
std::string best_op_name; float best_tflops = 0;
float best_avg_time = 0; int best_instance_id = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int instance_id = 0;
// profile device op instances // profile device op instances
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
{ {
...@@ -162,7 +166,7 @@ int profile_gemm_impl(int do_verification, ...@@ -162,7 +166,7 @@ int profile_gemm_impl(int do_verification,
std::string op_name = op_ptr->GetTypeString(); std::string op_name = op_ptr->GetTypeString();
float avg_time = float avg_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel, 0, 10, 50});
std::size_t flop = std::size_t(2) * M * N * K; std::size_t flop = std::size_t(2) * M * N * K;
...@@ -178,10 +182,8 @@ int profile_gemm_impl(int do_verification, ...@@ -178,10 +182,8 @@ int profile_gemm_impl(int do_verification,
if(tflops > best_tflops) if(tflops > best_tflops)
{ {
best_op_name = op_name; best_instance_id = instance_id;
best_tflops = tflops; best_tflops = tflops;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
} }
if(do_verification) if(do_verification)
...@@ -205,53 +207,94 @@ int profile_gemm_impl(int do_verification, ...@@ -205,53 +207,94 @@ int profile_gemm_impl(int do_verification,
{ {
std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl; std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
} }
}
if constexpr(is_same<CDataType, float>::value) instance_id++;
{
std::cout << "Best Perf for datatype = f32";
}
else if constexpr(is_same<CDataType, half_t>::value)
{
std::cout << "Best Perf for datatype = f16";
} }
else if constexpr(is_same<CDataType, bhalf_t>::value)
{ sleep(2);
std::cout << "Best Perf for datatype = bf16";
} // Run the best instance again
else if constexpr(is_same<CDataType, int8_t>::value)
{ {
std::cout << "Best Perf for datatype = int8"; auto& op_ptr = op_ptrs[best_instance_id];
} auto argument_ptr =
op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
std::string op_name = op_ptr->GetTypeString();
float avg_time = invoker_ptr->Run(argument_ptr.get(),
StreamConfig{nullptr, time_kernel, 0, 50, 200});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / avg_time;
if constexpr(is_same<CDataType, float>::value)
{
std::cout << "Best Perf for datatype = f32";
}
else if constexpr(is_same<CDataType, half_t>::value)
{
std::cout << "Best Perf for datatype = f16";
}
else if constexpr(is_same<CDataType, bhalf_t>::value)
{
std::cout << "Best Perf for datatype = bf16";
}
else if constexpr(is_same<CDataType, int8_t>::value)
{
std::cout << "Best Perf for datatype = int8";
}
#if defined CK_ENABLE_FP8 #if defined CK_ENABLE_FP8
else if constexpr(is_same<CDataType, f8_t>::value) else if constexpr(is_same<CDataType, f8_t>::value)
{ {
std::cout << "Best Perf for datatype = fp8"; std::cout << "Best Perf for datatype = fp8";
} }
#endif #endif
if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value) if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value)
{ {
std::cout << " ALayout = RowMajor"; std::cout << " ALayout = RowMajor";
} }
else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value) else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value)
{ {
std::cout << " ALayout = ColumnMajor"; std::cout << " ALayout = ColumnMajor";
} }
if constexpr(is_same<BLayout, tensor_layout::gemm::RowMajor>::value) if constexpr(is_same<BLayout, tensor_layout::gemm::RowMajor>::value)
{ {
std::cout << " BLayout = RowMajor"; std::cout << " BLayout = RowMajor";
} }
else if constexpr(is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value) else if constexpr(is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value)
{ {
std::cout << " BLayout = ColumnMajor"; std::cout << " BLayout = ColumnMajor";
} }
std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA
<< " StrideB = " << StrideB << " StrideC = " << StrideC << " : " << best_avg_time << " StrideB = " << StrideB << " StrideC = " << StrideC << " : " << avg_time
<< " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, " << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " << op_name
<< best_op_name << std::endl; << std::endl;
}
}
return pass ? 0 : 1; return pass ? 0 : 1;
} }
......
...@@ -143,8 +143,7 @@ bool profile_gemm_splitk_impl(int do_verification, ...@@ -143,8 +143,7 @@ bool profile_gemm_splitk_impl(int do_verification,
// profile device GEMM instances // profile device GEMM instances
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
{ {
std::vector<int> kbatch_list = {1, 2, 4, 8, 12, 16, 20, 24, 32, 36, 40, 60, std::vector<int> kbatch_list = {1, 2, 4, 8, 12, 16, 20, 32, 36, 40, 64, 96, 128};
64, 72, 80, 88, 96, 128, 144, 160, 176, 192, 256};
if(KBatch > 0) if(KBatch > 0)
{ {
......
...@@ -198,18 +198,18 @@ bool profile_grouped_conv_fwd_impl(int do_verification, ...@@ -198,18 +198,18 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
} }
}; };
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NDimSpatial, using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NDimSpatial,
InLayout, InLayout,
WeiLayout, WeiLayout,
ck::Tuple<>, ck::Tuple<>,
OutLayout, OutLayout,
InDataType, InDataType,
WeiDataType, WeiDataType,
ck::Tuple<>, ck::Tuple<>,
OutDataType, OutDataType,
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp>; OutElementOp>;
// get device op instances // get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/normalization.hpp" #include "ck/library/tensor_operation_instance/gpu/normalization_fwd.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
...@@ -88,14 +88,14 @@ bool profile_groupnorm_impl(int do_verification, ...@@ -88,14 +88,14 @@ bool profile_groupnorm_impl(int do_verification,
beta_dev.ToDevice(beta.mData.data()); beta_dev.ToDevice(beta.mData.data());
// add device normalization instances // add device normalization instances
using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType, using DeviceOp = ck::tensor_operation::device::DeviceNormalizationFwd<XDataType,
GammaDataType, GammaDataType,
BetaDataType, BetaDataType,
YDataType, YDataType,
SaveMeanInvStdDataType, SaveMeanInvStdDataType,
PassThrough, PassThrough,
5, 5,
3>; 3>;
// get device op instances // get device op instances
const auto instance_ptrs = const auto instance_ptrs =
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include <iomanip> #include <iomanip>
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/normalization.hpp" #include "ck/library/tensor_operation_instance/gpu/normalization_fwd.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
...@@ -94,14 +94,14 @@ bool profile_layernorm_impl(int do_verification, ...@@ -94,14 +94,14 @@ bool profile_layernorm_impl(int do_verification,
constexpr int NumReduceDim = Rank - 1; constexpr int NumReduceDim = Rank - 1;
// add device normalization instances // add device normalization instances
using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType, using DeviceOp = ck::tensor_operation::device::DeviceNormalizationFwd<XDataType,
GammaDataType, GammaDataType,
BetaDataType, BetaDataType,
YDataType, YDataType,
SaveMeanInvStdDataType, SaveMeanInvStdDataType,
PassThrough, PassThrough,
Rank, Rank,
NumReduceDim>; NumReduceDim>;
// get device op instances // get device op instances
const auto instance_ptrs = const auto instance_ptrs =
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include <iostream>
#include <typeinfo>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/transpose_3d.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
namespace ck {
namespace profiler {
template <typename HostTensorA, typename HostTensorB, typename Functor>
void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functor functor)
{
for(std::size_t n = 0; n < A_ncdhw.mDesc.GetLengths()[0]; ++n)
for(std::size_t c = 0; c < A_ncdhw.mDesc.GetLengths()[1]; ++c)
for(std::size_t d = 0; d < A_ncdhw.mDesc.GetLengths()[2]; ++d)
for(std::size_t h = 0; h < A_ncdhw.mDesc.GetLengths()[3]; ++h)
for(std::size_t w = 0; w < A_ncdhw.mDesc.GetLengths()[4]; ++w)
{
auto a_val = A_ncdhw(n, c, d, h, w);
functor(B_nchwd(n, c, h, w, d), a_val);
}
}
template <typename ADataType, typename BDataType, index_t NumDim>
bool profile_transpose_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
std::vector<index_t> lengths)
{
bool pass = true;
index_t N = lengths[0];
index_t C = lengths[1];
index_t D = lengths[2];
index_t H = lengths[3];
index_t W = lengths[4];
std::vector<ck::index_t> ncdhw = {N, C, D, H, W};
std::vector<ck::index_t> ndhwc = {N, D, H, W, C};
Tensor<ADataType> a(ncdhw);
Tensor<BDataType> b(ndhwc);
Tensor<BDataType> host_b(ndhwc);
// a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D};
std::array<ck::index_t, 5> a_strides = {C * D * H * W, H * W, W, 1, D * H * W}; // N, C, D, H, W
std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1}; // N, D, H, W, C
std::cout << "A: " << a.mDesc << std::endl;
std::cout << "B: " << b.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1: a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-1, 2}); break;
default: a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
}
using ElementOp = ck::tensor_operation::element_wise::PassThrough;
// const auto element_op = ElementOp{};
DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a.mData.data());
std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
using DeviceOp = ck::tensor_operation::device::
DeviceElementwise<ck::Tuple<ADataType>, ck::Tuple<BDataType>, ElementOp, NumDim>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
if(do_verification)
{
host_elementwise4D(host_b, a, ElementOp{});
}
std::string best_op_name;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(
ab_lengths, {a_strides}, {b_strides}, input, output, ElementOp{});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
// re-init C to zero before profiling next kernel
b_device_buf.SetZero();
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
if(do_verification)
{
b_device_buf.FromDevice(b.mData.data());
pass &= ck::utils::check_err(
b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
if(do_log)
{
LogRangeAsType<float>(std::cout << "a : ", a.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b.mData, ",") << std::endl;
}
}
std::string op_name = op_ptr->GetTypeString();
float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop =
std::size_t(2) * ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4];
std::size_t num_btype =
sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) +
sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
// pass = pass & ck::utils::check_err(b_device_result, b_host_result);
pass &= ck::utils::check_err(
b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
if(tflops > best_tflops)
{
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
}
}
std::cout << " N = " << N << " C = " << C << " D = " << D << " H = " << H << " W = " << W
<< " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
<< " GB/s, " << best_op_name << std::endl;
return pass;
}
} // namespace profiler
} // namespace ck
...@@ -16,8 +16,8 @@ set(PROFILER_SOURCES ...@@ -16,8 +16,8 @@ set(PROFILER_SOURCES
profile_grouped_conv_fwd.cpp profile_grouped_conv_fwd.cpp
profile_grouped_conv_bwd_weight.cpp profile_grouped_conv_bwd_weight.cpp
profile_reduce.cpp profile_reduce.cpp
profile_groupnorm.cpp profile_groupnorm_fwd.cpp
profile_layernorm.cpp profile_layernorm_fwd.cpp
profile_max_pool3d_fwd.cpp profile_max_pool3d_fwd.cpp
profile_avg_pool3d_bwd.cpp profile_avg_pool3d_bwd.cpp
profile_max_pool3d_bwd.cpp profile_max_pool3d_bwd.cpp
...@@ -28,9 +28,11 @@ set(PROFILER_SOURCES ...@@ -28,9 +28,11 @@ set(PROFILER_SOURCES
profile_grouped_conv_bwd_data.cpp profile_grouped_conv_bwd_data.cpp
profile_conv_tensor_rearrange.cpp profile_conv_tensor_rearrange.cpp
) )
if(DL_KERNELS) if(DL_KERNELS)
list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp) list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp)
endif() endif()
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
list(APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp) list(APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp) list(APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp)
...@@ -75,7 +77,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_w ...@@ -75,7 +77,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_w
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_fwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
...@@ -110,4 +112,5 @@ if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) ...@@ -110,4 +112,5 @@ if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance)
endif() endif()
rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler) rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
...@@ -17,8 +17,9 @@ ...@@ -17,8 +17,9 @@
static void print_helper_msg() static void print_helper_msg()
{ {
std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
<< "arg2: data type (0: fp32; 1: f64)\n" << "arg2: data type (0: fp32; 1: f64; 2: f16; 3: bf16)\n"
<< "arg3: matrix layout (0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1] + " << "arg3: compute data type (0: fp32; 1: f64; 2: f16; 3: bf16)\n"
<< "arg4: matrix layout (0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1] + "
"D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n" "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
<< " 1: A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + " << " 1: A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + "
"D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n" "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
...@@ -26,40 +27,42 @@ static void print_helper_msg() ...@@ -26,40 +27,42 @@ static void print_helper_msg()
"D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n" "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
<< " 3: A[k0, k1, m0, m1] * B[n0, n1, k0, k1] + " << " 3: A[k0, k1, m0, m1] * B[n0, n1, k0, k1] + "
"D[m0, m1, n0, n1] = E[m0, m1, n0, n1])\n" "D[m0, m1, n0, n1] = E[m0, m1, n0, n1])\n"
<< "arg4: verification (0: no; 1: yes)\n" << "arg5: verification (0: no; 1: yes)\n"
<< "arg5: initialization (0: no init; 1: integer value; 2: decimal " << "arg6: initialization (0: no init; 1: integer value; 2: decimal "
<< "value)\n" << "value)\n"
<< "arg6: print tensor value (0: no; 1: yes)\n" << "arg7: print tensor value (0: no; 1: yes)\n"
<< "arg7: time kernel (0: no, 1: yes)\n" << "arg8: time kernel (0: no, 1: yes)\n"
<< "arg8 and arg9: alpha and beta\n" << "arg9: alpha\n"
<< "arg10 to 15: M0, M1, N0, N1, K0, K1\n" << "arg10: beta\n"
<< "arg16 to 31: Strides for A, B, D and E (skip for default)\n" << "arg11 to 16: M0, M1, N0, N1, K0, K1\n"
<< "arg17 to 32: Strides for A, B, D and E (skip for default)\n"
<< std::endl; << std::endl;
} }
int profile_contraction_bilinear(int argc, char* argv[]) int profile_contraction_bilinear(int argc, char* argv[])
{ {
const bool default_strides = argc == 16; const bool default_strides = argc == 17;
if(argc != 32 && argc != 16) if(argc != 33 && argc != 17)
{ {
print_helper_msg(); print_helper_msg();
exit(1); exit(1);
} }
const auto data_type = static_cast<ContractionDataType>(std::stoi(argv[2])); const auto data_type = static_cast<ContractionDataType>(std::stoi(argv[2]));
const auto layout = static_cast<ContractionMatrixLayout>(std::stoi(argv[3])); const auto compute_data_type = static_cast<ContractionComputeDataType>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]); const auto layout = static_cast<ContractionMatrixLayout>(std::stoi(argv[4]));
const ck::index_t init_method = std::stoi(argv[5]); const bool do_verification = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]); const ck::index_t init_method = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]); const bool do_log = std::stoi(argv[7]);
const float alpha = std::stof(argv[8]); const bool time_kernel = std::stoi(argv[8]);
const float beta = std::stof(argv[9]); const float alpha = std::stof(argv[9]);
const float beta = std::stof(argv[10]);
std::vector<ck::index_t> M; std::vector<ck::index_t> M;
std::vector<ck::index_t> N; std::vector<ck::index_t> N;
std::vector<ck::index_t> K; std::vector<ck::index_t> K;
const ck::index_t dims_arg_num = 10; const ck::index_t dims_arg_num = 11;
collect_index_params(argv, M, dims_arg_num, 2); collect_index_params(argv, M, dims_arg_num, 2);
collect_index_params(argv, N, dims_arg_num + 2, 2); collect_index_params(argv, N, dims_arg_num + 2, 2);
collect_index_params(argv, K, dims_arg_num + 4, 2); collect_index_params(argv, K, dims_arg_num + 4, 2);
...@@ -76,90 +79,130 @@ int profile_contraction_bilinear(int argc, char* argv[]) ...@@ -76,90 +79,130 @@ int profile_contraction_bilinear(int argc, char* argv[])
collect_index_params(argv, StridesD, dims_arg_num + 18, 4); collect_index_params(argv, StridesD, dims_arg_num + 18, 4);
} }
using F32 = float; using F16 = ck::half_t;
using F64 = double; using BF16 = ck::bhalf_t;
using F32 = float;
auto profile = [&](auto a_layout, auto b_layout, auto cde_layout, auto type) { using F64 = double;
using ALayout = decltype(a_layout);
using BLayout = decltype(b_layout); auto profile =
using CDELayout = decltype(cde_layout); [&](auto a_layout, auto b_layout, auto cde_layout, auto type, auto compute_type) {
using ALayout = decltype(a_layout);
using DataType = decltype(type); using BLayout = decltype(b_layout);
using CDELayout = decltype(cde_layout);
if(default_strides)
using DataType = decltype(type);
using ComputeDataType = decltype(compute_type);
if(default_strides)
{
assign_default_strides(a_layout, StridesA, {M[0], M[1], K[0], K[1]});
assign_default_strides(b_layout, StridesB, {N[0], N[1], K[0], K[1]});
assign_default_strides(cde_layout, StridesE, {M[0], M[1], N[0], N[1]});
assign_default_strides(cde_layout, StridesD, {M[0], M[1], N[0], N[1]});
}
bool pass = ck::profiler::profile_contraction_impl<ALayout,
BLayout,
CDELayout,
DataType,
ComputeDataType,
ck::Tuple<DataType>,
Bilinear>(do_verification,
init_method,
do_log,
time_kernel,
Bilinear{alpha, beta},
M,
N,
K,
StridesA,
StridesB,
StridesE,
StridesD);
return pass;
};
auto run_profile_for_datatype = [&](auto type, auto compute_type) {
if(layout == ContractionMatrixLayout::MK_KN_MN_MN)
{ {
assign_default_strides(a_layout, StridesA, {M[0], M[1], K[0], K[1]}); return profile(Row{}, Row{}, Row{}, type, compute_type);
assign_default_strides(b_layout, StridesB, {K[0], K[1], N[0], N[1]});
assign_default_strides(cde_layout, StridesE, {M[0], M[1], N[0], N[1]});
assign_default_strides(cde_layout, StridesD, {M[0], M[1], N[0], N[1]});
} }
bool pass = ck::profiler::profile_contraction_impl<ALayout, else if(layout == ContractionMatrixLayout::MK_NK_MN_MN)
BLayout, {
CDELayout, return profile(Row{}, Col{}, Row{}, type, compute_type);
DataType, }
ck::Tuple<DataType>, else if(layout == ContractionMatrixLayout::KM_KN_MN_MN)
Bilinear>(do_verification, {
init_method, return profile(Col{}, Row{}, Row{}, type, compute_type);
do_log, }
time_kernel, else if(layout == ContractionMatrixLayout::KM_NK_MN_MN)
Bilinear{alpha, beta}, {
M, return profile(Col{}, Col{}, Row{}, type, compute_type);
N, }
K, return false;
StridesA,
StridesB,
StridesE,
StridesD);
return pass;
}; };
if(data_type == ContractionDataType::F32_F32_F32_F32 && if(data_type == ContractionDataType::F32_F32_F32_F32)
layout == ContractionMatrixLayout::MK_KN_MN_MN)
{
return profile(Row{}, Row{}, Row{}, F32{});
}
else if(data_type == ContractionDataType::F32_F32_F32_F32 &&
layout == ContractionMatrixLayout::MK_NK_MN_MN)
{ {
return profile(Row{}, Col{}, Row{}, F32{}); if(compute_data_type == ContractionComputeDataType::F32)
} {
else if(data_type == ContractionDataType::F32_F32_F32_F32 && return run_profile_for_datatype(F32{}, F32{});
layout == ContractionMatrixLayout::KM_KN_MN_MN) }
{ else if(compute_data_type == ContractionComputeDataType::F16)
return profile(Col{}, Row{}, Row{}, F32{}); {
} return run_profile_for_datatype(F32{}, F16{});
else if(data_type == ContractionDataType::F32_F32_F32_F32 && }
layout == ContractionMatrixLayout::KM_NK_MN_MN) else if(compute_data_type == ContractionComputeDataType::BF16)
{ {
return profile(Col{}, Col{}, Row{}, F32{}); return run_profile_for_datatype(F32{}, BF16{});
} }
else if(data_type == ContractionDataType::F64_F64_F64_F64 && else
layout == ContractionMatrixLayout::MK_KN_MN_MN) {
{ std::cout << "Incorrect combination of data type and compute data type." << std::endl;
return profile(Row{}, Row{}, Row{}, F64{}); return 1;
} }
else if(data_type == ContractionDataType::F64_F64_F64_F64 &&
layout == ContractionMatrixLayout::MK_NK_MN_MN)
{
return profile(Row{}, Col{}, Row{}, F64{});
} }
else if(data_type == ContractionDataType::F64_F64_F64_F64 && else if(data_type == ContractionDataType::F64_F64_F64_F64)
layout == ContractionMatrixLayout::KM_KN_MN_MN)
{ {
return profile(Col{}, Row{}, Row{}, F64{}); if(compute_data_type == ContractionComputeDataType::F64)
{
return run_profile_for_datatype(F64{}, F64{});
}
else if(compute_data_type == ContractionComputeDataType::F32)
{
return run_profile_for_datatype(F64{}, F32{});
}
else
{
std::cout << "Incorrect combination of data type and compute data type." << std::endl;
return 1;
}
} }
else if(data_type == ContractionDataType::F64_F64_F64_F64 && else if(data_type == ContractionDataType::F16_F16_F16_F16)
layout == ContractionMatrixLayout::KM_NK_MN_MN)
{ {
return profile(Col{}, Col{}, Row{}, F64{}); if(compute_data_type == ContractionComputeDataType::F32)
{
return run_profile_for_datatype(F16{}, F32{});
}
else
{
std::cout << "Incorrect combination of data type and compute data type." << std::endl;
return 1;
}
} }
else else if(data_type == ContractionDataType::BF16_BF16_BF16_BF16)
{ {
std::cout << "this data_type & layout is not implemented" << std::endl; if(compute_data_type == ContractionComputeDataType::F32)
{
return 1; return run_profile_for_datatype(BF16{}, F32{});
}
else
{
std::cout << "Incorrect combination of data type and compute data type." << std::endl;
return 1;
}
} }
return 1;
} }
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_contraction_bilinear); REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_contraction_bilinear);
...@@ -17,8 +17,9 @@ ...@@ -17,8 +17,9 @@
static void print_helper_msg() static void print_helper_msg()
{ {
std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
<< "arg2: data type (0: fp32; 1: f64)\n" << "arg2: data type (0: fp32; 1: f64; 2: f16; 3: bf16)\n"
<< "arg3: matrix layout (0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1] + " << "arg3: compute data type (0: fp32; 1: f64; 2: f16; 3: bf16)\n"
<< "arg4: matrix layout (0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1] + "
"D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n" "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
<< " 1: A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + " << " 1: A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + "
"D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n" "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
...@@ -26,39 +27,40 @@ static void print_helper_msg() ...@@ -26,39 +27,40 @@ static void print_helper_msg()
"D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n" "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
<< " 3: A[k0, k1, m0, m1] * B[n0, n1, k0, k1] + " << " 3: A[k0, k1, m0, m1] * B[n0, n1, k0, k1] + "
"D[m0, m1, n0, n1] = E[m0, m1, n0, n1])\n" "D[m0, m1, n0, n1] = E[m0, m1, n0, n1])\n"
<< "arg4: verification (0: no; 1: yes)\n" << "arg5: verification (0: no; 1: yes)\n"
<< "arg5: initialization (0: no init; 1: integer value; 2: decimal " << "arg6: initialization (0: no init; 1: integer value; 2: decimal "
<< "value)\n" << "value)\n"
<< "arg6: print tensor value (0: no; 1: yes)\n" << "arg7: print tensor value (0: no; 1: yes)\n"
<< "arg7: time kernel (0: no, 1: yes)\n" << "arg8: time kernel (0: no, 1: yes)\n"
<< "arg8: alpha\n" << "arg9: alpha\n"
<< "arg9 to 14: M0, M1, N0, N1, K0, K1\n" << "arg10 to 15: M0, M1, N0, N1, K0, K1\n"
<< "arg15 to 30: Strides for A, B, D and E (skip for default)\n" << "arg16 to 31: Strides for A, B, D and E (skip for default)\n"
<< std::endl; << std::endl;
} }
int profile_contraction_scale(int argc, char* argv[]) int profile_contraction_scale(int argc, char* argv[])
{ {
const bool default_strides = argc == 15; const bool default_strides = argc == 16;
if(argc != 31 && argc != 15) if(argc != 32 && argc != 16)
{ {
print_helper_msg(); print_helper_msg();
exit(1); exit(1);
} }
const auto data_type = static_cast<ContractionDataType>(std::stoi(argv[2])); const auto data_type = static_cast<ContractionDataType>(std::stoi(argv[2]));
const auto layout = static_cast<ContractionMatrixLayout>(std::stoi(argv[3])); const auto compute_data_type = static_cast<ContractionComputeDataType>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]); const auto layout = static_cast<ContractionMatrixLayout>(std::stoi(argv[4]));
const ck::index_t init_method = std::stoi(argv[5]); const bool do_verification = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]); const ck::index_t init_method = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]); const bool do_log = std::stoi(argv[7]);
const float alpha = std::stof(argv[8]); const bool time_kernel = std::stoi(argv[8]);
const float alpha = std::stof(argv[9]);
std::vector<ck::index_t> M; std::vector<ck::index_t> M;
std::vector<ck::index_t> N; std::vector<ck::index_t> N;
std::vector<ck::index_t> K; std::vector<ck::index_t> K;
const ck::index_t dims_arg_num = 9; const ck::index_t dims_arg_num = 10;
collect_index_params(argv, M, dims_arg_num, 2); collect_index_params(argv, M, dims_arg_num, 2);
collect_index_params(argv, N, dims_arg_num + 2, 2); collect_index_params(argv, N, dims_arg_num + 2, 2);
collect_index_params(argv, K, dims_arg_num + 4, 2); collect_index_params(argv, K, dims_arg_num + 4, 2);
...@@ -75,88 +77,131 @@ int profile_contraction_scale(int argc, char* argv[]) ...@@ -75,88 +77,131 @@ int profile_contraction_scale(int argc, char* argv[])
collect_index_params(argv, StridesD, dims_arg_num + 18, 4); collect_index_params(argv, StridesD, dims_arg_num + 18, 4);
} }
using F32 = float; using F16 = ck::half_t;
using F64 = double; using BF16 = ck::bhalf_t;
using F32 = float;
auto profile = [&](auto a_layout, auto b_layout, auto cde_layout, auto type) { using F64 = double;
using ALayout = decltype(a_layout);
using BLayout = decltype(b_layout); auto profile =
using CDELayout = decltype(cde_layout); [&](auto a_layout, auto b_layout, auto cde_layout, auto type, auto compute_type) {
using ALayout = decltype(a_layout);
using DataType = decltype(type); using BLayout = decltype(b_layout);
using CDELayout = decltype(cde_layout);
if(default_strides)
using DataType = decltype(type);
using ComputeDataType = decltype(compute_type);
if(default_strides)
{
assign_default_strides(a_layout, StridesA, {M[0], M[1], K[0], K[1]});
assign_default_strides(b_layout, StridesB, {N[0], N[1], K[0], K[1]});
assign_default_strides(cde_layout, StridesE, {M[0], M[1], N[0], N[1]});
assign_default_strides(cde_layout, StridesD, {M[0], M[1], N[0], N[1]});
}
bool pass = ck::profiler::profile_contraction_impl<ALayout,
BLayout,
CDELayout,
DataType,
ComputeDataType,
ck::Tuple<>,
Scale>(do_verification,
init_method,
do_log,
time_kernel,
Scale{alpha},
M,
N,
K,
StridesA,
StridesB,
StridesE,
StridesD);
return pass;
};
auto run_profile_for_datatype = [&](auto type, auto compute_type) {
if(layout == ContractionMatrixLayout::MK_KN_MN_MN)
{ {
assign_default_strides(a_layout, StridesA, {M[0], M[1], K[0], K[1]}); return profile(Row{}, Row{}, Row{}, type, compute_type);
assign_default_strides(b_layout, StridesB, {K[0], K[1], N[0], N[1]});
assign_default_strides(cde_layout, StridesE, {M[0], M[1], N[0], N[1]});
assign_default_strides(cde_layout, StridesD, {M[0], M[1], N[0], N[1]});
} }
else if(layout == ContractionMatrixLayout::MK_NK_MN_MN)
bool pass = ck::profiler:: {
profile_contraction_impl<ALayout, BLayout, CDELayout, DataType, ck::Tuple<>, Scale>( return profile(Row{}, Col{}, Row{}, type, compute_type);
do_verification, }
init_method, else if(layout == ContractionMatrixLayout::KM_KN_MN_MN)
do_log, {
time_kernel, return profile(Col{}, Row{}, Row{}, type, compute_type);
Scale{alpha}, }
M, else if(layout == ContractionMatrixLayout::KM_NK_MN_MN)
N, {
K, return profile(Col{}, Col{}, Row{}, type, compute_type);
StridesA, }
StridesB, return false;
StridesE,
StridesD);
return pass;
}; };
if(data_type == ContractionDataType::F32_F32_F32_F32 && if(data_type == ContractionDataType::F32_F32_F32_F32)
layout == ContractionMatrixLayout::MK_KN_MN_MN)
{
return profile(Row{}, Row{}, Row{}, F32{});
}
else if(data_type == ContractionDataType::F32_F32_F32_F32 &&
layout == ContractionMatrixLayout::MK_NK_MN_MN)
{
return profile(Row{}, Col{}, Row{}, F32{});
}
else if(data_type == ContractionDataType::F32_F32_F32_F32 &&
layout == ContractionMatrixLayout::KM_KN_MN_MN)
{ {
return profile(Col{}, Row{}, Row{}, F32{}); if(compute_data_type == ContractionComputeDataType::F32)
} {
else if(data_type == ContractionDataType::F32_F32_F32_F32 && return run_profile_for_datatype(F32{}, F32{});
layout == ContractionMatrixLayout::KM_NK_MN_MN) }
{ else if(compute_data_type == ContractionComputeDataType::F16)
return profile(Col{}, Col{}, Row{}, F32{}); {
} return run_profile_for_datatype(F32{}, F16{});
else if(data_type == ContractionDataType::F64_F64_F64_F64 && }
layout == ContractionMatrixLayout::MK_KN_MN_MN) else if(compute_data_type == ContractionComputeDataType::BF16)
{ {
return profile(Row{}, Row{}, Row{}, F64{}); return run_profile_for_datatype(F32{}, BF16{});
} }
else if(data_type == ContractionDataType::F64_F64_F64_F64 && else
layout == ContractionMatrixLayout::MK_NK_MN_MN) {
{ std::cout << "Incorrect combination of data type and compute data type." << std::endl;
return profile(Row{}, Col{}, Row{}, F64{}); return 1;
}
} }
else if(data_type == ContractionDataType::F64_F64_F64_F64 && else if(data_type == ContractionDataType::F64_F64_F64_F64)
layout == ContractionMatrixLayout::KM_KN_MN_MN)
{ {
return profile(Col{}, Row{}, Row{}, F64{}); if(compute_data_type == ContractionComputeDataType::F64)
{
return run_profile_for_datatype(F64{}, F64{});
}
else if(compute_data_type == ContractionComputeDataType::F32)
{
return run_profile_for_datatype(F64{}, F32{});
}
else
{
std::cout << "Incorrect combination of data type and compute data type." << std::endl;
return 1;
}
} }
else if(data_type == ContractionDataType::F64_F64_F64_F64 && else if(data_type == ContractionDataType::F16_F16_F16_F16)
layout == ContractionMatrixLayout::KM_NK_MN_MN)
{ {
return profile(Col{}, Col{}, Row{}, F64{}); if(compute_data_type == ContractionComputeDataType::F32)
{
return run_profile_for_datatype(F16{}, F32{});
}
else
{
std::cout << "Incorrect combination of data type and compute data type." << std::endl;
return 1;
}
} }
else else if(data_type == ContractionDataType::BF16_BF16_BF16_BF16)
{ {
std::cout << "this data_type & layout is not implemented" << std::endl; if(compute_data_type == ContractionComputeDataType::F32)
{
return 1; return run_profile_for_datatype(BF16{}, F32{});
}
else
{
std::cout << "Incorrect combination of data type and compute data type." << std::endl;
return 1;
}
} }
return 1;
} }
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_contraction_scale); REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_contraction_scale);
...@@ -27,6 +27,8 @@ enum struct GemmDataType ...@@ -27,6 +27,8 @@ enum struct GemmDataType
F16_F16_F16, // 1 F16_F16_F16, // 1
BF16_BF16_BF16, // 2 BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3 INT8_INT8_INT8, // 3
F8_F16_F16, // 4
F16_F8_F16, // 5
}; };
#define OP_NAME "grouped_gemm" #define OP_NAME "grouped_gemm"
...@@ -56,7 +58,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -56,7 +58,7 @@ int profile_grouped_gemm(int argc, char* argv[])
{ {
std::cout std::cout
<< "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
<< "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n" << "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: fp8@fp6; 5: f16@f8)\n"
<< "arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n" << "arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"
<< " 1: A[m, k] * B[n, k] = C[m, n];\n" << " 1: A[m, k] * B[n, k] = C[m, n];\n"
<< " 2: A[k, m] * B[k, n] = C[m, n];\n" << " 2: A[k, m] * B[k, n] = C[m, n];\n"
...@@ -169,6 +171,46 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -169,6 +171,46 @@ int profile_grouped_gemm(int argc, char* argv[])
StrideCs, StrideCs,
kbatch); kbatch);
} }
else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_grouped_gemm_impl<ck::f8_t,
ck::half_t,
ck::half_t,
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
time_kernel,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatch);
}
else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::f8_t,
ck::half_t,
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
time_kernel,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatch);
}
else else
{ {
throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented"); throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment