Commit f8b551da authored by carlushuang
Browse files

add bias_relu, bias fusion

parent bfa4c686
...@@ -37,6 +37,8 @@ template <typename InDataType, ...@@ -37,6 +37,8 @@ template <typename InDataType,
bool UseALocalBuffer, bool UseALocalBuffer,
bool UseBLocalBuffer, bool UseBLocalBuffer,
bool UseCLocalBuffer, bool UseCLocalBuffer,
bool FuseBias,
bool FuseAdd,
bool BiasAlongGemmM> bool BiasAlongGemmM>
struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
: public DeviceConvFwdBiasActivationAdd<InElementwiseOperation, : public DeviceConvFwdBiasActivationAdd<InElementwiseOperation,
...@@ -607,8 +609,13 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Outpu ...@@ -607,8 +609,13 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Outpu
!UseBLocalBuffer, !UseBLocalBuffer,
ConvForwardSpecialization>; ConvForwardSpecialization>;
using CThreadwiseCopy = static constexpr auto GetCThreadwiseCopy()
ck::cpu::ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_Residual_MxN< {
constexpr ck::index_t C_nDim = CGridDesc::GetNumOfDimension();
if constexpr(FuseBias && FuseAdd)
{
return ck::cpu::
ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_Residual_MxN<
CDataType, CDataType,
C0DataType, C0DataType,
C1DataType, C1DataType,
...@@ -619,7 +626,34 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Outpu ...@@ -619,7 +626,34 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Outpu
decltype(GetOutputBlockDescriptor()), decltype(GetOutputBlockDescriptor()),
OutElementwiseOperation, OutElementwiseOperation,
!UseCLocalBuffer, !UseCLocalBuffer,
BiasAlongGemmM>; BiasAlongGemmM>(CGridDesc{},
ck::make_zero_multi_index<C_nDim>(),
GetOutputBlockDescriptor(),
ck::make_zero_multi_index<C_nDim>(),
OutElementwiseOperation{});
}
else if constexpr(FuseBias && !FuseAdd)
{
return ck::cpu::ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_MxN<
CDataType,
C0DataType,
C1DataType,
CDataType,
CGridDesc,
C0GridDesc,
C1GridDesc,
decltype(GetOutputBlockDescriptor()),
OutElementwiseOperation,
!UseCLocalBuffer,
BiasAlongGemmM>(CGridDesc{},
ck::make_zero_multi_index<C_nDim>(),
GetOutputBlockDescriptor(),
ck::make_zero_multi_index<C_nDim>(),
OutElementwiseOperation{});
}
}
using CThreadwiseCopy = decltype(GetCThreadwiseCopy());
using GridwiseGemm = ck::cpu::GridwiseGemmBiasActivationAddAvx2_MxN< using GridwiseGemm = ck::cpu::GridwiseGemmBiasActivationAddAvx2_MxN<
ADataType, // InDataType, ADataType, // InDataType,
......
...@@ -37,6 +37,8 @@ template <typename InDataType, ...@@ -37,6 +37,8 @@ template <typename InDataType,
bool UseALocalBuffer, bool UseALocalBuffer,
bool UseBLocalBuffer, bool UseBLocalBuffer,
bool UseCLocalBuffer, bool UseCLocalBuffer,
bool FuseBias,
bool FuseAdd,
bool BiasAlongGemmM> bool BiasAlongGemmM>
struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K
: public DeviceConvFwdBiasActivationAdd<InElementwiseOperation, : public DeviceConvFwdBiasActivationAdd<InElementwiseOperation,
...@@ -584,8 +586,13 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Ou ...@@ -584,8 +586,13 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Ou
!UseBLocalBuffer, !UseBLocalBuffer,
ConvForwardSpecialization>; ConvForwardSpecialization>;
using CThreadwiseCopy = static constexpr auto GetCThreadwiseCopy()
ck::cpu::ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_Residual_MxN< {
constexpr ck::index_t C_nDim = CGridDesc::GetNumOfDimension();
if constexpr(FuseBias && FuseAdd)
{
return ck::cpu::
ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_Residual_MxN<
CDataType, CDataType,
C0DataType, C0DataType,
C1DataType, C1DataType,
...@@ -596,7 +603,34 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Ou ...@@ -596,7 +603,34 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Ou
decltype(GetOutputBlockDescriptor()), decltype(GetOutputBlockDescriptor()),
OutElementwiseOperation, OutElementwiseOperation,
!UseCLocalBuffer, !UseCLocalBuffer,
BiasAlongGemmM>; BiasAlongGemmM>(CGridDesc{},
ck::make_zero_multi_index<C_nDim>(),
GetOutputBlockDescriptor(),
ck::make_zero_multi_index<C_nDim>(),
OutElementwiseOperation{});
}
else if constexpr(FuseBias && !FuseAdd)
{
return ck::cpu::ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_MxN<
CDataType,
C0DataType,
C1DataType,
CDataType,
CGridDesc,
C0GridDesc,
C1GridDesc,
decltype(GetOutputBlockDescriptor()),
OutElementwiseOperation,
!UseCLocalBuffer,
BiasAlongGemmM>(CGridDesc{},
ck::make_zero_multi_index<C_nDim>(),
GetOutputBlockDescriptor(),
ck::make_zero_multi_index<C_nDim>(),
OutElementwiseOperation{});
}
}
using CThreadwiseCopy = decltype(GetCThreadwiseCopy());
using GridwiseGemm = ck::cpu::GridwiseGemmBiasActivationAddAvx2_MxN< using GridwiseGemm = ck::cpu::GridwiseGemmBiasActivationAddAvx2_MxN<
ADataType, // InDataType, ADataType, // InDataType,
......
...@@ -36,6 +36,8 @@ template <typename InDataType, ...@@ -36,6 +36,8 @@ template <typename InDataType,
bool UseALocalBuffer, bool UseALocalBuffer,
bool UseBLocalBuffer, bool UseBLocalBuffer,
bool UseCLocalBuffer, bool UseCLocalBuffer,
bool FuseBias,
bool FuseAdd,
bool BiasAlongGemmM> bool BiasAlongGemmM>
struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Output_N_Ho_Wo_K struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Output_N_Ho_Wo_K
: public DeviceConvFwdBiasActivationAdd<InElementwiseOperation, : public DeviceConvFwdBiasActivationAdd<InElementwiseOperation,
...@@ -580,8 +582,13 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Outpu ...@@ -580,8 +582,13 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Outpu
!UseBLocalBuffer, !UseBLocalBuffer,
ConvForwardSpecialization>; ConvForwardSpecialization>;
using CThreadwiseCopy = static constexpr auto GetCThreadwiseCopy()
ck::cpu::ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_Residual_MxN< {
constexpr ck::index_t C_nDim = CGridDesc::GetNumOfDimension();
if constexpr(FuseBias && FuseAdd)
{
return ck::cpu::
ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_Residual_MxN<
CDataType, CDataType,
C0DataType, C0DataType,
C1DataType, C1DataType,
...@@ -592,7 +599,34 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Outpu ...@@ -592,7 +599,34 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Outpu
decltype(GetOutputBlockDescriptor()), decltype(GetOutputBlockDescriptor()),
OutElementwiseOperation, OutElementwiseOperation,
!UseCLocalBuffer, !UseCLocalBuffer,
BiasAlongGemmM>; BiasAlongGemmM>(CGridDesc{},
ck::make_zero_multi_index<C_nDim>(),
GetOutputBlockDescriptor(),
ck::make_zero_multi_index<C_nDim>(),
OutElementwiseOperation{});
}
else if constexpr(FuseBias && !FuseAdd)
{
return ck::cpu::ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_MxN<
CDataType,
C0DataType,
C1DataType,
CDataType,
CGridDesc,
C0GridDesc,
C1GridDesc,
decltype(GetOutputBlockDescriptor()),
OutElementwiseOperation,
!UseCLocalBuffer,
BiasAlongGemmM>(CGridDesc{},
ck::make_zero_multi_index<C_nDim>(),
GetOutputBlockDescriptor(),
ck::make_zero_multi_index<C_nDim>(),
OutElementwiseOperation{});
}
}
using CThreadwiseCopy = decltype(GetCThreadwiseCopy());
using GridwiseGemm = ck::cpu::GridwiseGemmBiasActivationAddAvx2_MxN< using GridwiseGemm = ck::cpu::GridwiseGemmBiasActivationAddAvx2_MxN<
ADataType, // InDataType, ADataType, // InDataType,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment