"test/git@developer.sourcefind.cn:change/sglang.git" did not exist on "1df6eabd5d362b4e3ac53b8d195e0f8c14d22b54"
Commit f29a5350 authored by carlushuang's avatar carlushuang
Browse files

add direct-conv first version

parent 19a6cc89
@@ -10,6 +10,7 @@
 #include "reference_conv_fwd.hpp"
 #include "element_wise_operation_cpu.hpp"
 #include "dynamic_buffer_cpu.hpp"
+#include "envvar.hpp"
 #include <omp.h>

 #define AVX2_DATA_ALIGNMENT 32
@@ -92,6 +93,10 @@ void add_device_conv2d_fwd_avx2_nhwc_yxck_nhwk_local_c_relu(
 void add_device_conv2d_fwd_avx2_nhwc_yxck_nhwk_mt_relu(
     std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, Relu>>& instances);

+// ------------------ direct-conv nhwc-kyxck8-nhwk
+void add_device_conv2d_direct_fwd_avx2_nhwc_kyxck8_nhwk(
+    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
+
 } // namespace device_conv2d_fwd_avx2_instance
 } // namespace device
 } // namespace cpu
@@ -501,6 +506,8 @@ int main(int argc, char* argv[])
         ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
             add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_local_c(conv_ptrs);
     }
+    ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
+        add_device_conv2d_direct_fwd_avx2_nhwc_kyxck8_nhwk(conv_ptrs);
 #endif
 #if TEST_FUSION == TEST_FUSION_RELU
     if(omp_get_max_threads() > 1)
@@ -571,6 +578,7 @@ int main(int argc, char* argv[])
     double fastest_kernel_time = std::numeric_limits<double>::max();
     std::string fastest_kernel_name = "";
     double fastest_kernel_gflops = 0;
+    int loop = ck::getenv_int("CK_LOOP", 10);
     for(auto& conv_ptr : conv_ptrs)
     {
         auto argument_ptr = conv_ptr->MakeArgumentPointer(
@@ -594,7 +602,7 @@ int main(int argc, char* argv[])
         if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             auto invoker_ptr = conv_ptr->MakeInvokerPointer();
-            double time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{}, 10);
+            double time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{}, loop);
             double total_flop = static_cast<double>(2) * N * C * Ho * Wo * K * Y * X;
...
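Both test harnesses (this one and the bias-activation variant below) replace the hard-coded 10 timing iterations with ck::getenv_int("CK_LOOP", 10), so the repeat count can be tuned per run. The diff does not show envvar.hpp itself; a minimal sketch of what such a helper could look like, assuming a plain std::getenv plus std::strtol parse (hypothetical implementation, only the names come from the diff):

#include <cstdlib>

namespace ck {

// Hypothetical stand-in for the getenv_int declared in envvar.hpp:
// return the integer value of an environment variable, falling back
// to default_value when the variable is unset or not a number.
inline int getenv_int(const char* name, int default_value)
{
    const char* v = std::getenv(name);
    if(v == nullptr)
        return default_value;
    char* end   = nullptr;
    long parsed = std::strtol(v, &end, 10);
    return (end == v) ? default_value : static_cast<int>(parsed);
}

} // namespace ck

With this shape, running the benchmark with CK_LOOP=100 in the environment times each instance over 100 invocations instead of 10, without recompiling.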
@@ -11,6 +11,7 @@
 #include "reference_conv_fwd_bias_activation.hpp"
 #include "element_wise_operation_cpu.hpp"
 #include "dynamic_buffer_cpu.hpp"
+#include "envvar.hpp"
 #include <omp.h>

 #define AVX2_DATA_ALIGNMENT 32
@@ -713,6 +714,7 @@ int main(int argc, char* argv[])
     double fastest_kernel_time = std::numeric_limits<double>::max();
     std::string fastest_kernel_name = "";
     double fastest_kernel_gflops = 0;
+    int loop = ck::getenv_int("CK_LOOP", 10);
     for(auto& conv_ptr : conv_ptrs)
     {
         auto argument_ptr = conv_ptr->MakeArgumentPointer(
@@ -738,7 +740,7 @@ int main(int argc, char* argv[])
         if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             auto invoker_ptr = conv_ptr->MakeInvokerPointer();
-            double time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{}, 10);
+            double time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{}, loop);
             double total_flop = static_cast<double>(2) * N * C * Ho * Wo * K * Y * X;
...
@@ -855,7 +855,7 @@ struct ThreadwiseGemmAvx2_MxN_4x24
         ".if m_TransA != 0\n"
         " lea m_ABytes(%%rax), %%rax\n"
-        ".if m_Mr > 3\n lea m_ABytes(%%r8), %%r8\n .endif\n"
+        ".if m_Mr > 2\n lea m_ABytes(%%r8), %%r8\n .endif\n"
         ".else\n"
         " lea (%%rax, %%rcx, 1), %%rax\n"
         " lea (%%r8, %%rcx, 1), %%r8\n"
...
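The one-character change above in ThreadwiseGemmAvx2_MxN_4x24 reads like an off-by-one fix in the transposed-A pointer advance: in a microkernel handling up to 4 rows, the second base register %%r8 presumably serves the third and fourth rows, so it must advance whenever m_Mr > 2; the old m_Mr > 3 guard skipped the m_Mr == 3 case. A scalar C++ illustration of that guard logic (an inference from the 4xN microkernel shape, not the kernel's actual code):

#include <cstddef>

// Advance the two A base pointers by one column step. Assumption:
// base01 (rax) covers rows 0..1 and base23 (r8) covers rows 2..3.
void advance_a_bases(const float*& base01, const float*& base23,
                     int Mr, std::ptrdiff_t step)
{
    base01 += step;     // rows 0..1 always participate
    if(Mr > 2)          // a third row exists, so r8 is live
        base23 += step; // guard was 'Mr > 3' before this commit,
                        // leaving base23 stale when Mr == 3
}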
#include <stdlib.h>
#include <utility>
#include "config.hpp"
#include "convolution_forward_specialization_cpu.hpp"
#include "device_convnd_direct_fwd_avx2_nhwc_kyxck8_nhwk.hpp"
#include "element_wise_operation_cpu.hpp"
#include "device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
namespace device_conv2d_fwd_avx2_instance {
using InType = float;
using WeiType = float;
using OutType = float;
using AccType = float;
using InLayout = ck::tensor_layout::gemm::RowMajor; // NHWC
using WeiLayout = ck::tensor_layout::gemm::ColumnMajor; // KYXCK8
static constexpr bool NonTemporalStore = false;
using PT = ck::tensor_operation::cpu::element_wise::PassThrough;
using Relu = ck::tensor_operation::cpu::element_wise::Relu;
static constexpr auto ConvFwdDefault =
ck::tensor_operation::cpu::device::ConvolutionForwardSpecialization_t::Default;
static constexpr auto ConvFwd1x1P0 =
ck::tensor_operation::cpu::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0;
static constexpr auto ConvFwd1x1S1P0 =
ck::tensor_operation::cpu::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0;
static constexpr auto DefaultGemmKLoop =
ck::tensor_operation::cpu::device::ConvolutionForwardGemmKSpecialization_t::DefaultGemmKLoop;
static constexpr auto GemmKLoopOverC =
ck::tensor_operation::cpu::device::ConvolutionForwardGemmKSpecialization_t::NHWC_GemmKLoopOverC;
static constexpr auto LoopOver_MNK = ck::tensor_operation::cpu::device::LoopOver_MNK;
static constexpr auto LoopOver_MKN = ck::tensor_operation::cpu::device::LoopOver_MKN;
// clang-format off
#define DEVICE_CONV2D_FWD_AVX2_NHWC_KYXCK8_NHWK_F32(a_elem_op, b_elem_op, c_elem_op, m_per_block, n_per_block, k_per_block, m_per_thread, n_per_thread, c_local_buf) \
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwdDefault, 2, m_per_thread, n_per_thread, true, true, c_local_buf>({m_per_block, n_per_block, k_per_block, GemmKLoopOverC , LoopOver_MNK}), \
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwd1x1S1P0, 2, m_per_thread, n_per_thread, true, true, c_local_buf>({m_per_block, n_per_block, k_per_block, GemmKLoopOverC , LoopOver_MNK}), \
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwdDefault, 2, m_per_thread, n_per_thread, true, true, c_local_buf>({m_per_block, n_per_block, k_per_block, DefaultGemmKLoop, LoopOver_MNK}), \
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwd1x1S1P0, 2, m_per_thread, n_per_thread, false, false, c_local_buf>({m_per_block, n_per_block, k_per_block, GemmKLoopOverC , LoopOver_MNK}), \
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwdDefault, 2, m_per_thread, n_per_thread, true, false, c_local_buf>({m_per_block, n_per_block, k_per_block, DefaultGemmKLoop, LoopOver_MNK}), \
\
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwdDefault, 2, m_per_thread, n_per_thread, true, true, c_local_buf>({m_per_block, n_per_block, k_per_block, GemmKLoopOverC , LoopOver_MKN}), \
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwd1x1S1P0, 2, m_per_thread, n_per_thread, true, true, c_local_buf>({m_per_block, n_per_block, k_per_block, GemmKLoopOverC , LoopOver_MKN}), \
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwdDefault, 2, m_per_thread, n_per_thread, true, true, c_local_buf>({m_per_block, n_per_block, k_per_block, DefaultGemmKLoop, LoopOver_MKN}), \
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwd1x1S1P0, 2, m_per_thread, n_per_thread, false, false, c_local_buf>({m_per_block, n_per_block, k_per_block, GemmKLoopOverC , LoopOver_MKN}), \
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwdDefault, 2, m_per_thread, n_per_thread, true, false, c_local_buf>({m_per_block, n_per_block, k_per_block, DefaultGemmKLoop, LoopOver_MKN})
// clang-format on
void add_device_conv2d_direct_fwd_avx2_nhwc_kyxck8_nhwk(
    std::vector<DeviceConvFwdPtr<PT, PT, PT>>& instances)
{
    ck::tensor_operation::device::add_device_operation_instances(
        instances,
        std::make_tuple(
            // clang-format off
            DeviceConvNDDirectFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K<float, float, float, PT, PT, PT, ConvFwdDefault, 2, 4, 24, false, false, false>({0, 0, 0, DefaultGemmKLoop, LoopOver_MKN})
            // clang-format on
            ));
}
} // namespace device_conv2d_fwd_avx2_instance
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
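The single registered instance uses a 4x24 microkernel (m_per_thread 4, n_per_thread 24, matching ThreadwiseGemmAvx2_MxN_4x24), DefaultGemmKLoop, and LoopOver_MKN, with weights in the KYXCK8 layout. Judging by that name, the output-channel dimension is split as K = (K/8) x 8 with the 8-wide fragment innermost, so eight consecutive output channels can be loaded by a single AVX2 ymm instruction. A sketch of the indexing such a packing implies (an assumption drawn from the layout name, not code from this commit):

#include <cstddef>

// Offset of weight element (k, y, x, c) in a tensor packed as
// [K/8][Y][X][C][8]: the innermost dimension groups 8 output
// channels, so k and k+1 within the same block of 8 are adjacent.
inline std::size_t kyxck8_offset(std::size_t k, std::size_t y, std::size_t x,
                                 std::size_t c, std::size_t Y, std::size_t X,
                                 std::size_t C)
{
    const std::size_t k0 = k / 8;
    const std::size_t k8 = k % 8;
    return (((k0 * Y + y) * X + x) * C + c) * 8 + k8;
}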