Commit 4b448373 authored by carlushuang's avatar carlushuang
Browse files

fix bug on merge latest develop

parent b79df771
...@@ -84,6 +84,7 @@ if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH ) ...@@ -84,6 +84,7 @@ if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
message(STATUS "CK_HIP_VERSION_PATCH overriden with ${CK_OVERRIDE_HIP_VERSION_PATCH}") message(STATUS "CK_HIP_VERSION_PATCH overriden with ${CK_OVERRIDE_HIP_VERSION_PATCH}")
endif() endif()
message(STATUS "Build with HIP ${HIP_VERSION}") message(STATUS "Build with HIP ${HIP_VERSION}")
endif()
## tidy ## tidy
include(EnableCompilerWarnings) include(EnableCompilerWarnings)
...@@ -251,10 +252,12 @@ message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") ...@@ -251,10 +252,12 @@ message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR}) add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
if(NOT CK_NOGPU)
rocm_package_setup_component(tests rocm_package_setup_component(tests
LIBRARY_NAME composablekernel LIBRARY_NAME composablekernel
PACKAGE_NAME tests # Prevent -static suffix on package name PACKAGE_NAME tests # Prevent -static suffix on package name
) )
endif()
add_subdirectory(library) add_subdirectory(library)
add_subdirectory(example) add_subdirectory(example)
...@@ -277,6 +280,7 @@ configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in ...@@ -277,6 +280,7 @@ configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in
NO_CHECK_REQUIRED_COMPONENTS_MACRO NO_CHECK_REQUIRED_COMPONENTS_MACRO
) )
if(NOT CK_NOGPU)
rocm_install(FILES rocm_install(FILES
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake" "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake" "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
...@@ -293,3 +297,4 @@ rocm_create_package( ...@@ -293,3 +297,4 @@ rocm_create_package(
LDCONFIG LDCONFIG
HEADER_ONLY HEADER_ONLY
) )
endif()
#include <sstream> #include <sstream>
#include <iomanip> #include <iomanip>
#include "config.hpp" #include "ck/ck.hpp"
#include "device.hpp" #include "ck/device_utility/kernel_launch.hpp"
#include "host_tensor.hpp" #include "ck/library/host_tensor/device_memory.hpp"
#include "host_tensor_generator.hpp" #include "ck/library/host_tensor/host_tensor.hpp"
#include "tensor_layout.hpp" #include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "device_tensor.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "device_convnd_fwd_avx2_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/cpu/device/device_convnd_fwd_avx2_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation_cpu.hpp" #include "ck/tensor_operation/cpu/element/element_wise_operation_cpu.hpp"
#include "reference_conv_fwd.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "element_wise_operation_cpu.hpp" #include "ck/utility/dynamic_buffer_cpu.hpp"
#include "dynamic_buffer_cpu.hpp" #include "ck/utility/envvar.hpp"
#include "envvar.hpp" #include "ck/device_utility/xdnn_desc.hpp"
#include "xdnn_desc.hpp"
#include <omp.h> #include <omp.h>
#define AVX2_DATA_ALIGNMENT 32 #define AVX2_DATA_ALIGNMENT 32
......
#include <sstream> #include <sstream>
#include "config.hpp" #include "ck/ck.hpp"
#include "device.hpp" #include "ck/device_utility/kernel_launch.hpp"
#include "host_tensor.hpp" #include "ck/library/host_tensor/device_memory.hpp"
#include "host_tensor_generator.hpp" #include "ck/library/host_tensor/host_tensor.hpp"
#include "tensor_layout.hpp" #include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "device_tensor.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "device_convnd_fwd_bias_activation_add_avx2_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/cpu/device/device_convnd_fwd_bias_activation_add_avx2_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation_cpu.hpp" #include "ck/tensor_operation/cpu/element/element_wise_operation_cpu.hpp"
#include "reference_conv_fwd_bias_activation_add.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp"
#include "reference_conv_fwd_bias_activation.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp"
#include "element_wise_operation_cpu.hpp" #include "ck/utility/dynamic_buffer_cpu.hpp"
#include "dynamic_buffer_cpu.hpp" #include "ck/utility/envvar.hpp"
#include "envvar.hpp" #include "ck/device_utility/xdnn_desc.hpp"
#include "xdnn_desc.hpp"
#include <omp.h> #include <omp.h>
#define AVX2_DATA_ALIGNMENT 32 #define AVX2_DATA_ALIGNMENT 32
......
...@@ -3,11 +3,10 @@ ...@@ -3,11 +3,10 @@
#pragma once #pragma once
#include "ck/options.hpp"
#ifdef CK_NOGPU #ifdef CK_NOGPU
#define __host__ #define __host__
#define __device__ #define __device__
#include <stdint.h>
#else #else
#include "hip/hip_runtime.h" #include "hip/hip_runtime.h"
#include "hip/hip_fp16.h" #include "hip/hip_fp16.h"
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
#ifndef CK_NOGPU
#include <string> #include <string>
#include <map> #include <map>
...@@ -52,3 +53,4 @@ inline std::string get_device_name() ...@@ -52,3 +53,4 @@ inline std::string get_device_name()
} }
} // namespace ck } // namespace ck
#endif
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
#ifndef CK_NOGPU
#include <hip/hip_runtime.h> #include <hip/hip_runtime.h>
...@@ -15,3 +16,4 @@ inline void hip_check_error(hipError_t x) ...@@ -15,3 +16,4 @@ inline void hip_check_error(hipError_t x)
throw std::runtime_error(ss.str()); throw std::runtime_error(ss.str());
} }
} }
#endif
...@@ -3,6 +3,9 @@ ...@@ -3,6 +3,9 @@
#pragma once #pragma once
#include <chrono>
#ifndef CK_NOGPU
#include <hip/hip_runtime.h> #include <hip/hip_runtime.h>
#include "ck/ck.hpp" #include "ck/ck.hpp"
...@@ -72,3 +75,32 @@ float launch_and_time_kernel(const StreamConfig& stream_config, ...@@ -72,3 +75,32 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
return 0; return 0;
#endif #endif
} }
#endif
// Invokes the callable `kernel` exactly once with the argument pack `args`.
// Any value the callable returns is discarded.
template <typename... Args, typename F>
void launch_cpu_kernel(F kernel, Args... args)
{
    static_cast<void>(kernel(args...));
}
// Measures the average wall-clock time of one call to `kernel(args...)`.
//
// The callable is first invoked 3 times without timing (warm-up), then invoked
// `nrepeat` times between two std::chrono::high_resolution_clock readings.
//
// Parameters:
//   kernel  - callable invoked as kernel(args...)
//   nrepeat - number of timed invocations
//   args    - arguments passed to every invocation
//
// Returns the elapsed time in milliseconds divided by `nrepeat`. When
// `nrepeat <= 0` there is nothing to time, so 0 is returned (the warm-up calls
// still run) instead of dividing by zero.
template <typename... Args, typename F>
float launch_and_time_cpu_kernel(F kernel, int nrepeat, Args... args)
{
    // Untimed warm-up invocations, made before the clock is started.
    constexpr int nwarmup = 3;
    for(int i = 0; i < nwarmup; i++)
    {
        kernel(args...);
    }

    // Guard the final division: with no timed runs the average is undefined
    // (0/0 would yield NaN), so report 0 ms instead.
    if(nrepeat <= 0)
    {
        return 0.0f;
    }

    const auto mStart = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < nrepeat; i++)
    {
        kernel(args...);
    }
    const auto mStop = std::chrono::high_resolution_clock::now();

    // The interval is truncated to whole microseconds, then scaled to milliseconds.
    float ms = static_cast<float>(
                   std::chrono::duration_cast<std::chrono::microseconds>(mStop - mStart).count()) *
               1e-3;

    return ms / nrepeat;
}
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include <functional> #include <functional>
#include <string.h>
#define XDNN_OK 0 #define XDNN_OK 0
#define XDNN_FAIL 1 #define XDNN_FAIL 1
......
#ifndef CK_BLOCKWISE_GEMM_AVX2_HPP #ifndef CK_BLOCKWISE_GEMM_AVX2_HPP
#define CK_BLOCKWISE_GEMM_AVX2_HPP #define CK_BLOCKWISE_GEMM_AVX2_HPP
#include "common_header.hpp" #include "ck/utility/common_header.hpp"
#include "multi_index_transform_helper.hpp" #include "ck/tensor_description/multi_index_transform_helper.hpp"
#include "tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "threadwise_gemm_avx2.hpp" #include "ck/tensor_operation/cpu/thread/threadwise_gemm_avx2.hpp"
namespace ck { namespace ck {
namespace cpu { namespace cpu {
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
#define DEVICE_BASE_CPU_HPP #define DEVICE_BASE_CPU_HPP
#include <string> #include <string>
#include "stream_config.hpp" #include "ck/stream_config.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
......
...@@ -2,8 +2,10 @@ ...@@ -2,8 +2,10 @@
#define DEVICE_CONV_FWD_CPU_HPP #define DEVICE_CONV_FWD_CPU_HPP
#include <iostream> #include <iostream>
#include "device_base_cpu.hpp" #include <memory>
#include "convolution_forward_specialization_cpu.hpp" #include <vector>
#include "ck/tensor_operation/cpu/device/device_base_cpu.hpp"
#include "ck/tensor_operation/cpu/device/convolution_forward_specialization_cpu.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
......
...@@ -4,17 +4,20 @@ ...@@ -4,17 +4,20 @@
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include <numeric> #include <numeric>
#include "device.hpp" #include <memory>
#include "device_base_cpu.hpp" #include <vector>
#include "device_conv_fwd_cpu.hpp" #include "ck/device_utility/kernel_launch.hpp"
#include "convolution_forward_specialization_cpu.hpp" #include "ck/library/host_tensor/device_memory.hpp"
#include "common_header.hpp" #include "ck/tensor_operation/cpu/device/device_base_cpu.hpp"
#include "../../gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/cpu/device/device_conv_fwd_cpu.hpp"
#include "tensor_descriptor.hpp" #include "ck/tensor_operation/cpu/device/convolution_forward_specialization_cpu.hpp"
#include "tensor_descriptor_helper.hpp" #include "ck/utility/common_header.hpp"
#include "gridwise_direct_conv_avx2.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "threadwise_gemm_avx2.hpp" #include "ck/tensor_description/tensor_descriptor.hpp"
#include "threadwise_tensor_slice_transfer_avx2_specialization.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/cpu/grid/gridwise_direct_conv_avx2.hpp"
#include "ck/tensor_operation/cpu/thread/threadwise_gemm_avx2.hpp"
#include "ck/tensor_operation/cpu/thread/threadwise_tensor_slice_transfer_avx2_specialization.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
......
...@@ -4,17 +4,20 @@ ...@@ -4,17 +4,20 @@
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include <numeric> #include <numeric>
#include "device.hpp" #include <memory>
#include "device_base_cpu.hpp" #include <vector>
#include "device_conv_fwd_cpu.hpp" #include "ck/device_utility/kernel_launch.hpp"
#include "convolution_forward_specialization_cpu.hpp" #include "ck/library/host_tensor/device_memory.hpp"
#include "common_header.hpp" #include "ck/tensor_operation/cpu/device/device_base_cpu.hpp"
#include "../../gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/cpu/device/device_conv_fwd_cpu.hpp"
#include "tensor_descriptor.hpp" #include "ck/tensor_operation/cpu/device/convolution_forward_specialization_cpu.hpp"
#include "tensor_descriptor_helper.hpp" #include "ck/utility/common_header.hpp"
#include "gridwise_gemm_avx2.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "threadwise_gemm_avx2.hpp" #include "ck/tensor_description/tensor_descriptor.hpp"
#include "threadwise_tensor_slice_transfer_avx2_specialization.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/cpu/grid/gridwise_gemm_avx2.hpp"
#include "ck/tensor_operation/cpu/thread/threadwise_gemm_avx2.hpp"
#include "ck/tensor_operation/cpu/thread/threadwise_tensor_slice_transfer_avx2_specialization.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
......
...@@ -4,17 +4,20 @@ ...@@ -4,17 +4,20 @@
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include <numeric> #include <numeric>
#include "device.hpp" #include <memory>
#include "device_base_cpu.hpp" #include <vector>
#include "device_conv_fwd_cpu.hpp" #include "ck/device_utility/kernel_launch.hpp"
#include "convolution_forward_specialization_cpu.hpp" #include "ck/library/host_tensor/device_memory.hpp"
#include "common_header.hpp" #include "ck/tensor_operation/cpu/device/device_base_cpu.hpp"
#include "../../gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/cpu/device/device_conv_fwd_cpu.hpp"
#include "tensor_descriptor.hpp" #include "ck/tensor_operation/cpu/device/convolution_forward_specialization_cpu.hpp"
#include "tensor_descriptor_helper.hpp" #include "ck/utility/common_header.hpp"
#include "gridwise_gemm_avx2.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "threadwise_gemm_avx2.hpp" #include "ck/tensor_description/tensor_descriptor.hpp"
#include "threadwise_tensor_slice_transfer_avx2_specialization.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/cpu/grid/gridwise_gemm_avx2.hpp"
#include "ck/tensor_operation/cpu/thread/threadwise_gemm_avx2.hpp"
#include "ck/tensor_operation/cpu/thread/threadwise_tensor_slice_transfer_avx2_specialization.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
......
...@@ -4,17 +4,20 @@ ...@@ -4,17 +4,20 @@
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include <numeric> #include <numeric>
#include "device.hpp" #include <memory>
#include "device_base_cpu.hpp" #include <vector>
#include "device_conv_fwd_cpu.hpp" #include "ck/device_utility/kernel_launch.hpp"
#include "convolution_forward_specialization_cpu.hpp" #include "ck/library/host_tensor/device_memory.hpp"
#include "common_header.hpp" #include "ck/tensor_operation/cpu/device/device_base_cpu.hpp"
#include "../../gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/cpu/device/device_conv_fwd_cpu.hpp"
#include "tensor_descriptor.hpp" #include "ck/tensor_operation/cpu/device/convolution_forward_specialization_cpu.hpp"
#include "tensor_descriptor_helper.hpp" #include "ck/utility/common_header.hpp"
#include "gridwise_gemm_avx2.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "threadwise_gemm_avx2.hpp" #include "ck/tensor_description/tensor_descriptor.hpp"
#include "threadwise_tensor_slice_transfer_avx2_specialization.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/cpu/grid/gridwise_gemm_avx2.hpp"
#include "ck/tensor_operation/cpu/thread/threadwise_gemm_avx2.hpp"
#include "ck/tensor_operation/cpu/thread/threadwise_tensor_slice_transfer_avx2_specialization.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
......
...@@ -4,17 +4,20 @@ ...@@ -4,17 +4,20 @@
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include <numeric> #include <numeric>
#include "device.hpp" #include <memory>
#include "device_base_cpu.hpp" #include <vector>
#include "device_conv_fwd_cpu.hpp" #include "ck/device_utility/kernel_launch.hpp"
#include "convolution_forward_specialization_cpu.hpp" #include "ck/library/host_tensor/device_memory.hpp"
#include "common_header.hpp" #include "ck/tensor_operation/cpu/device/device_base_cpu.hpp"
#include "../../gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/cpu/device/device_conv_fwd_cpu.hpp"
#include "tensor_descriptor.hpp" #include "ck/tensor_operation/cpu/device/convolution_forward_specialization_cpu.hpp"
#include "tensor_descriptor_helper.hpp" #include "ck/utility/common_header.hpp"
#include "gridwise_gemm_bias_activation_add_avx2.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "threadwise_gemm_avx2.hpp" #include "ck/tensor_description/tensor_descriptor.hpp"
#include "threadwise_tensor_slice_transfer_avx2_specialization.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/cpu/grid/gridwise_gemm_bias_activation_add_avx2.hpp"
#include "ck/tensor_operation/cpu/thread/threadwise_gemm_avx2.hpp"
#include "ck/tensor_operation/cpu/thread/threadwise_tensor_slice_transfer_avx2_specialization.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
......
...@@ -4,17 +4,20 @@ ...@@ -4,17 +4,20 @@
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include <numeric> #include <numeric>
#include "device.hpp" #include <memory>
#include "device_base_cpu.hpp" #include <vector>
#include "device_conv_fwd_cpu.hpp" #include "ck/device_utility/kernel_launch.hpp"
#include "convolution_forward_specialization_cpu.hpp" #include "ck/library/host_tensor/device_memory.hpp"
#include "common_header.hpp" #include "ck/tensor_operation/cpu/device/device_base_cpu.hpp"
#include "../../gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/cpu/device/device_conv_fwd_cpu.hpp"
#include "tensor_descriptor.hpp" #include "ck/tensor_operation/cpu/device/convolution_forward_specialization_cpu.hpp"
#include "tensor_descriptor_helper.hpp" #include "ck/utility/common_header.hpp"
#include "gridwise_gemm_bias_activation_add_avx2.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "threadwise_gemm_avx2.hpp" #include "ck/tensor_description/tensor_descriptor.hpp"
#include "threadwise_tensor_slice_transfer_avx2_specialization.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/cpu/grid/gridwise_gemm_bias_activation_add_avx2.hpp"
#include "ck/tensor_operation/cpu/thread/threadwise_gemm_avx2.hpp"
#include "ck/tensor_operation/cpu/thread/threadwise_tensor_slice_transfer_avx2_specialization.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
......
...@@ -4,17 +4,20 @@ ...@@ -4,17 +4,20 @@
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include <numeric> #include <numeric>
#include "device.hpp" #include <memory>
#include "device_base_cpu.hpp" #include <vector>
#include "device_conv_fwd_cpu.hpp" #include "ck/device_utility/kernel_launch.hpp"
#include "convolution_forward_specialization_cpu.hpp" #include "ck/library/host_tensor/device_memory.hpp"
#include "common_header.hpp" #include "ck/tensor_operation/cpu/device/device_base_cpu.hpp"
#include "../../gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/cpu/device/device_conv_fwd_cpu.hpp"
#include "tensor_descriptor.hpp" #include "ck/tensor_operation/cpu/device/convolution_forward_specialization_cpu.hpp"
#include "tensor_descriptor_helper.hpp" #include "ck/utility/common_header.hpp"
#include "gridwise_gemm_bias_activation_add_avx2.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "threadwise_gemm_avx2.hpp" #include "ck/tensor_description/tensor_descriptor.hpp"
#include "threadwise_tensor_slice_transfer_avx2_specialization.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/cpu/grid/gridwise_gemm_bias_activation_add_avx2.hpp"
#include "ck/tensor_operation/cpu/thread/threadwise_gemm_avx2.hpp"
#include "ck/tensor_operation/cpu/thread/threadwise_tensor_slice_transfer_avx2_specialization.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
......
#pragma once #pragma once
#include "data_type_cpu.hpp" #include "ck/utility/data_type_cpu.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
......
#ifndef CK_GRIDWISE_DIRECT_CONV_AVX2_HPP #ifndef CK_GRIDWISE_DIRECT_CONV_AVX2_HPP
#define CK_GRIDWISE_DIRECT_CONV_AVX2_HPP #define CK_GRIDWISE_DIRECT_CONV_AVX2_HPP
#include "common_header.hpp" #include "ck/utility/common_header.hpp"
#include "multi_index_transform_helper.hpp" #include "ck/tensor_description/multi_index_transform_helper.hpp"
#include "tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "blockwise_gemm_avx2.hpp" #include "ck/tensor_operation/cpu/block/blockwise_gemm_avx2.hpp"
#include "threadwise_tensor_slice_transfer_avx2.hpp" #include "ck/tensor_operation/cpu/thread/threadwise_tensor_slice_transfer_avx2.hpp"
#include "threadwise_tensor_slice_transfer_avx2_specialization.hpp" #include "ck/tensor_operation/cpu/thread/threadwise_tensor_slice_transfer_avx2_specialization.hpp"
#include "dynamic_buffer_cpu.hpp" #include "ck/utility/dynamic_buffer_cpu.hpp"
#include "envvar.hpp" #include "ck/utility/envvar.hpp"
#include <utility> #include <utility>
#include <unistd.h> #include <unistd.h>
#include <omp.h> #include <omp.h>
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment