Commit 4b448373 authored by carlushuang
Browse files

fix bug on merge latest develop

parent b79df771
#ifndef CK_GRIDWISE_GEMM_AVX2_HPP
#define CK_GRIDWISE_GEMM_AVX2_HPP
#include "common_header.hpp"
#include "multi_index_transform_helper.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_gemm_avx2.hpp"
#include "threadwise_tensor_slice_transfer_avx2.hpp"
#include "threadwise_tensor_slice_transfer_avx2_specialization.hpp"
#include "dynamic_buffer_cpu.hpp"
#include "envvar.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/cpu/block/blockwise_gemm_avx2.hpp"
#include "ck/tensor_operation/cpu/thread/threadwise_tensor_slice_transfer_avx2.hpp"
#include "ck/tensor_operation/cpu/thread/threadwise_tensor_slice_transfer_avx2_specialization.hpp"
#include "ck/utility/dynamic_buffer_cpu.hpp"
#include "ck/utility/envvar.hpp"
#include <utility>
#include <unistd.h>
#include <omp.h>
......
#ifndef CK_GRIDWISE_GEMM_BIAS_ACTIVATION_ADD_AVX2_HPP
#define CK_GRIDWISE_GEMM_BIAS_ACTIVATION_ADD_AVX2_HPP
#include "common_header.hpp"
#include "multi_index_transform_helper.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_gemm_avx2.hpp"
#include "threadwise_tensor_slice_transfer_avx2.hpp"
#include "threadwise_tensor_slice_transfer_avx2_specialization.hpp"
#include "dynamic_buffer_cpu.hpp"
#include "envvar.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/cpu/block/blockwise_gemm_avx2.hpp"
#include "ck/tensor_operation/cpu/thread/threadwise_tensor_slice_transfer_avx2.hpp"
#include "ck/tensor_operation/cpu/thread/threadwise_tensor_slice_transfer_avx2_specialization.hpp"
#include "ck/utility/dynamic_buffer_cpu.hpp"
#include "ck/utility/envvar.hpp"
#include <utility>
#include <unistd.h>
#include <omp.h>
......
......@@ -5,10 +5,10 @@
#if CK_USE_X86_INLINE_ASM == 0
#include <immintrin.h>
#endif
#include "common_header.hpp"
#include "../../gpu/device/tensor_layout.hpp"
#include "math.hpp"
#include "threadwise_gemm_param.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/math.hpp"
#include "ck/tensor_operation/cpu/thread/threadwise_gemm_param.hpp"
namespace ck {
namespace cpu {
......
#ifndef CK_THREADWISE_GEMM_PARAM_HPP
#define CK_THREADWISE_GEMM_PARAM_HPP
#include "common_header.hpp"
#include "math.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/utility/math.hpp"
namespace ck {
namespace cpu {
......
#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_AVX2_HPP
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_AVX2_HPP
#include "common_header.hpp"
#include "data_type_cpu.hpp"
#include "../../gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_space_filling_curve.hpp"
#include "dynamic_buffer_cpu.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/utility/data_type_cpu.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_space_filling_curve.hpp"
#include "ck/utility/dynamic_buffer_cpu.hpp"
#include <immintrin.h>
namespace ck {
......
#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_AVX2_SPECIALIZED_HPP
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_AVX2_SPECIALIZED_HPP
#include "common_header.hpp"
#include "data_type_cpu.hpp"
#include "../../gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_space_filling_curve.hpp"
#include "dynamic_buffer_cpu.hpp"
#include "element_wise_operation_cpu.hpp"
#include "convolution_forward_specialization_cpu.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/utility/data_type_cpu.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_space_filling_curve.hpp"
#include "ck/utility/dynamic_buffer_cpu.hpp"
#include "ck/tensor_operation/cpu/element/element_wise_operation_cpu.hpp"
#include "ck/tensor_operation/cpu/device/convolution_forward_specialization_cpu.hpp"
#include <immintrin.h>
namespace ck {
......
......@@ -2,37 +2,6 @@
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "config.hpp"
#include "array.hpp"
#include "container_helper.hpp"
#include "statically_indexed_array.hpp"
#include "container_element_picker.hpp"
#include "multi_index.hpp"
#include "data_type.hpp"
#include "data_type_enum.hpp"
#include "data_type_enum_helper.hpp"
#include "functional.hpp"
#include "functional2.hpp"
#include "functional3.hpp"
#include "functional4.hpp"
#include "enable_if.hpp"
#include "ignore.hpp"
#include "integral_constant.hpp"
#include "math.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "sequence_helper.hpp"
#include "tuple.hpp"
#include "tuple_helper.hpp"
#include "type.hpp"
#include "magic_division.hpp"
#include "c_style_pointer_cast.hpp"
#include "is_known_at_compile_time.hpp"
#include "transpose_vectors.hpp"
#include "inner_product.hpp"
// #include "element_wise_operation.hpp"
#include "thread_group.hpp"
#include "debug.hpp"
#include "ck/ck.hpp"
#include "ck/utility/array.hpp"
......
......@@ -4,9 +4,6 @@
#pragma once
#include "ck/utility/statically_indexed_array.hpp"
#ifdef CK_NOGPU
#include "half.hpp"
#endif
namespace ck {
......
#pragma once
#include <immintrin.h>
#include "half.hpp"
namespace ck {
......
#ifndef CK_BUFFER_CPU_HPP
#define CK_BUFFER_CPU_HPP
#include "config.hpp"
#include "ck/ck.hpp"
#include "enable_if.hpp"
#include "data_type_cpu.hpp"
......
......@@ -9,6 +9,10 @@
#include "type.hpp"
#include "enable_if.hpp"
#ifndef CK_NOCPU
#include <math.h>
#endif
namespace ck {
namespace math {
......@@ -144,6 +148,7 @@ __host__ __device__ constexpr auto min(X x, Ys... ys)
return min(x, min(ys...));
}
#ifndef CK_NOGPU
// disallow implicit type casting
template <typename T>
__device__ T exp(T x);
......@@ -161,6 +166,7 @@ __device__ double exp<double>(double x)
{
return exp(x);
}
#endif
// greatest common divisor, aka highest common factor
__host__ __device__ constexpr index_t gcd(index_t x, index_t y)
......
# The GPU operation instances are only added when CK_NOGPU is not set.
if(NOT CK_NOGPU)
add_subdirectory(src/tensor_operation_instance/gpu)
endif()
# The host tensor helpers, the utilities and the CPU operation instances are added unconditionally.
add_subdirectory(src/host_tensor)
add_subdirectory(src/utility)
add_subdirectory(src/tensor_operation_instance/cpu)
......@@ -3,6 +3,8 @@
#pragma once
#include <cstddef>
#ifndef CK_NOGPU
#include <hip/hip_runtime.h>
template <typename T>
......@@ -38,3 +40,20 @@ struct DeviceMem
void* mpDeviceBuf;
std::size_t mMemSize;
};
#endif
/// Owner of a host-memory buffer allocated with a caller-chosen alignment.
///
/// The method names mirror those of DeviceMem (GetDeviceBuffer / ToDevice / FromDevice /
/// SetZero), but every operation acts on ordinary host memory.
///
/// The object owns the allocation behind mpDeviceBuf and releases it in its destructor.
/// Copying is therefore disabled: a member-wise copy would leave two objects releasing the
/// same pointer.
struct DeviceAlignedMemCPU
{
    DeviceAlignedMemCPU() = delete;

    /// Creates a buffer of `mem_size` bytes aligned to `alignment` bytes.
    DeviceAlignedMemCPU(std::size_t mem_size, std::size_t alignment);

    // Non-copyable, see the ownership note on the struct.
    DeviceAlignedMemCPU(const DeviceAlignedMemCPU&) = delete;
    DeviceAlignedMemCPU& operator=(const DeviceAlignedMemCPU&) = delete;

    /// Returns the start of the owned buffer.
    void* GetDeviceBuffer();

    /// Returns the buffer size in bytes.
    std::size_t GetBufferSize();

    /// Copies GetBufferSize() bytes from `p` into the buffer.
    void ToDevice(const void* p);

    /// Copies GetBufferSize() bytes from the buffer into `p`.
    void FromDevice(void* p);

    /// Fills the buffer with zero bytes.
    void SetZero();

    ~DeviceAlignedMemCPU();

    void* mpDeviceBuf;      // start of the owned allocation
    std::size_t mMemSize;   // size of the allocation in bytes
    std::size_t mAlignment; // alignment requested at construction, in bytes
};
......@@ -17,6 +17,7 @@ target_include_directories(host_tensor PUBLIC
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host_tensor>"
)
if(NOT CK_NOGPU)
rocm_install(
TARGETS host_tensor
EXPORT host_tensorTargets
......@@ -28,5 +29,19 @@ rocm_install(
NAMESPACE composable_kernel::
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
)
else()
# Install the host_tensor target with plain CMake install(), registering it in the
# host_tensorTargets export set.
install(TARGETS host_tensor
EXPORT host_tensorTargets
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
)
# Write the export set to composable_kernelhost_tensorTargets.cmake under the
# composable_kernel:: namespace.
install(EXPORT host_tensorTargets
FILE composable_kernelhost_tensorTargets.cmake
NAMESPACE composable_kernel::
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
)
endif()
clang_tidy_check(host_tensor)
......@@ -4,6 +4,11 @@
#include "ck/device_utility/hip_check_error.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <new>
#ifndef CK_NOGPU
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
hip_check_error(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
......@@ -26,3 +31,38 @@ void DeviceMem::FromDevice(void* p)
// Sets all mMemSize bytes of the device buffer to zero with hipMemset and hands the returned
// status to hip_check_error.
void DeviceMem::SetZero() { hip_check_error(hipMemset(mpDeviceBuf, 0, mMemSize)); }
// Releases the device buffer with hipFree and hands the returned status to hip_check_error.
// NOTE(review): if hip_check_error throws on failure, this would throw from a destructor — confirm.
DeviceMem::~DeviceMem() { hip_check_error(hipFree(mpDeviceBuf)); }
#endif
/// Allocates `mem_size` bytes of host memory aligned to `alignment` bytes.
///
/// @param mem_size   Number of bytes to allocate. Zero performs no allocation and leaves the
///                   buffer pointer null.
/// @param alignment  Requested alignment in bytes; must be a non-zero power of two (checked
///                   with assert). Values smaller than sizeof(void*) are raised to
///                   sizeof(void*) for the allocation call only; mAlignment keeps the
///                   requested value.
/// @throws std::bad_alloc when posix_memalign reports a failure. Previously the result was only
///         checked with assert, so a release build continued with an indeterminate pointer that
///         the destructor would later pass to free().
DeviceAlignedMemCPU::DeviceAlignedMemCPU(std::size_t mem_size, std::size_t alignment)
    : mpDeviceBuf(nullptr), mMemSize(mem_size), mAlignment(alignment)
{
    if(mem_size == 0)
    {
        return; // nothing to allocate, the buffer stays null
    }

    assert(!(alignment == 0 || (alignment & (alignment - 1)))); // check pow of 2

    // posix_memalign also requires the alignment to be a multiple of sizeof(void*). Raising a
    // smaller power of two to sizeof(void*) still satisfies the caller's request.
    const std::size_t effective_alignment =
        alignment < sizeof(void*) ? sizeof(void*) : alignment;

    // TODO: posix only
    const int rtn = posix_memalign(&mpDeviceBuf, effective_alignment, mem_size);
    if(rtn != 0)
    {
        // The pointer value is unspecified after a failed call; reset it before reporting.
        mpDeviceBuf = nullptr;
        throw std::bad_alloc();
    }
}
void* DeviceAlignedMemCPU::GetDeviceBuffer() { return mpDeviceBuf; }
std::size_t DeviceAlignedMemCPU::GetBufferSize() { return mMemSize; }
// Copies mMemSize bytes from `p` into the owned buffer.
// `p` must point to at least mMemSize readable bytes.
void DeviceAlignedMemCPU::ToDevice(const void* p)
{
    // A zero-sized buffer has a null mpDeviceBuf, and memcpy with a null pointer is undefined
    // behavior even when the byte count is zero, so skip the call entirely.
    if(mMemSize == 0)
    {
        return;
    }
    memcpy(mpDeviceBuf, p, mMemSize);
}
// Copies mMemSize bytes from the owned buffer into `p`.
// `p` must point to at least mMemSize writable bytes.
void DeviceAlignedMemCPU::FromDevice(void* p)
{
    // A zero-sized buffer has a null mpDeviceBuf, and memcpy with a null pointer is undefined
    // behavior even when the byte count is zero, so skip the call entirely.
    if(mMemSize == 0)
    {
        return;
    }
    memcpy(p, mpDeviceBuf, mMemSize);
}
// Fills all mMemSize bytes of the owned buffer with zero.
void DeviceAlignedMemCPU::SetZero()
{
    // A zero-sized buffer has a null mpDeviceBuf, and memset on a null pointer is undefined
    // behavior even when the byte count is zero, so skip the call entirely.
    if(mMemSize == 0)
    {
        return;
    }
    memset(mpDeviceBuf, 0, mMemSize);
}
// Gives the owned allocation back to the C allocator.
DeviceAlignedMemCPU::~DeviceAlignedMemCPU()
{
    // free() is defined to do nothing for a null pointer, so the empty-buffer case needs no
    // separate branch.
    free(mpDeviceBuf);
}
......@@ -2,7 +2,7 @@
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cassert>
#include "ck/ck.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
void HostTensorDescriptor::CalculateStrides()
......
#include <stdlib.h>
#include <utility>
#include "config.hpp"
#include "convolution_forward_specialization_cpu.hpp"
#include "device_convnd_direct_fwd_avx2_nhwc_kyxck8_nhwk.hpp"
#include "element_wise_operation_cpu.hpp"
#include "device_operation_instance.hpp"
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/cpu/device/convolution_forward_specialization_cpu.hpp"
#include "ck/tensor_operation/cpu/device/device_convnd_direct_fwd_avx2_nhwc_kyxck8_nhwk.hpp"
#include "ck/tensor_operation/cpu/element/element_wise_operation_cpu.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
......@@ -58,7 +59,7 @@ static constexpr auto LoopOver_MKN = ck::tensor_operation::cpu::device::LoopOver
void add_device_conv2d_direct_fwd_avx2_nhwc_kyxck8_nhwk(
std::vector<DeviceConvFwdPtr<PT, PT, PT>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
ck::tensor_operation::device::instance::add_device_operation_instances(
instances,
std::make_tuple(
// clang-format off
......
#include <stdlib.h>
#include <utility>
#include "convolution_forward_specialization_cpu.hpp"
#include "config.hpp"
#include "device_convnd_fwd_avx2_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation_cpu.hpp"
#include "device_operation_instance.hpp"
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/cpu/device/convolution_forward_specialization_cpu.hpp"
#include "ck/tensor_operation/cpu/device/device_convnd_fwd_avx2_nhwc_kyxc_nhwk.hpp"
#include "ck/tensor_operation/cpu/element/element_wise_operation_cpu.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
......@@ -64,7 +65,7 @@ static constexpr auto LoopOver_MKN = ck::tensor_operation::cpu::device::LoopOver
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(std::vector<DeviceConvFwdPtr<PT, PT, PT>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
ck::tensor_operation::device::instance::add_device_operation_instances(
instances,
std::make_tuple(
// clang-format off
......@@ -85,7 +86,7 @@ void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(std::vector<DeviceConvFwdPtr<PT,
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_local_c(
std::vector<DeviceConvFwdPtr<PT, PT, PT>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
ck::tensor_operation::device::instance::add_device_operation_instances(
instances,
std::make_tuple(
// clang-format off
......@@ -106,7 +107,7 @@ void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_local_c(
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_mt(
std::vector<DeviceConvFwdPtr<PT, PT, PT>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
ck::tensor_operation::device::instance::add_device_operation_instances(
instances,
std::make_tuple(
// clang-format off
......@@ -143,7 +144,7 @@ void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_mt(
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_relu(
std::vector<DeviceConvFwdPtr<PT, PT, Relu>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
ck::tensor_operation::device::instance::add_device_operation_instances(
instances,
std::make_tuple(
// clang-format off
......@@ -164,7 +165,7 @@ void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_relu(
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_local_c_relu(
std::vector<DeviceConvFwdPtr<PT, PT, Relu>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
ck::tensor_operation::device::instance::add_device_operation_instances(
instances,
std::make_tuple(
// clang-format off
......@@ -185,7 +186,7 @@ void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_local_c_relu(
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_mt_relu(
std::vector<DeviceConvFwdPtr<PT, PT, Relu>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
ck::tensor_operation::device::instance::add_device_operation_instances(
instances,
std::make_tuple(
// clang-format off
......
#include <stdlib.h>
#include <utility>
#include "config.hpp"
#include "convolution_forward_specialization_cpu.hpp"
#include "device_convnd_fwd_avx2_nhwc_kyxck8_nhwk.hpp"
#include "element_wise_operation_cpu.hpp"
#include "device_operation_instance.hpp"
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/cpu/device/convolution_forward_specialization_cpu.hpp"
#include "ck/tensor_operation/cpu/device/device_convnd_fwd_avx2_nhwc_kyxck8_nhwk.hpp"
#include "ck/tensor_operation/cpu/element/element_wise_operation_cpu.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
......@@ -58,7 +59,7 @@ static constexpr auto LoopOver_MKN = ck::tensor_operation::cpu::device::LoopOver
void add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk(
std::vector<DeviceConvFwdPtr<PT, PT, PT>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
ck::tensor_operation::device::instance::add_device_operation_instances(
instances,
std::make_tuple(
// clang-format off
......@@ -79,7 +80,7 @@ void add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk(
void add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_local_c(
std::vector<DeviceConvFwdPtr<PT, PT, PT>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
ck::tensor_operation::device::instance::add_device_operation_instances(
instances,
std::make_tuple(
// clang-format off
......@@ -100,7 +101,7 @@ void add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_local_c(
void add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_mt(
std::vector<DeviceConvFwdPtr<PT, PT, PT>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
ck::tensor_operation::device::instance::add_device_operation_instances(
instances,
std::make_tuple(
// clang-format off
......@@ -137,7 +138,7 @@ void add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_mt(
void add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_relu(
std::vector<DeviceConvFwdPtr<PT, PT, Relu>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
ck::tensor_operation::device::instance::add_device_operation_instances(
instances,
std::make_tuple(
// clang-format off
......@@ -158,7 +159,7 @@ void add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_relu(
void add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_local_c_relu(
std::vector<DeviceConvFwdPtr<PT, PT, Relu>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
ck::tensor_operation::device::instance::add_device_operation_instances(
instances,
std::make_tuple(
// clang-format off
......@@ -179,7 +180,7 @@ void add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_local_c_relu(
void add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_mt_relu(
std::vector<DeviceConvFwdPtr<PT, PT, Relu>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
ck::tensor_operation::device::instance::add_device_operation_instances(
instances,
std::make_tuple(
// clang-format off
......
#include <stdlib.h>
#include <utility>
#include "config.hpp"
#include "convolution_forward_specialization_cpu.hpp"
#include "device_convnd_fwd_avx2_nhwc_yxck_nhwk.hpp"
#include "element_wise_operation_cpu.hpp"
#include "device_operation_instance.hpp"
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/cpu/device/convolution_forward_specialization_cpu.hpp"
#include "ck/tensor_operation/cpu/device/device_convnd_fwd_avx2_nhwc_yxck_nhwk.hpp"
#include "ck/tensor_operation/cpu/element/element_wise_operation_cpu.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
......@@ -56,7 +57,7 @@ static constexpr auto LoopOver_MKN = ck::tensor_operation::cpu::device::LoopOver
void add_device_conv2d_fwd_avx2_nhwc_yxck_nhwk(std::vector<DeviceConvFwdPtr<PT, PT, PT>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
ck::tensor_operation::device::instance::add_device_operation_instances(
instances,
std::make_tuple(
// clang-format off
......@@ -77,7 +78,7 @@ void add_device_conv2d_fwd_avx2_nhwc_yxck_nhwk(std::vector<DeviceConvFwdPtr<PT,
void add_device_conv2d_fwd_avx2_nhwc_yxck_nhwk_local_c(
std::vector<DeviceConvFwdPtr<PT, PT, PT>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
ck::tensor_operation::device::instance::add_device_operation_instances(
instances,
std::make_tuple(
// clang-format off
......@@ -98,7 +99,7 @@ void add_device_conv2d_fwd_avx2_nhwc_yxck_nhwk_local_c(
void add_device_conv2d_fwd_avx2_nhwc_yxck_nhwk_mt(
std::vector<DeviceConvFwdPtr<PT, PT, PT>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
ck::tensor_operation::device::instance::add_device_operation_instances(
instances,
std::make_tuple(
// clang-format off
......@@ -136,7 +137,7 @@ void add_device_conv2d_fwd_avx2_nhwc_yxck_nhwk_mt(
void add_device_conv2d_fwd_avx2_nhwc_yxck_nhwk_relu(
std::vector<DeviceConvFwdPtr<PT, PT, Relu>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
ck::tensor_operation::device::instance::add_device_operation_instances(
instances,
std::make_tuple(
// clang-format off
......@@ -157,7 +158,7 @@ void add_device_conv2d_fwd_avx2_nhwc_yxck_nhwk_relu(
void add_device_conv2d_fwd_avx2_nhwc_yxck_nhwk_local_c_relu(
std::vector<DeviceConvFwdPtr<PT, PT, Relu>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
ck::tensor_operation::device::instance::add_device_operation_instances(
instances,
std::make_tuple(
// clang-format off
......@@ -178,7 +179,7 @@ void add_device_conv2d_fwd_avx2_nhwc_yxck_nhwk_local_c_relu(
void add_device_conv2d_fwd_avx2_nhwc_yxck_nhwk_mt_relu(
std::vector<DeviceConvFwdPtr<PT, PT, Relu>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
ck::tensor_operation::device::instance::add_device_operation_instances(
instances,
std::make_tuple(
// clang-format off
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment