Unverified commit 81c942cd, authored by Chao Liu, committed by GitHub
Browse files

Deprecate static kernel (#42)

* deprecate static kernels
parent b8b2d0a6
#ifndef CK_THREADWISE_GENERIC_TENSOR_OP_HPP
#define CK_THREADWISE_GENERIC_TENSOR_OP_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
#include "ConstantMergedTensorDescriptor_deprecated.hpp"
namespace ck {
// Stores zero into the buffer `p` at the offsets produced by the descriptor type TDesc.
//
// TDesc - descriptor type. Only its static members GetLengths() and
//         GetOffsetFromMultiIndex() are used; the (unnamed) argument value is ignored.
// p     - destination buffer. Every offset returned by
//         TDesc::GetOffsetFromMultiIndex() is used to index it directly.
template <class Float, class TDesc>
__device__ void threadwise_generic_tensor_set_zero(TDesc, Float* __restrict__ p)
{
    using Lengths = decltype(TDesc::GetLengths());

    // Applied to one multi-index at a time; the offset is a compile-time constant.
    auto store_zero = [&](auto idx) {
        constexpr index_t dst_offset = TDesc::GetOffsetFromMultiIndex(idx);

        p[dst_offset] = static_cast<Float>(0);
    };

    // NOTE(review): static_ford presumably calls the functor once for every
    // multi-index inside Lengths -- confirm against its definition.
    static_ford<Lengths>{}(store_zero);
}
} // namespace ck
#endif
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
#define CK_XDLOPS_GEMM_HPP #define CK_XDLOPS_GEMM_HPP
#include "common_header.hpp" #include "common_header.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "math.hpp" #include "math.hpp"
#include "amd_xdlops.hpp" #include "amd_xdlops.hpp"
......
...@@ -23,6 +23,48 @@ amd_inner_product_dlop<float, float, float>(const float& a, const float& b, floa ...@@ -23,6 +23,48 @@ amd_inner_product_dlop<float, float, float>(const float& a, const float& b, floa
#endif #endif
} }
// Inner-product step for 2-wide float vectors.
//
// Splits `a` and `b` into their two float lanes and forwards each lane pair, lane 0
// first and then lane 1, to the scalar float overload of amd_inner_product_dlop with
// the same accumulator `c`. The lane order matches the original so the sequence of
// updates to `c` is unchanged.
//
// NOTE(review): the scalar overload is expected to accumulate a * b into c -- its body
// is not fully visible here, confirm.
//
// Declared `inline`: an explicit (full) specialization is not implicitly inline, so a
// definition in a header would otherwise be a multiple definition once the header is
// included from more than one translation unit.
template <>
inline __device__ void
amd_inner_product_dlop<float2_t, float2_t, float>(const float2_t& a, const float2_t& b, float& c)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    // Wrap each operand once instead of building a temporary per lane access.
    vector_type<float, 2> a_vec{a};
    vector_type<float, 2> b_vec{b};

    amd_inner_product_dlop(a_vec.AsType<float>()[I0], b_vec.AsType<float>()[I0], c);
    amd_inner_product_dlop(a_vec.AsType<float>()[I1], b_vec.AsType<float>()[I1], c);
}
// Inner-product step for 4-wide float vectors.
//
// Splits `a` and `b` into their four float lanes and forwards each lane pair, in lane
// order 0, 1, 2, 3, to the scalar float overload of amd_inner_product_dlop with the
// same accumulator `c`. The lane order matches the original so the sequence of updates
// to `c` is unchanged.
//
// NOTE(review): the scalar overload is expected to accumulate a * b into c -- its body
// is not fully visible here, confirm.
//
// Declared `inline`: an explicit (full) specialization is not implicitly inline, so a
// definition in a header would otherwise be a multiple definition once the header is
// included from more than one translation unit.
template <>
inline __device__ void
amd_inner_product_dlop<float4_t, float4_t, float>(const float4_t& a, const float4_t& b, float& c)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    // Wrap each operand once instead of building a temporary per lane access.
    vector_type<float, 4> a_vec{a};
    vector_type<float, 4> b_vec{b};

    amd_inner_product_dlop(a_vec.AsType<float>()[I0], b_vec.AsType<float>()[I0], c);
    amd_inner_product_dlop(a_vec.AsType<float>()[I1], b_vec.AsType<float>()[I1], c);
    amd_inner_product_dlop(a_vec.AsType<float>()[I2], b_vec.AsType<float>()[I2], c);
    amd_inner_product_dlop(a_vec.AsType<float>()[I3], b_vec.AsType<float>()[I3], c);
}
#if CK_USE_AMD_DLOP #if CK_USE_AMD_DLOP
template <> template <>
__device__ void __device__ void
......
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
#include "functional2.hpp" #include "functional2.hpp"
#include "functional3.hpp" #include "functional3.hpp"
#include "functional4.hpp" #include "functional4.hpp"
#include "in_memory_operation.hpp"
#include "integral_constant.hpp" #include "integral_constant.hpp"
#include "math.hpp" #include "math.hpp"
#include "number.hpp" #include "number.hpp"
...@@ -25,6 +24,7 @@ ...@@ -25,6 +24,7 @@
#include "type.hpp" #include "type.hpp"
#include "utility.hpp" #include "utility.hpp"
#include "magic_division.hpp" #include "magic_division.hpp"
#include "amd_buffer_addressing_v2.hpp"
#include "static_buffer.hpp" #include "static_buffer.hpp"
#include "dynamic_buffer.hpp" #include "dynamic_buffer.hpp"
......
This diff is collapsed.
#ifndef CK_SYNCHRONIZATION_NVIDIA_HPP
#define CK_SYNCHRONIZATION_NVIDIA_HPP
#include "config.hpp"
namespace ck {
// Block-level barrier: calls __syncthreads(), so it must be reached by every thread of
// the block.
// `inline` because this definition lives in an include-guarded header; without it the
// function is defined once per including translation unit, which breaks the
// one-definition rule (and fails to link under relocatable device code).
inline __device__ void block_sync_lds() { __syncthreads(); }
// Block-level barrier: in this header it is implemented with the same __syncthreads()
// call as block_sync_lds(), so it must be reached by every thread of the block.
// `inline` because this definition lives in an include-guarded header; without it the
// function is defined once per including translation unit, which breaks the
// one-definition rule (and fails to link under relocatable device code).
inline __device__ void block_sync_lds_vmem() { __syncthreads(); }
} // namespace ck
#endif
// Kernel entry point with C linkage and an intentionally empty body: none of the three
// pointer arguments is read or written.
// NOTE(review): presumably kept as a placeholder so the symbol still exists after the
// static kernels were deprecated -- confirm with the callers.
extern "C" __global__ void
gridwise_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer(
    const void* const __restrict__ p_in_global,
    const void* const __restrict__ p_wei_global,
    void* const __restrict__ p_out_global)
{
}
// Kernel entry point with C linkage and an intentionally empty body: none of the three
// pointer arguments is read or written.
// NOTE(review): presumably kept as a placeholder so the symbol still exists after the
// static kernels were deprecated -- confirm with the callers.
extern "C" __global__ void
gridwise_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(
    const void* const __restrict__ p_in_global,
    const void* const __restrict__ p_wei_global,
    void* const __restrict__ p_out_global)
{
}
// Kernel entry point with C linkage and an intentionally empty body: none of the three
// pointer arguments is read or written.
// NOTE(review): presumably kept as a placeholder so the symbol still exists after the
// static kernels were deprecated -- confirm with the callers.
extern "C" __global__ void
gridwise_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
    const void* const __restrict__ p_in_global,
    const void* const __restrict__ p_wei_global,
    void* const __restrict__ p_out_global)
{
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment