"vscode:/vscode.git/clone" did not exist on "3f0bc6812b0721ff68b0582172a421b5c93a873f"
Unverified commit 562e1e27, authored by Chao Liu, committed by GitHub

MIOpen integration: recent bug fixes from MIOpen (#5)

parent 52c3fe05
@@ -114,7 +114,7 @@ struct ThreadwiseGemmTransANormalBNormalC
const index_t cindex_0 = MatrixC::CalculateOffset(m, 0);
const index_t cindex_1 = MatrixC::CalculateOffset(m, 1);
__outer_product_1x2(
amd_assembly_outer_product_1x2(
p_a[aindex], p_b[bindex_0], p_b[bindex_1], p_c[cindex_0], p_c[cindex_1]);
});
@@ -129,7 +129,7 @@ struct ThreadwiseGemmTransANormalBNormalC
const index_t cindex_2 = MatrixC::CalculateOffset(m, 2);
const index_t cindex_3 = MatrixC::CalculateOffset(m, 3);
__outer_product_1x4(p_a[aindex],
amd_assembly_outer_product_1x4(p_a[aindex],
p_b[bindex_0],
p_b[bindex_1],
p_b[bindex_2],
......
@@ -123,7 +123,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto fwd) {
#if CK_USE_AMD_BUFFER_ADDRESSING
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
__buffer_load<SrcData, SrcDataPerAccess>(
amd_intrinsic_buffer_load<SrcData, SrcDataPerAccess>(
fwd(p_src), src_coord.GetOffset(), 0);
#else
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
@@ -162,7 +162,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
{
static_if<DstAddressSpace == AddressSpace::global>{}([&](auto fwd) {
#if CK_USE_AMD_BUFFER_ADDRESSING
__buffer_store<DstData, DstDataPerAccess>(
amd_intrinsic_buffer_store<DstData, DstDataPerAccess>(
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
fwd(p_dst),
dst_coord.GetOffset(),
@@ -311,7 +311,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto) {
#if CK_USE_AMD_BUFFER_ADDRESSING
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
__buffer_load<SrcData, SrcDataPerAccess>(
amd_intrinsic_buffer_load<SrcData, SrcDataPerAccess>(
p_src, src_nonlinear_coord.GetOffset(), src_linear_offset);
#else
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
@@ -503,7 +503,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
{
static_if<DstAddressSpace == AddressSpace::global>{}([&](auto) {
#if CK_USE_AMD_BUFFER_ADDRESSING
__buffer_store<DstData, DstDataPerAccess>(
amd_intrinsic_buffer_store<DstData, DstDataPerAccess>(
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
p_dst,
dst_nonlinear_coord.GetOffset(),
......
@@ -335,7 +335,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated
// 3. src_merged_offset can be runtime value (no assumption imposed)
static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto fwd) {
#if CK_USE_AMD_BUFFER_ADDRESSING
vector_data = __buffer_load<SrcData, SrcDataPerAccess>(
vector_data = amd_intrinsic_buffer_load<SrcData, SrcDataPerAccess>(
fwd(p_src), src_merged_offset, src_normal_offset);
#else
vector_data = *reinterpret_cast<const src_vector_t*>(
@@ -375,7 +375,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated
// copy data from buffer into dst
{
using dst_vector_t = typename vector_type<SrcData, DstDataPerAccess>::MemoryType;
using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
@@ -420,7 +420,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated
const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
reinterpret_cast<SrcData*>(&vector_data)[i] = p_dst_buffer[buffer_offset];
reinterpret_cast<DstData*>(&vector_data)[i] = p_dst_buffer[buffer_offset];
}
// offset w.r.t. normal dimension is known at compile-time
@@ -444,7 +444,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated
// 3. dst_merged_offset can be runtime value (no assumption imposed)
static_if<DstAddressSpace == AddressSpace::global>{}([&](auto fwd) {
#if CK_USE_AMD_BUFFER_ADDRESSING
__buffer_store<SrcData, DstDataPerAccess>(
amd_intrinsic_buffer_store<DstData, DstDataPerAccess>(
vector_data, fwd(p_dst), dst_merged_offset, dst_normal_offset);
#else
*reinterpret_cast<dst_vector_t*>(
......
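
The four hunks above apply the same renaming inside an identical dispatch pattern. A condensed sketch of the load side follows (illustrative only, not part of the commit; the function name load_long_vector and the zero constant offset are assumptions):

// Illustrative sketch only: for a global-memory source with
// CK_USE_AMD_BUFFER_ADDRESSING enabled, the copy goes through the renamed AMD
// buffer intrinsic wrapper; otherwise it falls back to a plain vector load.
template <typename SrcData, index_t SrcDataPerAccess>
__device__ void load_long_vector(SrcData* p_src_long_vector,
                                 index_t buffer_offset,
                                 const SrcData* p_src,
                                 index_t src_offset)
{
    using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;

#if CK_USE_AMD_BUFFER_ADDRESSING
    // hardware buffer load: thread data offset is src_offset, const offset is 0
    *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
        amd_intrinsic_buffer_load<SrcData, SrcDataPerAccess>(p_src, src_offset, 0);
#else
    // generic path: reinterpret the source pointer as a vector and dereference
    *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
        *reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
#endif
}
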
@@ -19,53 +19,54 @@ __device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load");
bool slc) __asm("llvm.amdgcn.buffer.load.f32");
__device__ float2_t __llvm_amdgcn_buffer_loadx2(int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2");
bool slc) __asm("llvm.amdgcn.buffer.load.v2f32");
__device__ float4_t __llvm_amdgcn_buffer_loadx4(int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4");
bool slc) __asm("llvm.amdgcn.buffer.load.v4f32");
__device__ void __llvm_amdgcn_buffer_store(float vdata,
int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store");
bool slc) __asm("llvm.amdgcn.buffer.store.f32");
__device__ void __llvm_amdgcn_buffer_storex2(float2_t vdata,
int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.dwordx2");
bool slc) __asm("llvm.amdgcn.buffer.store.v2f32");
__device__ void __llvm_amdgcn_buffer_storex4(float4_t vdata,
int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.dwordx4");
bool slc) __asm("llvm.amdgcn.buffer.store.v4f32");
template <typename T, index_t VectorSize>
__device__ typename vector_type<T, VectorSize>::MemoryType
__buffer_load(const T* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset);
__device__ typename vector_type<T, VectorSize>::MemoryType amd_intrinsic_buffer_load(
const T* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset);
template <typename T, index_t VectorSize>
__device__ void __buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
__device__ void
amd_intrinsic_buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
T* p_dst_block,
index_t dst_thread_data_offset,
index_t dst_const_data_offset);
template <>
__device__ float __buffer_load<float, 1>(const float* p_src_block,
__device__ float amd_intrinsic_buffer_load<float, 1>(const float* p_src_block,
index_t src_thread_data_offset,
index_t src_const_data_offset)
{
@@ -100,7 +101,7 @@ __device__ float __buffer_load<float, 1>(const float* p_src_block,
}
template <>
__device__ float2_t __buffer_load<float, 2>(const float* p_src_block,
__device__ float2_t amd_intrinsic_buffer_load<float, 2>(const float* p_src_block,
index_t src_thread_data_offset,
index_t src_const_data_offset)
{
@@ -135,7 +136,7 @@ __device__ float2_t __buffer_load<float, 2>(const float* p_src_block,
}
template <>
__device__ float4_t __buffer_load<float, 4>(const float* p_src_block,
__device__ float4_t amd_intrinsic_buffer_load<float, 4>(const float* p_src_block,
index_t src_thread_data_offset,
index_t src_const_data_offset)
{
@@ -170,7 +171,7 @@ __device__ float4_t __buffer_load<float, 4>(const float* p_src_block,
}
template <>
__device__ void __buffer_store<float, 1>(const float& src,
__device__ void amd_intrinsic_buffer_store<float, 1>(const float& src,
float* p_dst_block,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
@@ -207,7 +208,7 @@ __device__ void __buffer_store<float, 1>(const float& src,
}
template <>
__device__ void __buffer_store<float, 2>(const float2_t& src,
__device__ void amd_intrinsic_buffer_store<float, 2>(const float2_t& src,
float* p_dst_block,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
@@ -244,7 +245,7 @@ __device__ void __buffer_store<float, 2>(const float2_t& src,
}
template <>
__device__ void __buffer_store<float, 4>(const float4_t& src,
__device__ void amd_intrinsic_buffer_store<float, 4>(const float4_t& src,
float* p_dst_block,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
......
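
For orientation, a minimal usage sketch of the renamed wrappers (not part of this commit; the function name and per-thread indexing are illustrative assumptions, and the offsets are element counts per the *_data_offset parameter names):

// Illustrative sketch only: each thread moves one float4 through the renamed
// buffer-addressing wrappers. thread_offset is an element offset into the
// block; the trailing 0 is passed as the const data offset argument.
__device__ void copy_float4_per_thread(const float* p_src_block, float* p_dst_block)
{
    const index_t thread_offset = threadIdx.x * 4;

    const float4_t v =
        amd_intrinsic_buffer_load<float, 4>(p_src_block, thread_offset, 0);

    amd_intrinsic_buffer_store<float, 4>(v, p_dst_block, thread_offset, 0);
}
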
@@ -6,7 +6,7 @@
namespace ck {
// outer-product: c[i,j] += inner_product(a[i], b[j])
__device__ void __outer_product_1x2(float a, float b0, float b1, float& c0, float& c1)
__device__ void amd_assembly_outer_product_1x2(float a, float b0, float b1, float& c0, float& c1)
{
// disable inline asm due to the compiler issue: SWDEV-202749
///\to-do: enable the inline asm after the compiler fix
@@ -24,7 +24,7 @@ __device__ void __outer_product_1x2(float a, float b0, float b1, float& c0, floa
}
// outer-product: c[i,j] += inner_product(a[i], b[j])
__device__ void __outer_product_1x4(
__device__ void amd_assembly_outer_product_1x4(
float a, float b0, float b1, float b2, float b3, float& c0, float& c1, float& c2, float& c3)
{
asm volatile("\n \
@@ -38,11 +38,12 @@ __device__ void __outer_product_1x4(
}
// outer-product: c[i,j] += inner_product(a[i], b[j])
__device__ void __outer_product_1x2(half2_t a, half2_t b0, half2_t b1, float& c0, float& c1)
__device__ void
amd_assembly_outer_product_1x2(half2_t a, half2_t b0, half2_t b1, float& c0, float& c1)
{
asm volatile("\n \
v_dot2_f32_f16 %0, %2, %3 %0\n \
v_dot2_f32_f16 %1, %2, %4 %1\n \
v_dot2_f32_f16 %0, %2, %3, %0\n \
v_dot2_f32_f16 %1, %2, %4, %1\n \
"
: "=v"(c0), "=v"(c1) // Dest registers
: "v"(a), // 1st Src register for 1 half2 registers
@@ -53,7 +54,8 @@ __device__ void __outer_product_1x2(half2_t a, half2_t b0, half2_t b1, float& c0
}
// outer-product: c[i,j] += inner_product(a[i], b[j])
__device__ void __outer_product_1x2(half4_t a, half4_t b0, half4_t b1, float& c0, float& c1)
__device__ void
amd_assembly_outer_product_1x2(half4_t a, half4_t b0, half4_t b1, float& c0, float& c1)
{
const half2_t* p_a_half2 = reinterpret_cast<const half2_t*>(&a);
const half2_t* p_b0_half2 = reinterpret_cast<const half2_t*>(&b0);
@@ -61,10 +63,10 @@ __device__ void __outer_product_1x2(half4_t a, half4_t b0, half4_t b1, float& c0
// do dot2 two times
asm volatile("\n \
v_dot2_f32_f16 %0, %2, %4 %0\n \
v_dot2_f32_f16 %1, %2, %6 %1\n \
v_dot2_f32_f16 %0, %3, %5 %0\n \
v_dot2_f32_f16 %1, %3, %7 %1\n \
v_dot2_f32_f16 %0, %2, %4, %0\n \
v_dot2_f32_f16 %1, %2, %6, %1\n \
v_dot2_f32_f16 %0, %3, %5, %0\n \
v_dot2_f32_f16 %1, %3, %7, %1\n \
"
: "=v"(c0), "=v"(c1) // Dest registers
: "v"(p_a_half2[0]),
@@ -78,7 +80,7 @@ __device__ void __outer_product_1x2(half4_t a, half4_t b0, half4_t b1, float& c0
}
// outer-product: c[i,j] += inner_product(a[i], b[j])
__device__ void __outer_product_1x4(half2_t a,
__device__ void amd_assembly_outer_product_1x4(half2_t a,
half2_t b0,
half2_t b1,
half2_t b2,
@@ -89,10 +91,10 @@ __device__ void __outer_product_1x4(half2_t a,
float& c3)
{
asm volatile("\n \
v_dot2_f32_f16 %0, %4, %5 %0\n \
v_dot2_f32_f16 %1, %4, %6 %1\n \
v_dot2_f32_f16 %2, %4, %7 %2\n \
v_dot2_f32_f16 %3, %4, %8 %3\n \
v_dot2_f32_f16 %0, %4, %5, %0\n \
v_dot2_f32_f16 %1, %4, %6, %1\n \
v_dot2_f32_f16 %2, %4, %7, %2\n \
v_dot2_f32_f16 %3, %4, %8, %3\n \
"
: "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3) // Dest registers
: "v"(a), // 1st Src register for 1 half2 registers
@@ -107,7 +109,7 @@ __device__ void __outer_product_1x4(half2_t a,
}
// outer-product: c[i,j] += inner_product(a[i], b[j])
__device__ void __outer_product_1x4(half4_t a,
__device__ void amd_assembly_outer_product_1x4(half4_t a,
half4_t b0,
half4_t b1,
half4_t b2,
@@ -125,14 +127,14 @@ __device__ void __outer_product_1x4(half4_t a,
// do dot2 two times
asm volatile("\n \
v_dot2_f32_f16 %0, %4, %6 %0\n \
v_dot2_f32_f16 %1, %4, %8 %1\n \
v_dot2_f32_f16 %2, %4, %10 %2\n \
v_dot2_f32_f16 %3, %4, %12 %3\n \
v_dot2_f32_f16 %0, %5, %7 %0\n \
v_dot2_f32_f16 %1, %5, %9 %1\n \
v_dot2_f32_f16 %2, %5, %11 %2\n \
v_dot2_f32_f16 %3, %5, %13 %3\n \
v_dot2_f32_f16 %0, %4, %6, %0\n \
v_dot2_f32_f16 %1, %4, %8, %1\n \
v_dot2_f32_f16 %2, %4, %10, %2\n \
v_dot2_f32_f16 %3, %4, %12, %3\n \
v_dot2_f32_f16 %0, %5, %7, %0\n \
v_dot2_f32_f16 %1, %5, %9, %1\n \
v_dot2_f32_f16 %2, %5, %11, %2\n \
v_dot2_f32_f16 %3, %5, %13, %3\n \
"
: "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3) // Dest registers
: "v"(p_a_half2[0]),
......
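
A small sketch of how the renamed helpers are called (illustrative only; a 2x2 fp32 register tile, following the call pattern in ThreadwiseGemmTransANormalBNormalC above):

// Illustrative sketch only: accumulate a 2x2 fp32 register tile, one row per
// call, matching the stated contract c[i,j] += inner_product(a[i], b[j]).
__device__ void accumulate_2x2_tile(const float a[2], const float b[2], float c[2][2])
{
    for(index_t m = 0; m < 2; ++m)
    {
        amd_assembly_outer_product_1x2(a[m], b[0], b[1], c[m][0], c[m][1]);
    }
}
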
@@ -24,4 +24,8 @@
#include "amd_buffer_addressing.hpp"
#endif
#if CK_USE_AMD_XDLOPS
#include "amd_xdlops.hpp"
#endif
#endif
@@ -31,11 +31,11 @@
// AMD XDLOPS
#ifndef CK_USE_AMD_XDLOPS
#define CK_USE_AMD_XDLOPS 1
#define CK_USE_AMD_XDLOPS 0
#endif
#ifndef CK_USE_AMD_XDLOPS_INLINE_ASM
#define CK_USE_AMD_XDLOPS_INLINE_ASM 1
#define CK_USE_AMD_XDLOPS_INLINE_ASM 0
#endif
// experimental implementation
......
#ifndef CK_FLOAT_TYPE_AMD_HPP
#define CK_FLOAT_TYPE_AMD_HPP
namespace ck {
// For some reason, the HIP compiler needs these definitions to generate optimal ISA
// float
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));
typedef float float16_t __attribute__((ext_vector_type(16)));
typedef float float32_t __attribute__((ext_vector_type(32)));
// float16
typedef _Float16 half2_t __attribute__((ext_vector_type(2)));
typedef _Float16 half4_t __attribute__((ext_vector_type(4)));
// bfloat16
typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
typedef ushort ushort4_t __attribute__((ext_vector_type(4)));
template <class T, index_t N>
struct vector_type
{
typedef struct
{
T scalar[N];
} MemoryType;
};
template <>
struct vector_type<float, 1>
{
using MemoryType = float;
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
{
static_assert(I < 1, "wrong");
*(reinterpret_cast<float*>(&v) + I) = s;
}
};
template <>
struct vector_type<float, 2>
{
using MemoryType = float2_t;
union DataType
{
MemoryType vector;
float scalar[2];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
{
static_assert(I < 2, "wrong");
*(reinterpret_cast<float*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(float s0, float s1)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
template <>
struct vector_type<float, 4>
{
using MemoryType = float4_t;
__host__ __device__ static constexpr index_t GetSize() { return 4; }
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
{
static_assert(I < 4, "wrong");
*(reinterpret_cast<float*>(&v) + I) = s;
}
};
template <>
struct vector_type<half, 1>
{
using MemoryType = half;
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
{
static_assert(I < 1, "wrong");
*(reinterpret_cast<half*>(&v) + I) = s;
}
};
template <>
struct vector_type<half, 2>
{
using MemoryType = half2_t;
union DataType
{
MemoryType vector;
half scalar[2];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
{
static_assert(I < 2, "wrong");
*(reinterpret_cast<half*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(half s0, half s1)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
template <>
struct vector_type<half, 4>
{
using MemoryType = half4_t;
union DataType
{
MemoryType vector;
half scalar[4];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
{
static_assert(I < 4, "wrong");
*(reinterpret_cast<half*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(half s0, half s1, half s2, half s3)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
data.scalar[2] = s2;
data.scalar[3] = s3;
return data.vector;
}
};
template <>
struct vector_type<ushort, 1>
{
using MemoryType = ushort;
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
{
static_assert(I < 1, "wrong");
*(reinterpret_cast<ushort*>(&v) + I) = s;
}
};
template <>
struct vector_type<ushort, 2>
{
using MemoryType = ushort2_t;
union DataType
{
MemoryType vector;
ushort scalar[2];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
{
static_assert(I < 2, "wrong");
*(reinterpret_cast<ushort*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(ushort s0, ushort s1)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
template <>
struct vector_type<ushort, 4>
{
using MemoryType = ushort4_t;
union DataType
{
MemoryType vector;
ushort scalar[4];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
{
static_assert(I < 4, "wrong");
*(reinterpret_cast<ushort*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(ushort s0, ushort s1, ushort s2, ushort s3)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
data.scalar[2] = s2;
data.scalar[3] = s3;
return data.vector;
}
};
// data type conversion
template <typename T>
struct type_convert
{
template <typename X>
__device__ T operator()(X x) const
{
return static_cast<T>(x);
}
};
template <>
template <>
__device__ float type_convert<float>::operator()<ushort>(ushort x) const
{
return bfloat16_to_float(x);
}
template <>
template <>
__device__ ushort type_convert<ushort>::operator()<float>(float x) const
{
return float_to_bfloat16(x);
}
template <typename T>
struct inner_product_with_conversion
{
static constexpr auto convert = type_convert<T>();
__device__ T operator()(float a, float b) const { return convert(a) * convert(b); }
__device__ T operator()(half2_t a, half2_t b) const
{
const half* p_a_half = reinterpret_cast<const half*>(&a);
const half* p_b_half = reinterpret_cast<const half*>(&b);
T acc = 0;
for(index_t v = 0; v < 2; ++v)
{
acc += convert(p_a_half[v]) * convert(p_b_half[v]);
}
return acc;
}
__device__ T operator()(half4_t a, half4_t b) const
{
const half* p_a_half = reinterpret_cast<const half*>(&a);
const half* p_b_half = reinterpret_cast<const half*>(&b);
T acc = 0;
for(index_t v = 0; v < 4; ++v)
{
acc += convert(p_a_half[v]) * convert(p_b_half[v]);
}
return acc;
}
__device__ T operator()(ushort2_t a, ushort2_t b) const
{
const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);
const ushort* p_b_bfloat16 = reinterpret_cast<const ushort*>(&b);
T acc = 0;
for(index_t v = 0; v < 2; ++v)
{
acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]);
}
return acc;
}
__device__ T operator()(ushort4_t a, ushort4_t b) const
{
const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);
const ushort* p_b_bfloat16 = reinterpret_cast<const ushort*>(&b);
T acc = 0;
for(index_t v = 0; v < 4; ++v)
{
acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]);
}
return acc;
}
};
} // namespace ck
#endif
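
A brief usage sketch for the helpers defined in this header (illustrative only; the function name and the fp16 operands are assumptions):

// Illustrative sketch only: pack two fp16 scalars into a half2_t with
// vector_type<half, 2>::Pack and accumulate their dot product in fp32 through
// inner_product_with_conversion<float>.
__device__ float dot2_f16_to_f32(half x0, half x1, half y0, half y1, float acc)
{
    const half2_t x = vector_type<half, 2>::Pack(x0, x1);
    const half2_t y = vector_type<half, 2>::Pack(y0, y1);

    // each fp16 lane is converted to float before the multiply-accumulate
    acc += inner_product_with_conversion<float>{}(x, y);
    return acc;
}
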
@@ -297,7 +297,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 1
#elif 0
// 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
// cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
constexpr index_t N = 128;
@@ -343,7 +343,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<3, 0>;
using RightPads = Sequence<3, 0>;
#elif 0
#elif 1
// 1x7 filter, 0x3 pad, 17x17 input
constexpr index_t N = 128;
constexpr index_t C = 128;
@@ -450,7 +450,7 @@ int main(int argc, char* argv[])
ConvStrides{},
ConvDilations{},
nrepeat);
#elif 0
#elif 1
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
@@ -492,7 +492,7 @@ int main(int argc, char* argv[])
ConvStrides{},
ConvDilations{},
nrepeat);
#elif 1
#elif 0
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
......