Commit 5245a016 authored by Chao Liu

clean up

parent 7a251a09
@@ -592,7 +592,7 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
-#elif 1
+#elif 0
// 1x1 filter, 14x14 image, C = 512
constexpr index_t N = 128;
constexpr index_t C = 512;
......
#pragma once
-typedef float Float4 __attribute__((ext_vector_type(4)));
-extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
+#include "common.hip.hpp"
#define NO_VM_WAIT 0
#define NO_LGKM_WAIT 0
@@ -10,7 +7,10 @@ extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
#define NO_DS_WRITE 0
#define NO_GLB_READ 0
-inline __device__ void vmcnt(index_t cnt)
+// cast a pointer of LDS to its address
+extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
+__device__ void vmcnt(index_t cnt)
{
#if !NO_VM_WAIT
if(cnt == 0)
@@ -44,7 +44,7 @@ inline __device__ void vmcnt(index_t cnt)
#endif
}
-inline __device__ void lgkmcnt(index_t cnt)
+__device__ void lgkmcnt(index_t cnt)
{
#if !NO_LGKM_WAIT
if(cnt == 0)
@@ -84,7 +84,7 @@ inline __device__ void lgkmcnt(index_t cnt)
#endif
}
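A note on the two wrappers above: `s_waitcnt` on AMD GCN only accepts immediate counts, which is why `vmcnt`/`lgkmcnt` take a runtime `index_t` and dispatch through an `if`-ladder onto distinct asm strings (the full ladders are collapsed in this diff). A minimal sketch of what each branch presumably emits; the committed bodies are elided, so treat the asm text as an assumption:

```cpp
// sketch: one branch per count, since "s_waitcnt vmcnt(N)" cannot take a register operand
__device__ void vmcnt_sketch(index_t cnt)
{
    if(cnt == 0)
        asm volatile("s_waitcnt vmcnt(0)" : : : "memory"); // wait for all vector-memory ops
    else if(cnt == 1)
        asm volatile("s_waitcnt vmcnt(1)" : : : "memory"); // allow 1 outstanding op
    // ... further counts follow the same pattern; lgkmcnt uses
    // "s_waitcnt lgkmcnt(N)" to wait on LDS/GDS/scalar-memory traffic
}
```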
-inline __device__ void outerProduct1x4(const float* a, const float* b, float* c)
+__device__ void outerProduct1x4(const float* a, const float* b, float* c)
{
asm volatile("\n \
v_mac_f32 %0, %4, %5 \n \
@@ -104,7 +104,9 @@ inline __device__ void outerProduct1x4(const float* a, const float* b, float* c)
"3"(c[3]));
}
-inline __device__ void outerProduct1x4(const float& a, const Float4& b, Float4& c)
+__device__ void outerProduct1x4(const float& a,
+                                const vector_type<float, 4>::MemoryType& b,
+                                vector_type<float, 4>::MemoryType& c)
{
#if 0
asm volatile(
@@ -123,8 +125,12 @@ inline __device__ void outerProduct1x4(const float& a, const Float4& b, Float4&
#endif
}
-inline __device__ void
-outerProduct4x4(const Float4& a, const Float4& b, Float4& c0, Float4& c1, Float4& c2, Float4& c3)
+__device__ void outerProduct4x4(const vector_type<float, 4>::MemoryType& a,
+                                const vector_type<float, 4>::MemoryType& b,
+                                vector_type<float, 4>::MemoryType& c0,
+                                vector_type<float, 4>::MemoryType& c1,
+                                vector_type<float, 4>::MemoryType& c2,
+                                vector_type<float, 4>::MemoryType& c3)
{
#if 0
asm volatile(
@@ -179,7 +185,9 @@ outerProduct4x4(const Float4& a, const Float4& b, Float4& c0, Float4& c1, Float4
#endif
}
-inline __device__ void outerProduct8x8(const Float4* a, const Float4* b, Float4* c)
+__device__ void outerProduct8x8(const vector_type<float, 4>::MemoryType* a,
+                                const vector_type<float, 4>::MemoryType* b,
+                                vector_type<float, 4>::MemoryType* c)
{
outerProduct4x4(a[0], b[0], c[0], c[2], c[4], c[6]);
outerProduct4x4(a[0], b[1], c[1], c[3], c[5], c[7]);
@@ -187,7 +195,7 @@ inline __device__ void outerProduct8x8(const Float4* a, const Float4* b, Float4*
outerProduct4x4(a[1], b[1], c[9], c[11], c[13], c[15]);
}
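For reference, the semantics behind these helpers: `outerProduct1x4` is a rank-1 update `c[i] += a * b[i]` lowered to four `v_mac_f32`, `outerProduct4x4` applies it once per accumulator row, and `outerProduct8x8` above tiles the thread's 8x8 accumulator as a 2x2 grid of 4x4 blocks. A plain C++ sketch of the same math (reference semantics only, not the committed asm):

```cpp
// c[i] += a * b[i], matching the v_mac_f32 sequence in outerProduct1x4
__device__ void outerProduct1x4_ref(float a, const float* b, float* c)
{
    for(int i = 0; i < 4; ++i)
        c[i] += a * b[i];
}

// 4x4 rank-1 update: row r of the accumulator gets a[r] * b[0..3]
__device__ void outerProduct4x4_ref(const float* a, const float* b, float* c /*4x4 row-major*/)
{
    for(int r = 0; r < 4; ++r)
        outerProduct1x4_ref(a[r], b, c + 4 * r);
}
```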
-inline __device__ void ds_read_b128(Float4& r, void* lds, index_t offset = 0)
+__device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0)
{
#if !NO_DS_READ
if(offset == 0)
@@ -413,7 +421,9 @@ inline __device__ void ds_read_b128(Float4& r, void* lds, index_t offset = 0)
#endif
}
-inline __device__ void global_load(Float4& r, const Float4* ptr, index_t offset = 0)
+__device__ void global_load(vector_type<float, 4>::MemoryType& r,
+                            const vector_type<float, 4>::MemoryType* ptr,
+                            index_t offset = 0)
{
#if !NO_GLB_READ
if(offset == 0)
@@ -431,7 +441,8 @@ inline __device__ void global_load(Float4& r, const Float4* ptr, index_t offset
#endif
}
-inline __device__ void ds_write_b128(const Float4& r, void* lds, index_t offset = 0)
+__device__ void
+ds_write_b128(const vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0)
{
#if !NO_DS_WRITE
if(offset == 0)
......
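The LDS and global accessors above follow the same immediate-operand rule: the `offset:` field of `ds_read_b128`/`ds_write_b128` is encoded in the instruction, so each helper branches on `offset` and the default-`0` overload is the common case. A hedged sketch of the zero-offset branch of `ds_read_b128`, assuming the visible signature and the `__to_local` cast; the committed body is collapsed in this diff:

```cpp
// sketch: read 128 bits from LDS into a float4-typed register quadruple
__device__ void ds_read_b128_sketch(vector_type<float, 4>::MemoryType& r, void* lds)
{
    // __to_local reinterprets the generic pointer as an address-space-3 (LDS) address
    asm volatile("ds_read_b128 %0, %1" : "=v"(r) : "v"(__to_local(lds)));
}
```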
@@ -130,6 +130,10 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
const FloatB* __restrict__ p_b_block,
FloatC* __restrict__ p_c_thread) const
{
+static_assert(is_same<FloatA, float>::value && is_same<FloatB, float>::value &&
+                  is_same<FloatC, float>::value,
+              "Run_asm only deal with float\n");
constexpr auto True = integral_constant<bool, true>{};
constexpr auto False = integral_constant<bool, false>{};
@@ -158,6 +162,12 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor(
Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
+static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 &&
+                  MPerThread == 8 && NPerThread == 8,
+              "Run_asm cannot deal with this GEMM shape yet\n");
+using Float4 = vector_type<float, 4>::MemoryType;
float p_thread[a_thread_mtx.GetElementSpace() + b_thread_mtx.GetElementSpace()];
FloatA* p_a_thread = p_thread;
......
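The two `static_assert`s added to `Run_asm` pin down exactly what the hand-written path supports: `float` data only, and an 8x8-per-thread C tile (`MPerThread == NPerThread == 8`) built from 4x4 sub-tiles with `KPerThreadLoop == 1`, which is precisely the shape `outerProduct8x8` consumes. Per k-step that implies the following (a sketch of the implied inner iteration, not the elided body):

```cpp
using Float4 = vector_type<float, 4>::MemoryType;

// one KPerThreadLoop == 1 step: two Float4 fragments of A, two of B,
// and the thread's 8x8 accumulator held as 16 Float4s
__device__ void gemm_step_sketch(const Float4 a[2], const Float4 b[2], Float4 c[16])
{
    outerProduct8x8(a, b, c);
}
```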
@@ -5,8 +5,8 @@
#include "Array.hip.hpp"
#include "functional.hip.hpp"
-#if DEVICE_BACKEDN_HIP
-#include "inline_asm.hpp"
+#if DEVICE_BACKEND_HIP
+#include "amd_inline_asm.hip.hpp"
#endif
__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
@@ -67,8 +67,3 @@ __host__ __device__ constexpr T min(T x, Ts... xs)
return x < y ? x : y;
}
} // namespace mod_conv
-#if DEVICE_BACKEND_HIP
-// cast a pointer of LDS to its address
-extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
-#endif
@@ -16,12 +16,14 @@ template <>
struct vector_type<float, 2>
{
#if DEVICE_BACKEND_HIP
-// For some reason, HIP compiler need this definition to generate optimal load and store instruction
+// For some reason, HIP compiler need this definition to generate optimal load and store
+// instruction
typedef float MemoryType __attribute__((ext_vector_type(2)));
#elif DEVICE_BACKEND_CUDA
// For some reason, CUDA need this definition to, otherwise
// compiler won't generate optimal load and store instruction, and
-// kernel would produce wrong result, indicating the compiler fail to generate correct instruction,
+// kernel would produce wrong result, indicating the compiler fail to generate correct
+// instruction,
using MemoryType = float2;
#endif
@@ -43,12 +45,14 @@ template <>
struct vector_type<float, 4>
{
#if DEVICE_BACKEND_HIP
-// For some reason, HIP compiler need this definition to generate optimal load and store instruction
+// For some reason, HIP compiler need this definition to generate optimal load and store
+// instruction
typedef float MemoryType __attribute__((ext_vector_type(4)));
#elif DEVICE_BACKEND_CUDA
// For some reason, CUDA need this definition to, otherwise
// compiler won't generate optimal load and store instruction, and
-// kernel would produce wrong result, indicating the compiler fail to generate correct instruction,
+// kernel would produce wrong result, indicating the compiler fail to generate correct
+// instruction,
using MemoryType = float4;
#endif
};
......
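What `MemoryType` buys in both specializations: addressing memory through the vector alias is what lets the compiler collapse four scalar accesses into single 128-bit (`dwordx4`) load and store instructions. A minimal usage sketch (`copy_float4` is a hypothetical name):

```cpp
using Float4 = vector_type<float, 4>::MemoryType;

// one 128-bit load plus one 128-bit store instead of four scalar pairs
__device__ void copy_float4(const float* __restrict__ src, float* __restrict__ dst)
{
    *reinterpret_cast<Float4*>(dst) = *reinterpret_cast<const Float4*>(src);
}
```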
@@ -34,11 +34,6 @@ template <index_t GridSize,
index_t WeiBlockCopyDataPerRead>
struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{
-__host__
-__device__ constexpr GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer()
-{
-}
__device__ void Run(const Float* const __restrict__ p_in_global,
const Float* const __restrict__ p_wei_global,
Float* const __restrict__ p_out_global) const
@@ -203,7 +198,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
// preload data into LDS
{
-#if 1
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
@@ -212,19 +206,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
p_wei_register_clipboard);
+blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
+                                            p_in_block_double);
-#if 0
-blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_double);
-#elif 0
-Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
-Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
-blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset,
-                                           p_in_register_clipboard);
-blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
-                                            p_wei_register_clipboard);
-#elif 1
vmcnt(0);
blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
p_in_block_double);
@@ -278,7 +264,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{
for(index_t x = 0; x < X; ++x)
{
-#if 1
+#if 0
blockwise_gemm.Run
#elif 0
blockwise_gemm.Run_RegisterDoubleBuffer
@@ -291,15 +277,13 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
}
}
-#if 1
+#if 0
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
p_in_block_next);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_next);
-#elif 0
-// if work with RunLoadRegisterClipboard_asm, need to wait
+#elif 1
vmcnt(0);
blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
p_in_block_next);
blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard,
@@ -329,7 +313,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{
for(index_t x = 0; x < X; ++x)
{
-#if 1
+#if 0
blockwise_gemm.Run
#elif 0
blockwise_gemm.Run_RegisterDoubleBuffer
@@ -342,19 +326,16 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
}
}
-#if 1
+#if 0
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
p_in_block_double + in_block_space);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_double + wei_block_space);
#else
-// if work with RunLoadRegisterClipboard_asm, need to wait
vmcnt(0);
blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
p_in_block_double + in_block_space);
blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard,
p_wei_block_double + wei_block_space);
#endif
@@ -366,7 +347,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{
for(index_t x = 0; x < X; ++x)
{
-#if 1
+#if 0
blockwise_gemm.Run
#elif 0
blockwise_gemm.Run_RegisterDoubleBuffer
......
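The flag flips in this file switch the double-buffered pipeline from the compiler-scheduled copies (`RunStoreRegisterClipboard`) to the inline-asm ones (`RunStoreRegisterClipboard_asm`), and the recurring `vmcnt(0)` is what makes that legal: the global loads issued through inline asm complete asynchronously, and the compiler cannot see the register dependency through the asm blocks, so an explicit wait has to separate each load from the LDS write that consumes its destination registers. The hazard pattern in isolation, using this commit's own helpers (`stage_to_lds` is a hypothetical name):

```cpp
using Float4 = vector_type<float, 4>::MemoryType;

__device__ void stage_to_lds(const Float4* p_global, void* p_lds)
{
    Float4 r;
    global_load(r, p_global); // asynchronous vector-memory load into r
    vmcnt(0);                 // s_waitcnt vmcnt(0): every outstanding load has landed in r
    ds_write_b128(r, p_lds);  // the LDS write may now safely read r
}
```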