"...git@developer.sourcefind.cn:modelzoo/solov2-pytorch.git" did not exist on "108fc9e1e03ffa6737d1a31ef76fd13bb0cd3cc9"
Commit 5245a016 authored by Chao Liu

clean up

parent 7a251a09
@@ -592,7 +592,7 @@ int main(int argc, char* argv[])
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
-#elif 1
+#elif 0
     // 1x1 filter, 14x14 image, C = 512
     constexpr index_t N = 128;
     constexpr index_t C = 512;
...
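This first hunk only flips the driver's compile-time problem selector, turning off the 1x1-filter, 14x14-image, C = 512 configuration. The pattern is a chain of mutually exclusive `#if`/`#elif` branches; a minimal standalone sketch of the same idiom (the `index_t` alias and the other branch are assumptions, not the driver's code):

// Sketch only: mirrors the driver's #if/#elif problem-shape selector.
using index_t = unsigned int;

#if 0
// another filter/image configuration would live here
#elif 1
// 1x1 filter, 14x14 image, C = 512 (values from the hunk above)
constexpr index_t N = 128; // batch size
constexpr index_t C = 512; // input channels
#endif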
 #pragma once
-#include "common.hip.hpp"
-typedef float Float4 __attribute__((ext_vector_type(4)));
-extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
 #define NO_VM_WAIT 0
 #define NO_LGKM_WAIT 0
@@ -10,7 +7,10 @@ extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
 #define NO_DS_WRITE 0
 #define NO_GLB_READ 0
 
-inline __device__ void vmcnt(index_t cnt)
+// cast a pointer of LDS to its address
+extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
+
+__device__ void vmcnt(index_t cnt)
 {
 #if !NO_VM_WAIT
     if(cnt == 0)
@@ -44,7 +44,7 @@ inline __device__ void vmcnt(index_t cnt)
 #endif
 }
 
-inline __device__ void lgkmcnt(index_t cnt)
+__device__ void lgkmcnt(index_t cnt)
 {
 #if !NO_LGKM_WAIT
     if(cnt == 0)
@@ -84,7 +84,7 @@ inline __device__ void lgkmcnt(index_t cnt)
 #endif
 }
 
-inline __device__ void outerProduct1x4(const float* a, const float* b, float* c)
+__device__ void outerProduct1x4(const float* a, const float* b, float* c)
 {
     asm volatile("\n \
     v_mac_f32 %0, %4, %5 \n \
@@ -104,7 +104,9 @@ inline __device__ void outerProduct1x4(const float* a, const float* b, float* c)
                  "3"(c[3]));
 }
 
-inline __device__ void outerProduct1x4(const float& a, const Float4& b, Float4& c)
+__device__ void outerProduct1x4(const float& a,
+                                const vector_type<float, 4>::MemoryType& b,
+                                vector_type<float, 4>::MemoryType& c)
 {
 #if 0
     asm volatile(
@@ -123,8 +125,12 @@ inline __device__ void outerProduct1x4(const float& a, const Float4& b, Float4&
 #endif
 }
 
-inline __device__ void
-outerProduct4x4(const Float4& a, const Float4& b, Float4& c0, Float4& c1, Float4& c2, Float4& c3)
+__device__ void outerProduct4x4(const vector_type<float, 4>::MemoryType& a,
+                                const vector_type<float, 4>::MemoryType& b,
+                                vector_type<float, 4>::MemoryType& c0,
+                                vector_type<float, 4>::MemoryType& c1,
+                                vector_type<float, 4>::MemoryType& c2,
+                                vector_type<float, 4>::MemoryType& c3)
 {
 #if 0
     asm volatile(
@@ -179,7 +185,9 @@ outerProduct4x4(const Float4& a, const Float4& b, Float4& c0, Float4& c1, Float4
 #endif
 }
 
-inline __device__ void outerProduct8x8(const Float4* a, const Float4* b, Float4* c)
+__device__ void outerProduct8x8(const vector_type<float, 4>::MemoryType* a,
+                                const vector_type<float, 4>::MemoryType* b,
+                                vector_type<float, 4>::MemoryType* c)
 {
     outerProduct4x4(a[0], b[0], c[0], c[2], c[4], c[6]);
     outerProduct4x4(a[0], b[1], c[1], c[3], c[5], c[7]);
@@ -187,7 +195,7 @@ inline __device__ void outerProduct8x8(const Float4* a, const Float4* b, Float4*
     outerProduct4x4(a[1], b[1], c[9], c[11], c[13], c[15]);
 }
 
-inline __device__ void ds_read_b128(Float4& r, void* lds, index_t offset = 0)
+__device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0)
 {
 #if !NO_DS_READ
     if(offset == 0)
@@ -413,7 +421,9 @@ inline __device__ void ds_read_b128(Float4& r, void* lds, index_t offset = 0)
 #endif
 }
 
-inline __device__ void global_load(Float4& r, const Float4* ptr, index_t offset = 0)
+__device__ void global_load(vector_type<float, 4>::MemoryType& r,
+                            const vector_type<float, 4>::MemoryType* ptr,
+                            index_t offset = 0)
 {
 #if !NO_GLB_READ
     if(offset == 0)
@@ -431,7 +441,8 @@ inline __device__ void global_load(Float4& r, const Float4* ptr, index_t offset
 #endif
 }
 
-inline __device__ void ds_write_b128(const Float4& r, void* lds, index_t offset = 0)
+__device__ void
+ds_write_b128(const vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0)
 {
 #if !NO_DS_WRITE
     if(offset == 0)
...
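Beyond swapping the local `Float4` typedef for `vector_type<float, 4>::MemoryType` and dropping `inline` from the `__device__` helpers, this file is where the asm building blocks live: `vmcnt`/`lgkmcnt` wrap `s_waitcnt`-style waits, `ds_read_b128`/`ds_write_b128` move 128 bits through LDS, and the outer products issue `v_mac_f32` chains. For orientation, `outerProduct4x4` is a rank-1 update (c += a * b^T) over 4-wide vectors; each of its sixteen scalar updates corresponds to one `v_mac_f32`. A plain reference version, as a sketch using arrays instead of the project's vector types:

// Reference semantics of outerProduct4x4, without inline asm:
// a 4x4 rank-1 update. Each c[i][j] += a[i] * b[j] is one v_mac_f32.
void outerProduct4x4_ref(const float (&a)[4], const float (&b)[4], float (&c)[4][4])
{
    for(int i = 0; i < 4; ++i)
        for(int j = 0; j < 4; ++j)
            c[i][j] += a[i] * b[j];
}

`outerProduct8x8` then composes four such 4x4 updates over a 2x2 tiling of the 8x8 accumulator, which is exactly what its four `outerProduct4x4` calls spell out.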
@@ -130,6 +130,10 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                  const FloatB* __restrict__ p_b_block,
                  FloatC* __restrict__ p_c_thread) const
     {
+        static_assert(is_same<FloatA, float>::value && is_same<FloatB, float>::value &&
+                          is_same<FloatC, float>::value,
+                      "Run_asm only deal with float\n");
+
         constexpr auto True  = integral_constant<bool, true>{};
         constexpr auto False = integral_constant<bool, false>{};
@@ -158,6 +162,12 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor(
             Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
 
+        static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 &&
+                          MPerThread == 8 && NPerThread == 8,
+                      "Run_asm cannot deal with this GEMM shape yet\n");
+
+        using Float4 = vector_type<float, 4>::MemoryType;
+
         float p_thread[a_thread_mtx.GetElementSpace() + b_thread_mtx.GetElementSpace()];
 
         FloatA* p_a_thread = p_thread;
...
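The two added `static_assert`s turn `Run_asm`'s implicit preconditions (float data, 4x4 sub-tiles, an 8x8 per-thread tile, K-loop step of 1) into compile-time errors instead of silently wrong code when the asm path is instantiated with an unsupported configuration. The guard pattern, as a self-contained sketch with hypothetical template parameters (not the kernel's real signature):

// Sketch of the compile-time guard pattern used by Run_asm above.
#include <type_traits>

template <typename Float, int MPerThread, int NPerThread>
void run_asm_guarded()
{
    static_assert(std::is_same<Float, float>::value, "asm path only deals with float");
    static_assert(MPerThread == 8 && NPerThread == 8,
                  "asm path cannot deal with this GEMM shape yet");
    // ... the 8x8-per-thread inline-asm GEMM would go here ...
}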
@@ -5,8 +5,8 @@
 #include "Array.hip.hpp"
 #include "functional.hip.hpp"
 
-#if DEVICE_BACKEDN_HIP
-#include "inline_asm.hpp"
+#if DEVICE_BACKEND_HIP
+#include "amd_inline_asm.hip.hpp"
 #endif
 
 __device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
@@ -67,8 +67,3 @@
     return x < y ? x : y;
 }
 } // namespace mod_conv
-
-#if DEVICE_BACKEND_HIP
-// cast a pointer of LDS to its address
-extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
-#endif
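A detail easy to miss in the first hunk above: the old guard was misspelled as `DEVICE_BACKEDN_HIP`. An identifier that is undefined inside an `#if` condition evaluates to 0 without any diagnostic, so the old `inline_asm.hpp` include was silently skipped; the commit fixes the spelling along with the rename, and moves the `__to_local` declaration into the new header. A minimal illustration of the failure mode:

// An undefined macro in #if evaluates to 0 with no warning by default,
// which is why the misspelled guard compiled cleanly but never included
// the header. (Clang's and GCC's -Wundef warning flags exactly this.)
#if DEVICE_BACKEDN_HIP // typo: undefined identifier, condition is 0
#error "never reached"
#endif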
@@ -16,12 +16,14 @@ template <>
 struct vector_type<float, 2>
 {
 #if DEVICE_BACKEND_HIP
-    // For some reason, HIP compiler need this definition to generate optimal load and store instruction
+    // For some reason, HIP compiler need this definition to generate optimal load and store
+    // instruction
     typedef float MemoryType __attribute__((ext_vector_type(2)));
 #elif DEVICE_BACKEND_CUDA
     // For some reason, CUDA need this definition to, otherwise
     // compiler won't generate optimal load and store instruction, and
-    // kernel would produce wrong result, indicating the compiler fail to generate correct instruction,
+    // kernel would produce wrong result, indicating the compiler fail to generate correct
+    // instruction,
     using MemoryType = float2;
 #endif
@@ -43,12 +45,14 @@ template <>
 struct vector_type<float, 4>
 {
 #if DEVICE_BACKEND_HIP
-    // For some reason, HIP compiler need this definition to generate optimal load and store instruction
+    // For some reason, HIP compiler need this definition to generate optimal load and store
+    // instruction
     typedef float MemoryType __attribute__((ext_vector_type(4)));
 #elif DEVICE_BACKEND_CUDA
     // For some reason, CUDA need this definition to, otherwise
     // compiler won't generate optimal load and store instruction, and
-    // kernel would produce wrong result, indicating the compiler fail to generate correct instruction,
+    // kernel would produce wrong result, indicating the compiler fail to generate correct
+    // instruction,
     using MemoryType = float4;
 #endif
 };
...
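The comment rewraps here are cosmetic, but the comments themselves explain why `MemoryType` exists: clang's `ext_vector_type` attribute on the HIP side (and CUDA's built-in `float2`/`float4`) gives the compiler a value it can move with a single wide load/store instead of scalar ones. A standalone sketch of the HIP-side idiom; the function is illustrative, not part of the project:

// Sketch: a 4-wide extended vector typically lowers to one 128-bit load and
// one 128-bit store (e.g. ds_read_b128 / global_load_dwordx4 on AMD GPUs).
typedef float float4_t __attribute__((ext_vector_type(4)));

void copy4(const float4_t* __restrict__ src, float4_t* __restrict__ dst)
{
    *dst = *src; // one vector load, one vector store
}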
@@ -34,11 +34,6 @@ template <index_t GridSize,
           index_t WeiBlockCopyDataPerRead>
 struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
 {
-    __host__
-    __device__ constexpr GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer()
-    {
-    }
-
     __device__ void Run(const Float* const __restrict__ p_in_global,
                         const Float* const __restrict__ p_wei_global,
                         Float* const __restrict__ p_out_global) const
@@ -203,7 +198,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
         // preload data into LDS
         {
-#if 1
             Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
             Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
@@ -212,19 +206,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
             blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
                                                         p_wei_register_clipboard);
 
-            blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
-                                                        p_in_block_double);
+#if 0
+            blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double);
             blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
                                                          p_wei_block_double);
-#elif 0
-            Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
-            Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
-
-            blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset,
-                                                       p_in_register_clipboard);
-            blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
-                                                        p_wei_register_clipboard);
-
+#elif 1
             vmcnt(0);
             blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
                                                             p_in_block_double);
@@ -278,7 +264,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
                 {
                     for(index_t x = 0; x < X; ++x)
                     {
-#if 1
+#if 0
                         blockwise_gemm.Run
 #elif 0
                         blockwise_gemm.Run_RegisterDoubleBuffer
@@ -291,15 +277,13 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
                     }
                 }
 
-#if 1
+#if 0
                 blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
                                                             p_in_block_next);
                 blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
                                                              p_wei_block_next);
-#elif 0
-                // if work with RunLoadRegisterClipboard_asm, need to wait
+#elif 1
                 vmcnt(0);
                 blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
                                                                 p_in_block_next);
                 blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard,
@@ -329,7 +313,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
             {
                 for(index_t x = 0; x < X; ++x)
                 {
-#if 1
+#if 0
                     blockwise_gemm.Run
 #elif 0
                     blockwise_gemm.Run_RegisterDoubleBuffer
@@ -342,19 +326,16 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
                 }
             }
 
-#if 1
+#if 0
             blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
                                                         p_in_block_double + in_block_space);
             blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
                                                          p_wei_block_double + wei_block_space);
 #else
-            // if work with RunLoadRegisterClipboard_asm, need to wait
             vmcnt(0);
             blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
                                                             p_in_block_double + in_block_space);
             blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard,
                                                              p_wei_block_double + wei_block_space);
 #endif
@@ -366,7 +347,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
             {
                 for(index_t x = 0; x < X; ++x)
                 {
-#if 1
+#if 0
                     blockwise_gemm.Run
 #elif 0
                     blockwise_gemm.Run_RegisterDoubleBuffer
...
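Net effect of the `#if`/`#elif` flips in this file: the LDS preload and the per-iteration clipboard stores now take the `RunStoreRegisterClipboard_asm` path, each preceded by `vmcnt(0)` so all outstanding global loads land before the registers are written to LDS, and `blockwise_gemm.Run` is switched off in favor of a later (elided) branch, presumably `Run_asm` given the guards added above. The surrounding structure is classic LDS double buffering: compute on one buffer while the next K-slice is staged into the other. A schematic, compilable sketch with stand-in callables; none of these names are the kernel's real API:

// Schematic of LDS double buffering: prefetch the next slice into registers,
// run the GEMM on buffer `cur`, then spill the prefetched slice into the
// other buffer. The vmcnt(0) wait and __syncthreads() are noted as comments.
using index_t = unsigned int;

template <typename Load, typename Gemm, typename Store>
void lds_double_buffer_loop(index_t K, index_t KPerBlock,
                            Load load_to_regs, Gemm gemm, Store store_to_lds)
{
    index_t cur = 0;
    for(index_t k = 0; k + KPerBlock < K; k += KPerBlock)
    {
        load_to_regs(k + KPerBlock); // global -> registers (next slice)
        gemm(cur);                   // consume the already-filled buffer
        store_to_lds(1 - cur);       // registers -> other LDS buffer (after a wait)
        cur = 1 - cur;               // swap buffers; a barrier would go here
    }
    gemm(cur); // tail: the last slice has nothing left to prefetch
}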