Commit f6cb5b84 authored by Chao Liu's avatar Chao Liu
Browse files

debugging

parent 0983d205
...@@ -191,7 +191,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, ...@@ -191,7 +191,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t WeiBlockCopyDataPerRead = 4; constexpr index_t WeiBlockCopyDataPerRead = 4;
constexpr index_t BlockSize = 256; constexpr index_t BlockSize = 256;
#elif 1 #elif 0
// 1x1, 14x14, Vega 20, disable lds_double_buffer, enable register double buffer // 1x1, 14x14, Vega 20, disable lds_double_buffer, enable register double buffer
constexpr index_t BPerBlock = 64; constexpr index_t BPerBlock = 64;
constexpr index_t KPerBlock = 128; constexpr index_t KPerBlock = 128;
...@@ -266,7 +266,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, ...@@ -266,7 +266,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
for(index_t i = 0; i < nrepeat; ++i) for(index_t i = 0; i < nrepeat; ++i)
{ {
constexpr auto gridwise_conv = constexpr auto gridwise_conv =
#if 0 #if 1
GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
#else #else
GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
......
#pragma once #pragma once
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp" #include "ConstantTensorDescriptor.hip.hpp"
#include "inline_asm.hpp"
template <index_t BlockSize, class Float, class DstDesc, class F> template <index_t BlockSize, class Float, class DstDesc, class F>
__device__ void __device__ void
......
#pragma once #pragma once
#include "common.hip.hpp"
#include "threadwise_gemm.hip.hpp" #include "threadwise_gemm.hip.hpp"
// if following number are power of 2, index calculation shall be greatly reduced: // if following number are power of 2, index calculation shall be greatly reduced:
......
...@@ -5,6 +5,10 @@ ...@@ -5,6 +5,10 @@
#include "Array.hip.hpp" #include "Array.hip.hpp"
#include "functional.hip.hpp" #include "functional.hip.hpp"
#if DEVICE_BACKEDN_HIP
#include "inline_asm.hpp"
#endif
__device__ index_t get_thread_local_1d_id() { return threadIdx.x; } __device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
__device__ index_t get_block_1d_id() { return blockIdx.x; } __device__ index_t get_block_1d_id() { return blockIdx.x; }
......
...@@ -15,7 +15,11 @@ struct vector_type<float, 1> ...@@ -15,7 +15,11 @@ struct vector_type<float, 1>
template <> template <>
struct vector_type<float, 2> struct vector_type<float, 2>
{ {
#if 1
typedef float MemoryType __attribute__((ext_vector_type(2))); typedef float MemoryType __attribute__((ext_vector_type(2)));
#else
using MemoryType = float2;
#endif
__host__ __device__ static MemoryType Pack(float s0, float s1) __host__ __device__ static MemoryType Pack(float s0, float s1)
{ {
...@@ -34,7 +38,11 @@ struct vector_type<float, 2> ...@@ -34,7 +38,11 @@ struct vector_type<float, 2>
template <> template <>
struct vector_type<float, 4> struct vector_type<float, 4>
{ {
#if 1
typedef float MemoryType __attribute__((ext_vector_type(4))); typedef float MemoryType __attribute__((ext_vector_type(4)));
#else
using MemoryType = float4;
#endif
}; };
#if 0 #if 0
......
...@@ -222,21 +222,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn ...@@ -222,21 +222,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block); blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block); blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block);
#elif 1
Float4 tmp_in, tmp_wei;
Float4* glb_in_p =
(Float4*)(p_in_global_block_offset + blockwise_in_copy.mSrcMyThreadOffset);
Float4* loc_in_p = (Float4*)(p_in_block + blockwise_in_copy.mDstMyThreadOffset);
Float4* glb_wei_p =
(Float4*)(p_wei_global_block_offset + blockwise_wei_copy.mSrcMyThreadOffset);
Float4* loc_wei_p = (Float4*)(p_wei_block + blockwise_wei_copy.mDstMyThreadOffset);
global_load(tmp_in, glb_in_p);
global_load(tmp_wei, glb_wei_p);
vmcnt(0);
ds_write_b128(tmp_in, loc_in_p);
ds_write_b128(tmp_wei, loc_wei_p);
#endif #endif
__syncthreads(); __syncthreads();
...@@ -247,11 +232,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn ...@@ -247,11 +232,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
{ {
for(index_t x = 0; x < X; ++x) for(index_t x = 0; x < X; ++x)
{ {
#if 0 #if 1
blockwise_gemm.Run blockwise_gemm.Run
#elif 0 #elif 0
blockwise_gemm.Run_RegisterDoubleBuffer blockwise_gemm.Run_RegisterDoubleBuffer
#elif 1 #elif 0
blockwise_gemm.Run_asm blockwise_gemm.Run_asm
#endif #endif
(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
......
#pragma once #pragma once
#include "inline_asm.hpp"
template <class Float, class SrcMatrix, class DstMatrix, index_t NRow, index_t NCol> template <class Float, class SrcMatrix, class DstMatrix, index_t NRow, index_t NCol>
__device__ void threadwise_matrix_copy(SrcMatrix, __device__ void threadwise_matrix_copy(SrcMatrix,
const Float* __restrict__ p_src, const Float* __restrict__ p_src,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment