"sgl-kernel/git@developer.sourcefind.cn:change/sglang.git" did not exist on "c23a7072b6dcec38e54482fe3b50cfb06d1d9139"
Commit f6cb5b84 authored by Chao Liu's avatar Chao Liu
Browse files

debugging

parent 0983d205
......@@ -191,7 +191,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t WeiBlockCopyDataPerRead = 4;
constexpr index_t BlockSize = 256;
#elif 1
#elif 0
// 1x1, 14x14, Vega 20, disable lds_double_buffer, enable register double buffer
constexpr index_t BPerBlock = 64;
constexpr index_t KPerBlock = 128;
......@@ -266,7 +266,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
for(index_t i = 0; i < nrepeat; ++i)
{
constexpr auto gridwise_conv =
#if 0
#if 1
GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
#else
GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
......
#pragma once
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "inline_asm.hpp"
template <index_t BlockSize, class Float, class DstDesc, class F>
__device__ void
......
#pragma once
#include "common.hip.hpp"
#include "threadwise_gemm.hip.hpp"
// if following number are power of 2, index calculation shall be greatly reduced:
......
......@@ -5,6 +5,10 @@
#include "Array.hip.hpp"
#include "functional.hip.hpp"
#if DEVICE_BACKEDN_HIP
#include "inline_asm.hpp"
#endif
__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
__device__ index_t get_block_1d_id() { return blockIdx.x; }
......
......@@ -15,7 +15,11 @@ struct vector_type<float, 1>
template <>
struct vector_type<float, 2>
{
#if 1
typedef float MemoryType __attribute__((ext_vector_type(2)));
#else
using MemoryType = float2;
#endif
__host__ __device__ static MemoryType Pack(float s0, float s1)
{
......@@ -34,7 +38,11 @@ struct vector_type<float, 2>
template <>
struct vector_type<float, 4>
{
#if 1
typedef float MemoryType __attribute__((ext_vector_type(4)));
#else
using MemoryType = float4;
#endif
};
#if 0
......
......@@ -222,21 +222,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block);
#elif 1
Float4 tmp_in, tmp_wei;
Float4* glb_in_p =
(Float4*)(p_in_global_block_offset + blockwise_in_copy.mSrcMyThreadOffset);
Float4* loc_in_p = (Float4*)(p_in_block + blockwise_in_copy.mDstMyThreadOffset);
Float4* glb_wei_p =
(Float4*)(p_wei_global_block_offset + blockwise_wei_copy.mSrcMyThreadOffset);
Float4* loc_wei_p = (Float4*)(p_wei_block + blockwise_wei_copy.mDstMyThreadOffset);
global_load(tmp_in, glb_in_p);
global_load(tmp_wei, glb_wei_p);
vmcnt(0);
ds_write_b128(tmp_in, loc_in_p);
ds_write_b128(tmp_wei, loc_wei_p);
#endif
__syncthreads();
......@@ -247,11 +232,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
{
for(index_t x = 0; x < X; ++x)
{
#if 0
#if 1
blockwise_gemm.Run
#elif 0
blockwise_gemm.Run_RegisterDoubleBuffer
#elif 1
#elif 0
blockwise_gemm.Run_asm
#endif
(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
......
#pragma once
#include "inline_asm.hpp"
template <class Float, class SrcMatrix, class DstMatrix, index_t NRow, index_t NCol>
__device__ void threadwise_matrix_copy(SrcMatrix,
const Float* __restrict__ p_src,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment