Commit 40d038e9 authored by Jing Zhang's avatar Jing Zhang
Browse files

clean

parent c3d05c0c
......@@ -27,7 +27,7 @@ using DeviceGemmV2Instance =
ALayout, BLayout, CLayout,
ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
AElementOp, BElementOp, CElementOp, GemmDefault,
#if 0
#if 1
64,
16, 16,
256, 8, 16,
......
......@@ -256,7 +256,14 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
// get_rtol<CDataType>(),
// get_atol<CDataType>());
LogRangeAsType<float>(std::cout << "c_m_n_device_buf : ", c_m_n_device_result.mData, ",") << std::endl;
//for(int i = 0; i < M; i++)
//{
// for(int j = 0; j < N; j++)
// {
// std::cout << ck::type_convert<float>(c_m_n_device_result(i, j)) << ",";
// }
// std::cout << std::endl;
//}
#endif
}
......
......@@ -25,7 +25,7 @@ struct PassThroughPack2
__host__ __device__ constexpr void operator()(ck::half2_t& y, const ck::pk_i4_t& x) const
{
#if 1
#if 0
uint8_t x_u8 = ck::bit_cast<uint8_t>(x);
uint8_t x_l = (x_u8 & 0x0f) >> 0;
uint8_t x_h = (x_u8 & 0xf0) >> 4;
......
......@@ -151,6 +151,20 @@ struct GridwiseGemm_xdl_cshuffle_v3
using ThisThreadBlock = ThisThreadBlock<BlockSize>;
static constexpr index_t APackedSize = []() {
if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
return 2;
else
return 1;
}();
static constexpr index_t BPackedSize = []() {
if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
return 2;
else
return 1;
}();
__host__ static auto CalculateGridSize(index_t M, index_t N, index_t KBatch)
{
return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, KBatch);
......@@ -625,9 +639,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
// in some cases.
else if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
{
constexpr auto MLdsLayer = 32 * 4 / KPerBlock / sizeof(ADataType) < 1
? 1
: 32 * 4 / KPerBlock / sizeof(ADataType);
constexpr index_t LdsSize = 32 * 4 / KPerBlock / sizeof(ADataType);
constexpr auto MLdsLayer = LdsSize < 1 ? 1 : LdsSize;
constexpr auto a_lds_block_desc = make_naive_tensor_descriptor(
make_tuple(
AK0Number * Number<MLdsLayer>{}, Number<MPerBlock / MLdsLayer>{}, AK1Number),
......@@ -761,10 +774,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
{
// NLdsLayer * K0 as logical Bank
constexpr auto NLdsLayer = 32 * 4 / KPerBlock / sizeof(BDataType) < 1
? 1
: 32 * 4 / KPerBlock / sizeof(BDataType);
;
constexpr index_t LdsSize = 32 * 4 / KPerBlock / sizeof(BDataType);
constexpr auto NLdsLayer = LdsSize < 1 ? 1 : LdsSize;
constexpr auto b_lds_block_desc = make_naive_tensor_descriptor(
make_tuple(
BK0Number * Number<NLdsLayer>{}, Number<NPerBlock / NLdsLayer>{}, BK1Number),
......@@ -924,20 +935,6 @@ struct GridwiseGemm_xdl_cshuffle_v3
NXdlPerWave,
KPack>())>;
static constexpr index_t APackedSize = []() {
if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
return 2;
else
return 1;
}();
static constexpr index_t BPackedSize = []() {
if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
return 2;
else
return 1;
}();
__device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{
// LDS allocation for A and B: be careful of alignment
......@@ -1326,8 +1323,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
static_cast<ADataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize() / APackedSize);
auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
bit_cast<BDataType*>(bit_cast<unsigned char *>(p_shared) +
a_block_space_size_aligned * sizeof(ADataType)),
reinterpret_cast<BDataType*>(static_cast<ADataType*>(p_shared) +
a_block_space_size_aligned),
b_block_desc_bk0_n_bk1.GetElementSpaceSize() / BPackedSize);
constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);
......
......@@ -1211,10 +1211,6 @@ struct ThreadwiseTensorSliceTransfer_v4
dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
if constexpr(is_same_v<remove_cvref_t<SrcData>, half_t>)
printf("v4: %f %d\n", type_convert<float>(dst_buf[Number<dst_offset>{}]), threadIdx.x);
});
}
});
......
......@@ -554,9 +554,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1
dst_element_op_(dst_v, dst_vector_container.template AsType<DstData>()[i]);
dst_vector_container.template AsType<DstData>()(i) = dst_v;
//if constexpr(is_same_v<remove_cvref_t<SrcData>, half_t>)
//printf("v3r1: %f %d\n", type_convert<float>(dst_v), threadIdx.x);
});
// copy data from dst_vector_container to dst_buf
......
......@@ -157,16 +157,6 @@ struct intrin_mfma_f32_16x16x16f16<16, 16>
template <class FloatC>
__device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c)
{
auto tmp_a = vector_type<half_t, 4>{reg_a};
auto tmp_b = vector_type<half_t, 4>{reg_b};
printf("{%f %f}, {%f %f}, {%f %f}, {%f %f} %d %d\n",
static_cast<float>(tmp_a.template AsType<half_t>()(Number<0>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<0>{})),
static_cast<float>(tmp_a.template AsType<half_t>()(Number<1>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<1>{})),
static_cast<float>(tmp_a.template AsType<half_t>()(Number<2>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<2>{})),
static_cast<float>(tmp_a.template AsType<half_t>()(Number<3>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<3>{})),
threadIdx.x, blockIdx.x
);
reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x16f16(
reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}], 0, 0, 0);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment