Commit c3d05c0c authored by Jing Zhang

debug

parent 3ef4d2c2
@@ -8,38 +8,13 @@
 using ADataType        = ck::half_t;
 using BDataType        = ck::pk_i4_t;
 using AccDataType      = float;
-using CShuffleDataType = ck::half_t;
+using CShuffleDataType = float;
 using CDataType        = ck::half_t;

 using ALayout = Row;
 using BLayout = Col;
 using CLayout = Row;

-inline __host__ __device__ ck::half2_t type_convert_packed_i4_to_half2(ck::pk_i4_t x)
-{
-    uint8_t x_u8 = ck::bit_cast<uint8_t>(x);
-    uint8_t x_l  = (x_u8 & 0x0f);
-    uint8_t x_h  = (x_u8 & 0xf0) >> 4;
-
-    auto l_f16 = ck::type_convert<ck::half_t>(x_l);
-    auto h_f16 = ck::type_convert<ck::half_t>(x_h);
-
-    return {l_f16, h_f16};
-}
-
-struct ElementwisePackedI4ToHalf2
-{
-    __host__ __device__ void operator()(ck::half2_t& y, const ck::pk_i4_t& x) const
-    {
-        y = type_convert_packed_i4_to_half2(x);
-    }
-
-    constexpr const static bool is_pack2_invocable = true;
-};
-
 using AElementOp = PassThrough;
 using BElementOp = PassThrough;
 using CElementOp = PassThrough;
@@ -133,7 +133,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     };

     StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
-    StrideB = f_get_default_stride(K, N, StrideB / 2, BLayout{});
+    StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
     StrideC = f_get_default_stride(M, N, StrideC, CLayout{});

     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
@@ -143,7 +143,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     {
     case 0:
         a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
-        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{0x11});
         break;
     case 1:
         a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
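Note on the new fill value: ck::pk_i4_t packs two 4-bit integers per byte, so GeneratorTensor_1<BDataType>{0x11} writes the value 1 into both nibbles, whereas the old {1} (i.e. 0x01) decodes as 1 in the low nibble and 0 in the high one. A minimal host-side sketch of the decode, mirroring the scalar unpack path in PassThroughPack2 (plain C++, no CK dependency):

    #include <cstdint>
    #include <cstdio>

    // Split one packed-int4 byte into its two 4-bit lanes
    // (unsigned interpretation, as in the scalar unpack path).
    static void decode_pk_i4(uint8_t byte, int& lo, int& hi)
    {
        lo = byte & 0x0f;        // low nibble
        hi = (byte & 0xf0) >> 4; // high nibble
    }

    int main()
    {
        int lo, hi;
        decode_pk_i4(0x11, lo, hi);
        printf("0x11 -> {%d, %d}\n", lo, hi); // {1, 1}
        decode_pk_i4(0x01, lo, hi);
        printf("0x01 -> {%d, %d}\n", lo, hi); // {1, 0}
        return 0;
    }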
@@ -228,16 +228,15 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     }

     bool pass = true;

+#if 0
     if(config.do_verification)
     {
-        auto ref_gemm    = ReferenceGemmInstance{};
-        auto ref_invoker = ref_gemm.MakeInvoker();
+        //auto ref_gemm    = ReferenceGemmInstance{};
+        //auto ref_invoker = ref_gemm.MakeInvoker();

-        auto ref_argument = ref_gemm.MakeArgument(
-            a_m_k, b_k_n, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});
+        //auto ref_argument = ref_gemm.MakeArgument(
+        //    a_m_k, b_k_n, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});

-        ref_invoker.Run(ref_argument);
+        //ref_invoker.Run(ref_argument);

         ave_time = invoker.Run(argument, StreamConfig{nullptr, false, 1});

 #ifdef BUILD_INT4_EXAMPLE
@@ -251,14 +250,16 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 #else
         c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());

-        pass &= ck::utils::check_err(c_m_n_device_result,
-                                     c_m_n_host_result,
-                                     "Error: Incorrect results!",
-                                     get_rtol<CDataType>(),
-                                     get_atol<CDataType>());
+        //pass &= ck::utils::check_err(c_m_n_device_result,
+        //                             c_m_n_host_result,
+        //                             "Error: Incorrect results!",
+        //                             get_rtol<CDataType>(),
+        //                             get_atol<CDataType>());
+        LogRangeAsType<float>(std::cout << "c_m_n_device_buf : ", c_m_n_device_result.mData, ",") << std::endl;
 #endif
     }
+#endif

     if(config.time_kernel)
     {
@@ -267,7 +268,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     std::size_t flop = 2_uz * M * N * K;
     std::size_t num_btype =
-        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N / 2 + sizeof(CDataType) * M * N;
+        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N / (ck::is_same_v<ck::remove_cvref_t<BDataType>, ck::pk_i4_t> ? 2 : 1) + sizeof(CDataType) * M * N;

     float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
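The ternary makes the bandwidth accounting explicit: sizeof(ck::pk_i4_t) is one byte but covers two logical elements, so a packed-int4 B contributes only K*N/2 bytes, while any other BDataType is counted at full size. A quick standalone check of the arithmetic (assuming M = N = K = 4096 and 2-byte half A/C; the sizes are hypothetical, for illustration only):

    #include <cstddef>
    #include <cstdio>

    int main()
    {
        const std::size_t M = 4096, N = 4096, K = 4096;
        // A (half): 2 B/elem; B (pk_i4): 1 B per 2 elems; C (half): 2 B/elem
        const std::size_t num_btype = 2 * M * K + K * N / 2 + 2 * M * N;
        printf("%zu bytes (%.1f MiB)\n", num_btype, num_btype / (1024.0 * 1024.0));
        return 0; // prints 75497472 bytes (72.0 MiB)
    }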
@@ -25,7 +25,7 @@ struct PassThroughPack2
     __host__ __device__ constexpr void operator()(ck::half2_t& y, const ck::pk_i4_t& x) const
     {
-#if 0
+#if 1
         uint8_t x_u8 = ck::bit_cast<uint8_t>(x);
         uint8_t x_l  = (x_u8 & 0x0f) >> 0;
         uint8_t x_h  = (x_u8 & 0xf0) >> 4;
@@ -924,6 +924,13 @@ struct GridwiseGemm_xdl_cshuffle_v3
                                        NXdlPerWave,
                                        KPack>())>;

+    static constexpr index_t APackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
     static constexpr index_t BPackedSize = []() {
         if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
             return 2;
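The new APackedSize mirrors the pre-existing BPackedSize lambda: 2 logical elements per stored byte when the type is pk_i4_t, otherwise 1. A hypothetical variable template would express the pattern once (a sketch only; PackedSizeOf is an invented name, not part of this commit):

    // Hypothetical helper, using the same ck traits the lambdas use.
    template <typename T>
    inline constexpr index_t PackedSizeOf =
        is_same_v<remove_cvref_t<T>, pk_i4_t> ? 2 : 1;

    static constexpr index_t APackedSize = PackedSizeOf<ADataType>;
    static constexpr index_t BPackedSize = PackedSizeOf<BDataType>;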
@@ -941,10 +948,10 @@ struct GridwiseGemm_xdl_cshuffle_v3
         constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number);

         constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
-            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize() / APackedSize, max_lds_align);

         constexpr auto b_block_space_size_aligned = math::integer_least_multiple(
-            b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align) / BPackedSize;
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize() / BPackedSize, max_lds_align);

         // LDS allocation for C shuffle in LDS
         constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
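Note on the reordering in this hunk: integer_least_multiple rounds up to a multiple of max_lds_align, so dividing by the packed size inside the call (new form) guarantees the result is still a multiple of max_lds_align, while dividing afterwards (old form) can break that. Worked numbers, assuming an element-space size of 160, max_lds_align = 64, and a packed size of 2:

    old: integer_least_multiple(160, 64) / 2 = 192 / 2 = 96        -> not a multiple of 64
    new: integer_least_multiple(160 / 2, 64) = ilm(80, 64) = 128   -> aligned

This presumably matters because a_block_space_size_aligned later feeds the offset of the B block within the shared LDS allocation.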
@@ -1312,14 +1319,14 @@ struct GridwiseGemm_xdl_cshuffle_v3
         // LDS allocation for A and B: be careful of alignment
         constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
-            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize() / APackedSize, max_lds_align);

         // Cast after lds
         auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ADataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+            static_cast<ADataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize() / APackedSize);

         auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<BDataType*>(static_cast<unsigned char*>(p_shared) +
+            bit_cast<BDataType*>(bit_cast<unsigned char*>(p_shared) +
                                    a_block_space_size_aligned * sizeof(ADataType)),
             b_block_desc_bk0_n_bk1.GetElementSpaceSize() / BPackedSize);
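The switch to bit_cast also repairs the pointer conversion itself: static_cast cannot convert unsigned char* to an unrelated BDataType* (only void* may be static_cast to an object pointer type), which presumably surfaced once BDataType became the class type pk_i4_t. A minimal standalone illustration using C++20 std::bit_cast in place of ck::bit_cast (pk_i4 here is a stand-in struct, not the real ck::pk_i4_t):

    #include <bit>
    #include <cstddef>
    #include <cstdint>

    struct pk_i4 { int8_t data; }; // stand-in for ck::pk_i4_t

    pk_i4* b_block_base(void* p_shared, std::size_t a_bytes)
    {
        unsigned char* base = static_cast<unsigned char*>(p_shared); // void* -> uchar* is fine
        // uchar* -> pk_i4* is not a valid static_cast; bit_cast (or
        // reinterpret_cast) performs the pointer type pun explicitly.
        return std::bit_cast<pk_i4*>(base + a_bytes);
    }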
@@ -1707,10 +1714,10 @@ struct GridwiseGemm_xdl_cshuffle_v3
         // LDS allocation for A and B: be careful of alignment
         constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
-            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize() / APackedSize, max_lds_align);

         auto a_block_buf_ping = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ADataType*>(p_shared_0), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+            static_cast<ADataType*>(p_shared_0), a_block_desc_ak0_m_ak1.GetElementSpaceSize() / APackedSize);

         auto b_block_buf_ping = make_dynamic_buffer<AddressSpaceEnum::Lds>(
             static_cast<BDataType*>(static_cast<char*>(p_shared_0) +
@@ -1718,10 +1725,10 @@ struct GridwiseGemm_xdl_cshuffle_v3
             b_block_desc_bk0_n_bk1.GetElementSpaceSize() / BPackedSize);

         auto a_block_buf_pong = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ADataType*>(p_shared_1), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+            static_cast<ADataType*>(p_shared_1), a_block_desc_ak0_m_ak1.GetElementSpaceSize() / APackedSize);

         auto b_block_buf_pong = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<BDataType*>(static_cast<char*>(p_shared_1) +
+            bit_cast<BDataType*>(bit_cast<char*>(p_shared_1) +
                                    a_block_space_size_aligned * sizeof(ADataType)),
             b_block_desc_bk0_n_bk1.GetElementSpaceSize() / BPackedSize);
@@ -1149,9 +1149,11 @@ struct ThreadwiseTensorSliceTransfer_v4
             // DstData)
             vector_type_maker_t<DstData, SrcScalarPerVector> dst_tmp_vector;

-            using dst_v_t = typename vector_type_maker_t<DstData, PackedSize>::type;
+            constexpr index_t pack_size = PackedSize;
+
+            using dst_v_t = typename vector_type_maker_t<DstData, pack_size>::type;
             using src_v_t = typename vector_type_maker_t<SrcData, 1>::type;

-            static_for<0, SrcScalarPerVector / PackedSize, 1>{}([&](auto i) {
+            static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](auto i) {
                 ck::tensor_operation::element_wise::PassThroughPack2{}(
                     dst_tmp_vector.template AsType<dst_v_t>()(i),
                     src_tmp_vector.template AsType<src_v_t>()[i]);
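With pack_size = 2 for packed-int4 sources, each PassThroughPack2 call consumes one byte of the source vector and emits one half2, so the loop runs SrcScalarPerVector / 2 times rather than once per logical element. The iteration shape in scalar C++ terms (a standalone sketch, using float outputs for readability):

    #include <cstdint>

    // dst receives 2 * n_bytes values; src holds n_bytes packed-int4 bytes.
    // One iteration per byte == SrcScalarPerVector / pack_size iterations.
    void unpack_i4_vector(const uint8_t* src, float* dst, int n_bytes)
    {
        for(int i = 0; i < n_bytes; ++i)
        {
            dst[2 * i + 0] = static_cast<float>(src[i] & 0x0f); // low nibble
            dst[2 * i + 1] = static_cast<float>(src[i] >> 4);   // high nibble
        }
    }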
@@ -1209,6 +1211,10 @@ struct ThreadwiseTensorSliceTransfer_v4
                         dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);

                     dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
+
+                    if constexpr(is_same_v<remove_cvref_t<SrcData>, half_t>)
+                        printf("v4: %f %d\n", type_convert<float>(dst_buf[Number<dst_offset>{}]), threadIdx.x);
                 });
             }
         });
@@ -193,9 +193,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         using src_vector_type = vector_type_maker_t<SrcData, SrcScalarPerVector>;
         using src_vector_t    = typename src_vector_type::type;

-        auto src_vector_container =
-            src_vector_type{src_buf.template Get<src_vector_t>(src_coord_.GetOffset() / PackedSize, true)};
-
         using dst_vector_type = vector_type_maker_t<DstData, SrcScalarPerVector>;
         using dst_vector_t    = typename dst_vector_type::type;

         dst_vector_type op_r_v;
@@ -229,6 +226,9 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             static_assert(elem_op_vec_len == 1, "elem_op_vec_len != 1");

+            auto src_vector_container =
+                src_vector_type{src_buf.template Get<src_vector_t>(src_coord_.GetOffset() / PackedSize, true)};
+
             static_for<0, SrcScalarPerVector / elem_op_vec_len, 1>{}([&](auto idx) {
                 // apply the src elementwise op and convert to DstData under the hood if needed
                 src_element_op_(op_r_v.template AsType<dst_elem_op_vec_t>()(idx),
@@ -554,6 +554,9 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                 dst_element_op_(dst_v, dst_vector_container.template AsType<DstData>()[i]);
                 dst_vector_container.template AsType<DstData>()(i) = dst_v;
+
+                //if constexpr(is_same_v<remove_cvref_t<SrcData>, half_t>)
+                //    printf("v3r1: %f %d\n", type_convert<float>(dst_v), threadIdx.x);
             });

             // copy data from dst_vector_container to dst_buf
@@ -157,11 +157,18 @@ struct intrin_mfma_f32_16x16x16f16<16, 16>
     template <class FloatC>
     __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c)
     {
-        ignore = reg_a;
-        ignore = reg_b;
-        ignore = reg_c;
-        //reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x16f16(
-        //    reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}], 0, 0, 0);
+        auto tmp_a = vector_type<half_t, 4>{reg_a};
+        auto tmp_b = vector_type<half_t, 4>{reg_b};
+        printf("{%f %f}, {%f %f}, {%f %f}, {%f %f} %d %d\n",
+               static_cast<float>(tmp_a.template AsType<half_t>()(Number<0>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<0>{})),
+               static_cast<float>(tmp_a.template AsType<half_t>()(Number<1>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<1>{})),
+               static_cast<float>(tmp_a.template AsType<half_t>()(Number<2>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<2>{})),
+               static_cast<float>(tmp_a.template AsType<half_t>()(Number<3>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<3>{})),
+               threadIdx.x, blockIdx.x);
+        reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x16f16(
+            reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}], 0, 0, 0);
     }
 };
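The restored call is the clang builtin for the 16x16x16 fp16 MFMA on CDNA GPUs: each 64-lane wavefront multiplies 16x16 tiles, with every lane supplying 4 a/b values and holding 4 accumulators. A minimal standalone HIP smoke test of the same builtin (assumes an MFMA-capable GPU such as gfx90a; the harness is a hypothetical sketch, not CK code):

    #include <hip/hip_runtime.h>
    #include <cstdio>

    using half4_t  = _Float16 __attribute__((ext_vector_type(4)));
    using float4_t = float __attribute__((ext_vector_type(4)));

    __global__ void mfma_smoke(float* out)
    {
        // All-ones 16x16 operands: every output element should equal K = 16.
        half4_t a  = {(_Float16)1, (_Float16)1, (_Float16)1, (_Float16)1};
        half4_t b  = {(_Float16)1, (_Float16)1, (_Float16)1, (_Float16)1};
        float4_t c = {0.f, 0.f, 0.f, 0.f};

        c = __builtin_amdgcn_mfma_f32_16x16x16f16(a, b, c, 0, 0, 0);

        for(int i = 0; i < 4; ++i) // 64 lanes x 4 accumulators = 256 outputs
            out[threadIdx.x * 4 + i] = c[i];
    }

    int main()
    {
        float* d_out;
        hipMalloc(&d_out, 256 * sizeof(float));
        mfma_smoke<<<1, 64>>>(d_out); // one wavefront
        float h_out[256];
        hipMemcpy(h_out, d_out, sizeof(h_out), hipMemcpyDeviceToHost);
        printf("c[0] = %f (expect 16)\n", h_out[0]);
        hipFree(d_out);
        return 0;
    }

Built with something like hipcc --offload-arch=gfx90a, this should print 16 for every output element, which is a quick way to confirm the intrinsic path the debug printf above is bracketing.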