Commit 966f9051 authored by mtgu0705's avatar mtgu0705
Browse files

Fixed a merge issue. The fp8xint4 and fp8xint4_bpreshuffle functionality now passes.

parent 49bac8ce
...@@ -9,15 +9,6 @@ ...@@ -9,15 +9,6 @@
namespace ck { namespace ck {
// Identifies a block GEMM pipeline variant. The enumerators are consumed as a
// compile-time (template) selector. The short tags on each enumerator are the
// original author's labels; their exact meaning is not spelled out here.
enum class BlockGemmPipelineVersion
{
    v1 = 0, // Naive
    v2 = 1, // Mem
    v3 = 2, // Comp
    v4 = 3, // Comp, double lds buffer
    v5 = 4, // Comp, double global prefetch register buffer
};
template <BlockGemmPipelineVersion BlkGemmPipelineVer, template <BlockGemmPipelineVersion BlkGemmPipelineVer,
BlockGemmPipelineScheduler BlkGemmPipeSche, BlockGemmPipelineScheduler BlkGemmPipeSche,
index_t BlockSize, index_t BlockSize,
......
...@@ -113,17 +113,6 @@ struct BlockwiseGemmXdlops_pipeline_base ...@@ -113,17 +113,6 @@ struct BlockwiseGemmXdlops_pipeline_base
return make_tuple(0, waveId_m, xdlops_a_idx[I1], KPerThread * xdlops_a_idx[I0]); return make_tuple(0, waveId_m, xdlops_a_idx[I1], KPerThread * xdlops_a_idx[I0]);
} }
// Builds the 6-component origin index tuple for this thread's A operand.
//
// Layout of the returned tuple:
//   (0, wave index [I0], xdlops A index [I1], 0, xdlops A index [I0], 0)
// Positions 0, 3 and 5 are always the literal 0.
__device__ static auto CalculateAThreadOriginDataIndex6D()
{
    // Component I0 of the wave index (named "m" by the original code;
    // presumably the wave's position along M -- confirm against GetWaveIdx()).
    const auto m_wave_id = GetWaveIdx()[I0];

    // Per-thread A origin as reported by the xdlops helper.
    const auto a_thread_origin = xdlops_gemm.CalculateAThreadOriginDataIndex();

    return make_tuple(0, m_wave_id, a_thread_origin[I1], 0, a_thread_origin[I0], 0);
}
__device__ static auto CalculateAThreadOriginDataIndex6D() __device__ static auto CalculateAThreadOriginDataIndex6D()
{ {
......
...@@ -286,118 +286,49 @@ struct ThreadwiseTensorSliceTransfer_v2 ...@@ -286,118 +286,49 @@ struct ThreadwiseTensorSliceTransfer_v2
// loop over tensor and copy // loop over tensor and copy
constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
static_for<0, num_access, 1>{}([&](auto idx_1d) {
typename vector_type_maker<SrcData, SrcScalarPerVector / PackedSize>::type src_vector;
#if 0 using src_vector_t =
if constexpr(is_same<remove_cvref_t<SrcData>, pk_i4_t>::value) typename vector_type_maker<SrcData, SrcScalarPerVector / PackedSize>::type::type;
{ constexpr auto src_data_idx = SpaceFillingCurve::GetIndex(idx_1d);
static_for<0, num_access, 1>{}([&](auto idx_1d) {
typename vector_type_maker<SrcData, SrcScalarPerVector / PackedSize>::type src_tmp_vector;
using src_vector_t = typename decltype(src_tmp_vector)::type;
constexpr auto src_data_idx = SpaceFillingCurve::GetIndex(idx_1d);
const bool is_src_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);
// copy data from src_buf into src_tmp_vector
src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
src_buf.template Get<src_vector_t>(src_coord_.GetOffset() / PackedSize,
is_src_valid);
// copy data from src_tmp_vector to dst_tmp_vector (data cast data from SrcData to
// DstData)
vector_type_maker_t<DstData, SrcScalarPerVector> dst_tmp_vector;
constexpr index_t pack_size = 8;
static_assert(SrcScalarPerVector % pack_size == 0, "");
using src_v_t = typename vector_type_maker_t<SrcData, pack_size / PackedSize>::type;
using dst_v_t = typename vector_type_maker_t<DstData, pack_size>::type;
static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](auto i) { const bool is_src_valid =
ck::tensor_operation::element_wise::PassThroughPack8{}( coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);
dst_tmp_vector.template AsType<dst_v_t>()(i),
src_tmp_vector.template AsType<src_v_t>()[i]);
});
// copy data from dst_tmp_vector into dst_buf // copy data from src_buf into src_vector
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { src_vector.template AsType<src_vector_t>()(Number<0>{}) =
constexpr index_t dst_offset = src_buf.template Get<src_vector_t>(src_coord_.GetOffset() / PackedSize, is_src_valid);
dst_desc.CalculateOffset(to_multi_index(dst_slice_origin_idx) + src_data_idx +
i * src_scalar_step_in_vector);
if constexpr(InvalidElementAsNaN) // copy data from src_vector into dst_buf
{ static_for<0, SrcScalarPerVector / PackedSize, 1>{}([&](auto i) {
dst_buf(Number<dst_offset>{}) = constexpr index_t dst_offset =
is_src_valid dst_desc.CalculateOffset(to_multi_index(dst_slice_origin_idx) + src_data_idx +
? dst_tmp_vector.template AsType<DstData>()[i] i * src_scalar_step_in_vector);
: NumericLimits<DstData>::QuietNaN();
}
else
{
dst_buf(Number<dst_offset>{}) =
dst_tmp_vector.template AsType<DstData>()[i];
// type_convert<DstData>(src_vector.template AsType<SrcData>()[i]);
}
});
if constexpr(idx_1d.value != num_access - 1) if constexpr(InvalidElementAsNaN)
{ {
constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d); dst_buf(Number<dst_offset>{}) =
is_src_valid
move_tensor_coordinate( ? type_convert<DstData>(src_vector.template AsType<SrcData>()[i])
src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step)); : NumericLimits<DstData>::QuietNaN();
} }
}); else
}
else
#endif
{
static_for<0, num_access, 1>{}([&](auto idx_1d) {
typename vector_type_maker<SrcData, SrcScalarPerVector / PackedSize>::type src_vector;
using src_vector_t =
typename vector_type_maker<SrcData, SrcScalarPerVector / PackedSize>::type::type;
constexpr auto src_data_idx = SpaceFillingCurve::GetIndex(idx_1d);
const bool is_src_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);
// copy data from src_buf into src_vector
src_vector.template AsType<src_vector_t>()(Number<0>{}) =
src_buf.template Get<src_vector_t>(src_coord_.GetOffset() / PackedSize, is_src_valid);
// copy data from src_vector into dst_buf
static_for<0, SrcScalarPerVector / PackedSize, 1>{}([&](auto i) {
constexpr index_t dst_offset =
dst_desc.CalculateOffset(to_multi_index(dst_slice_origin_idx) + src_data_idx +
i * src_scalar_step_in_vector);
if constexpr(InvalidElementAsNaN)
{
dst_buf(Number<dst_offset>{}) =
is_src_valid
? type_convert<DstData>(src_vector.template AsType<SrcData>()[i])
: NumericLimits<DstData>::QuietNaN();
}
else
{
dst_buf(Number<dst_offset>{}) =
type_convert<DstData>(src_vector.template AsType<SrcData>()[i]);
}
});
if constexpr(idx_1d.value != num_access - 1)
{ {
constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d); dst_buf(Number<dst_offset>{}) =
type_convert<DstData>(src_vector.template AsType<SrcData>()[i]);
move_tensor_coordinate(
src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step));
} }
}); });
}
if constexpr(idx_1d.value != num_access - 1)
{
constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);
move_tensor_coordinate(
src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step));
}
});
// move src coordinate back to slice origin (or not) // move src coordinate back to slice origin (or not)
if constexpr(SrcResetCoordinateAfterRun) if constexpr(SrcResetCoordinateAfterRun)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment