Commit 1f9546e0 authored by root

Merge branch 'develop' into gemm_bf16_sk_muozturk

parents 78394194 86990558
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/common.hpp"
// #include "ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp"
namespace ck_tile {
/* independent host-side arguments, no template parameters
*/
struct GenericPermuteHostArgs
{
static constexpr index_t kMaxRanks = 8; // TODO: hardcoded
const void* p_src;
void* p_dst;
index_t rank;
index_t shape[kMaxRanks]; // input shape
index_t perm[kMaxRanks]; // permute index
};
/*
simulate torch.permute:
x_ = x_.view(x.shape[0],
x.shape[1]//16, 16,
x.shape[2]//32, 4, 8)
x_ = x_.permute(0,1,3,4,2,5)
x_ = x_.contiguous()
x_ = x_.view(x.shape[0], x.shape[1], x.shape[2]);//
this kernel is not meant to be performant (just OK); it provides functional support for
permutations of up to kMaxRanks dims with a single kernel
*/
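// A hedged usage sketch (not part of the original header): how the torch.permute pattern
// above could be expressed through GenericPermuteHostArgs. The helper name and the way the
// three logical dims d0/d1/d2 are split are assumptions for illustration only.
inline GenericPermuteHostArgs make_example_permute_args(const void* p_src,
                                                        void* p_dst,
                                                        index_t d0,
                                                        index_t d1,
                                                        index_t d2)
{
    GenericPermuteHostArgs h{};
    h.p_src = p_src;
    h.p_dst = p_dst;
    h.rank  = 6;
    // view: (d0, d1/16, 16, d2/32, 4, 8), assuming d1 % 16 == 0 and d2 % 32 == 0
    h.shape[0] = d0;
    h.shape[1] = d1 / 16;
    h.shape[2] = 16;
    h.shape[3] = d2 / 32;
    h.shape[4] = 4;
    h.shape[5] = 8;
    // permute(0, 1, 3, 4, 2, 5)
    h.perm[0] = 0;
    h.perm[1] = 1;
    h.perm[2] = 3;
    h.perm[3] = 4;
    h.perm[4] = 2;
    h.perm[5] = 5;
    return h;
}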
template <typename Problem_>
struct GenericPermute
{
using Problem = ck_tile::remove_cvref_t<Problem_>;
using DataType = remove_cvref_t<typename Problem::DataType>;
static constexpr index_t kBlockSize = Problem::kBlockSize;
static constexpr index_t kMaxRanks = Problem::kMaxRanks;
static constexpr bool KeepLastDim = Problem::KeepLastDim;
struct __attribute__((packed)) Kargs
{
const void* p_src;
void* p_dst;
// index_t rank;
index_t num_elements;
index_t perm_length[kMaxRanks]; // tensor length after permutation
index_t perm_stride[kMaxRanks]; // tensor stride after permutation
};
CK_TILE_HOST static constexpr index_t TotalElements(const GenericPermuteHostArgs& h)
{
index_t n = 1;
for(auto i = 0; i < h.rank; i++)
{
n *= h.shape[i];
}
return n;
}
CK_TILE_HOST static constexpr Kargs MakeKargs(const GenericPermuteHostArgs& h)
{
Kargs a;
a.p_src = h.p_src;
a.p_dst = h.p_dst;
// assert rank <= kMaxRanks
index_t i = 0;
index_t perm[kMaxRanks];
index_t x_shape[kMaxRanks];
index_t x_stride[kMaxRanks];
// index_t perm_length[kMaxRanks];
for(; i < h.rank; i++)
{
x_shape[i] = h.shape[i];
perm[i] = h.perm[i];
}
for(; i < kMaxRanks; i++)
{
x_shape[i] = 1;
perm[i] = i; // will index to len = 1
}
index_t stride = 1;
for(index_t j = kMaxRanks - 1; j >= 0; j--)
{
x_stride[j] = stride;
stride *= x_shape[j];
}
for(index_t j = 0; j < kMaxRanks; j++)
{
a.perm_length[j] = x_shape[perm[j]];
a.perm_stride[j] = x_stride[perm[j]];
}
a.num_elements = TotalElements(h);
return a;
}
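// Worked example of the mapping above (illustrative numbers, not from the original source):
// shape = {2, 3, 4}, perm = {0, 2, 1}, kMaxRanks = 8
//   padded x_shape  = {2, 3, 4, 1, 1, 1, 1, 1}
//   packed x_stride = {12, 4, 1, 1, 1, 1, 1, 1}   (built right-to-left)
//   perm_length     = {2, 4, 3, 1, 1, 1, 1, 1}    (x_shape gathered through perm)
//   perm_stride     = {12, 1, 4, 1, 1, 1, 1, 1}   (x_stride gathered through perm)
//   num_elements    = 24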
CK_TILE_HOST static constexpr auto GridSize(GenericPermuteHostArgs h)
{
auto total = TotalElements(h);
auto grids = dim3((total + BlockSize() - 1) / BlockSize());
// printf("### total:%d, grids:%dx%dx%d\n", total, );
return grids;
}
CK_TILE_HOST_DEVICE static constexpr auto BlockSize() { return Problem::kBlockSize; }
CK_TILE_DEVICE void operator()(Kargs kargs) const
{
index_t id = blockIdx.x * BlockSize() + threadIdx.x;
if(id >= kargs.num_elements)
return;
const auto perm_length =
generate_tuple([&](auto I) { return kargs.perm_length[I]; }, number<kMaxRanks>{});
const auto perm_stride =
generate_tuple([&](auto I) { return kargs.perm_stride[I]; }, number<kMaxRanks>{});
const DataType* p_src = reinterpret_cast<const DataType*>(kargs.p_src);
DataType* p_dst = reinterpret_cast<DataType*>(kargs.p_dst);
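// Both views below are indexed by the same flat element id: the source view lays the
// permuted lengths/strides over the original buffer, while the destination view is packed
// in the permuted order, so copying element id from one view to the other realizes the
// permutation.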
const auto src_view_0 = make_naive_tensor_view<address_space_enum::global>(
p_src, perm_length, perm_stride, number<1>{}, number<1>{});
const auto src_view = transform_tensor_view(
src_view_0,
make_tuple(make_merge_transform(perm_length)),
make_tuple(typename arithmetic_sequence_gen<0, kMaxRanks, 1>::type{}),
make_tuple(sequence<0>{}));
auto dst_view_0 = make_naive_tensor_view_packed<address_space_enum::global>(
p_dst, perm_length, number<1>{});
auto dst_view = transform_tensor_view(
dst_view_0,
make_tuple(make_merge_transform(perm_length)),
make_tuple(typename arithmetic_sequence_gen<0, kMaxRanks, 1>::type{}),
make_tuple(sequence<0>{}));
// TODO: hard-coded to vector size 1
using vector_t = thread_buffer<DataType, 1>;
const auto src_coord =
make_tensor_coordinate(src_view.get_tensor_descriptor(), array<index_t, 1>{id});
const auto dst_coord =
make_tensor_coordinate(dst_view.get_tensor_descriptor(), array<index_t, 1>{id});
// printf("src id:%d, os:%d\n", id, src_coord.get_offset());
// printf("dst id:%d, os:%d\n", id, dst_coord.get_offset());
const vector_t x = src_view.template get_vectorized_elements<vector_t>(src_coord, 0);
dst_view.template set_vectorized_elements<vector_t>(dst_coord, 0, x);
}
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core/utility/type_traits.hpp"
namespace ck_tile {
template <typename DataType_,
index_t kBlockSize_ = 256,
index_t kMaxRanks_ = 8,
bool KeepLastDim_ = false>
struct GenericPermuteProblem
{
using DataType = remove_cvref_t<DataType_>;
static constexpr index_t kBlockSize = kBlockSize_;
static constexpr index_t kMaxRanks = kMaxRanks_;
/* KeepLastDim:
* whether the last dim stays the same; this can help enable vector load
* permute(0, 2, 4, 1, 3, 5) -> true
* permute(0, 3, 2, 1) -> false
*/
static constexpr bool KeepLastDim = KeepLastDim_;
// TODO: not used(?)
};
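// Hedged pairing sketch (illustrative only; GenericPermute lives in a separate header that
// would need to be included):
//   using ExampleProblem = ck_tile::GenericPermuteProblem<float /*DataType*/, 256, 8, false>;
//   using ExampleKernel  = ck_tile::GenericPermute<ExampleProblem>;
//   // host side: kargs = ExampleKernel::MakeKargs(h); grids = ExampleKernel::GridSize(h);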
} // namespace ck_tile
@@ -4,4 +4,8 @@
#pragma once
#include "ck_tile/ops/reduce/block/block_reduce.hpp"
#include "ck_tile/ops/reduce/block/block_reduce2d.hpp"
#include "ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp"
#include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include <tuple>
// This file does not support cross-warp reduce
namespace ck_tile {
/*
* TODO: block_tile_reduce_sync() currently has a limitation:
* at least one Y dim must not be reduced
*/
// synchronize reduce result (cross lane reduction and broadcast on replicated dimension)
template <typename AccDistributedTensor_, typename ReduceFunc, bool WithBroadcast = true>
CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,
@@ -22,7 +28,7 @@ CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,
constexpr index_t idim_p_lane = NDimP - 1;
-const auto ps_idx = make_array<index_t>(get_block_id(), get_lane_id());
+const auto ps_idx = detail::get_partition_index(acc_tensor.get_tile_distribution());
const auto rs_idx = acc_tensor.get_tile_distribution().calculate_rs_index_from_ps_index(ps_idx);
constexpr index_t thread_buf_size = AccDistributedTensor_::get_thread_buffer_size();
@@ -104,6 +110,65 @@ CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,
});
}
/*
* this version is faster: it uses xor to reduce, so no broadcast is needed anymore
* TODO: the limitation is that a to-be-reduced P dim can only map to one R dim?
*/
template <typename AccDistributedTensor_, typename ReduceFunc>
CK_TILE_DEVICE void block_tile_reduce_xor_sync(AccDistributedTensor_& acc_tensor,
const ReduceFunc& reduce_func)
{
using Dstr = typename AccDistributedTensor_::StaticTileDistribution;
using DstrEncode = typename Dstr::DstrEncode;
using DstrEncodeDetail = typename DstrEncode::detail;
constexpr index_t NDimP = Dstr::get_num_of_dimension_p();
constexpr index_t NDimR = Dstr::get_num_of_dimension_r();
constexpr index_t idim_p_lane = NDimP - 1;
constexpr index_t thread_buf_size = AccDistributedTensor_::get_thread_buffer_size();
// loop over thread data
static_for<0, thread_buf_size, 1>{}([&](auto i) {
auto v_local = acc_tensor.get_thread_buffer()[i];
// cross-lane reduce for replication
// only reduce on the R dimensions that correspond to the lane
// (lane id maps to this R dimension)
static_for<0, NDimR, 1>{}([&](auto idim_r) {
// FIXME: nasty to use does_p_own_r_
if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_lane][idim_r])
{
constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r];
constexpr index_t lid_over_rid_derivative =
DstrEncodeDetail::ps_over_rs_derivative_[idim_p_lane][idim_r];
static_assert(is_power_of_two_integer(r_length),
"wrong! only support power of 2 reduction");
constexpr index_t nstage = integer_log2_floor(r_length);
// reduction sweep forward
static_for<0, nstage, 1>{}([&](auto istage) {
// xor
index_t src_lane =
__lane_id() ^ (number<lid_over_rid_derivative << istage.value>{}.value);
// pull data from remote lane
const auto v_remote = warp_shuffle(v_local, src_lane);
// reduce
v_local = reduce_func(v_local, v_remote);
});
}
});
acc_tensor.get_thread_buffer()(i) = v_local;
});
}
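// Minimal standalone sketch of the xor-butterfly pattern used above, written with plain HIP
// warp intrinsics instead of the ck_tile warp_shuffle wrapper (an assumption for illustration,
// not part of the original header). Every lane ends up with the reduced value, so no broadcast
// step is needed. group_size must be a power of two; the stride doubling corresponds to
// lid_over_rid_derivative == 1 in the function above.
template <typename T, typename F>
__device__ T xor_butterfly_reduce(T v, F reduce_func, int group_size)
{
    for(int mask = 1; mask < group_size; mask <<= 1)
    {
        // pull the partner lane's value and combine it with the local one
        v = reduce_func(v, __shfl_xor(v, mask, group_size));
    }
    return v;
}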
// FIXME: this is for 2D to 1D reduce only, need to support n-D
template <typename AccDistributedTensor_,
typename InDistributedTensor_,
@@ -175,6 +240,10 @@ CK_TILE_DEVICE void block_tile_reduce(AccDistributedTensor_& acc_tensor,
#endif
}
/*
* TODO: block_tile_reduce() currently has a limitation:
* at least one Y dim must not be reduced
*/
template <typename AccDataType_,
typename InDistributedTensor_,
index_t... InReduceDims,
@@ -208,4 +277,109 @@ CK_TILE_DEVICE auto block_tile_reduce(const InDistributedTensor_& in_tensor,
return acc_tensor;
}
// this version only supports 2D->1D reduce (reduce-dim=seq<0, 1>)
// this version only supports the case where the in/acc/out data types are the same
// this version calls the thread-level reduce and the warp-level sync in one function call
//
template <typename InDistributedTensor_>
struct BlockReduce2D
{
using InDistributedTensor = remove_cvref_t<InDistributedTensor_>;
using InDataType = typename InDistributedTensor::DataType;
CK_TILE_HOST_DEVICE BlockReduce2D(const InDistributedTensor& t_, const InDataType& reduce_init_)
: t(t_), reduce_init(reduce_init_)
{
}
CK_TILE_HOST_DEVICE constexpr auto MakeDstBlockTile() const
{
using ReduceDim = sequence<1>; // hard coded
constexpr auto acc_dstr =
make_static_tile_distribution(ck_tile::detail::make_reduce_tile_distribution_encoding(
InDistributedTensor::get_tile_distribution()
.get_static_tile_distribution_encoding(),
ReduceDim{}));
auto dst_ = make_static_distributed_tensor<InDataType>(acc_dstr);
// init acc_tensor
tile_elementwise_inout([&](auto& x_) { x_ = type_convert<InDataType>(reduce_init); }, dst_);
return dst_;
}
// return the number of pixels each lane needs to reduce along the row (Y) dimension
CK_TILE_HOST_DEVICE constexpr auto get_reduce_length_y() const
{
    constexpr auto spans = InDistributedTensor::get_distributed_spans();
    constexpr auto row_y_lengths = typename decltype(spans[number<1>{}])::Impl{};
    return reduce_on_sequence(row_y_lengths, multiplies{}, number<1>{});
}
// Note: ReducePacksPerXDim does not have the same meaning as in static_uford/sweep_tile_uspan.
// Here it is the number of packs along each X-dim; the unpacks along the Y dims are computed
// internally.
// For simplicity, only splitting along the row dimension is supported: ReducePacksPerXDim
// always has 2 elements and the first element is ignored. The Y dims are scanned from
// right to left to find along which Y dim to split.
template <typename ReduceFunc,
typename ReduceSyncFunc,
typename ReducePacksPerXDim = uniform_sequence_gen_t<2, 1>>
CK_TILE_HOST_DEVICE auto operator()(const ReduceFunc& reduce_func,
const ReduceSyncFunc& reduce_sync_func,
ReducePacksPerXDim = {}) const
{
constexpr auto spans = InDistributedTensor::get_distributed_spans();
constexpr auto row_y_unpacks = [&]() {
constexpr auto row_y_lengths = typename decltype(spans[number<1>{}])::Impl{};
constexpr auto row_y_size =
reduce_on_sequence(row_y_lengths, multiplies{}, number<1>{});
constexpr auto row_y_packs = ReducePacksPerXDim{}.at(number<1>{});
static_assert(row_y_size % row_y_packs == 0);
constexpr auto row_y_slice_size = row_y_size / row_y_packs;
constexpr auto slice_info = slice_sequence(row_y_lengths, number<row_y_slice_size>{});
constexpr auto unpacks = slice_info[number<1>{}];
return unpacks;
}();
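// row_y_unpacks tells sweep_tile_uspan below how many Y elements of the row span are handed
// to reduce_func per invocation, derived from the requested number of packs along the row X-dim.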
auto acc_tensor = MakeDstBlockTile();
// in-thread reduction
// FIXME: hard coded to be 2D to 1D reduction
sweep_tile_span(spans[number<0>{}], [&](auto dstr_idx_i0) {
constexpr auto acc_dstr_idx = make_tuple(dstr_idx_i0);
auto acc = acc_tensor[acc_dstr_idx];
sweep_tile_uspan(
spans[number<1>{}],
[&](auto... dstr_idx_i1) {
acc = reduce_func(acc, t[make_tuple(dstr_idx_i0, dstr_idx_i1)]...);
},
row_y_unpacks);
acc_tensor(acc_dstr_idx) = acc;
});
// TODO: always use xor to do cross-lane reduce
block_tile_reduce_xor_sync(acc_tensor, reduce_sync_func);
return acc_tensor;
}
template <typename ReduceFunc>
CK_TILE_HOST_DEVICE auto operator()(const ReduceFunc& reduce_func) const
{
return operator()(reduce_func, reduce_func);
}
InDistributedTensor t;
InDataType reduce_init;
};
// deduction guide
template <typename T>
CK_TILE_HOST_DEVICE_EXTERN BlockReduce2D(const T&, const typename T::DataType&)->BlockReduce2D<T>;
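// Hedged usage sketch (the tile/object names are assumptions for illustration): given a 2D
// distributed input tile `x_tile` with data type XDataType, reduce each row. The variadic
// fold keeps the functor valid whether sweep_tile_uspan hands it one or several elements
// per call.
//   auto reduce  = BlockReduce2D{x_tile, XDataType{0}};
//   auto row_acc = reduce([](auto acc, auto... vals) { return (acc + ... + vals); });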
} // namespace ck_tile