Commit 5a9c4962 authored by Adam Osewski

Merge remote-tracking branch 'origin/develop' into aosewski/ggemm_multi_d2

parents 3970cf73 43879b89
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename ADataType, typename AccDataType, typename BDataType>
CK_TILE_HOST void reference_softmax(const HostTensor<ADataType>& a_m_n,
                                    HostTensor<BDataType>& b_m_n)
{
    auto f = [&](auto m) {
        const int N = a_m_n.mDesc.get_lengths()[1];

        // max (row-wise)
        AccDataType v_max = ck_tile::numeric<ADataType>::lowest();
        for(int n = 0; n < N; ++n)
        {
            const ADataType v_a = a_m_n(m, n);
            v_max               = v_max < v_a ? v_a : v_max;
        }

        // sum of exp(a - max); subtracting the row max keeps exp() from overflowing
        AccDataType v_exp_sum = 0;
        for(int n = 0; n < N; ++n)
        {
            const ADataType v_a = a_m_n(m, n);
            v_exp_sum += ck_tile::exp(v_a - v_max);
        }

        // elementwise normalize
        for(int n = 0; n < N; ++n)
        {
            const ADataType v_a = a_m_n(m, n);
            b_m_n(m, n)         = ck_tile::exp(v_a - v_max) / v_exp_sum;
        }
    };

    make_ParallelTensorFunctor(f, b_m_n.mDesc.get_lengths()[0])(
        std::thread::hardware_concurrency());
}
} // namespace ck_tile
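// A minimal host-side usage sketch (assumes HostTensor's length-list constructor,
// as used elsewhere in ck_tile's host utilities; not part of the header itself):
//
//     const int M = 4, N = 8;
//     ck_tile::HostTensor<float> a_m_n({M, N});
//     ck_tile::HostTensor<float> b_m_n({M, N});
//     for(int m = 0; m < M; ++m)
//         for(int n = 0; n < N; ++n)
//             a_m_n(m, n) = static_cast<float>(m + n);
//     // AccDataType = float carries the row max and the exp-sum
//     ck_tile::reference_softmax<float, float, float>(a_m_n, b_m_n);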
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <hip/hip_runtime.h>
namespace ck_tile {
struct stream_config
{
    hipStream_t stream_id_ = nullptr;
    bool time_kernel_      = false;
    int log_level_         = 0;
    int cold_niters_       = 3;
    int nrepeat_           = 10;
};
} // namespace ck_tile
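// Usage sketch: a launch helper reads these fields to decide whether and how to
// time a kernel (field meanings inferred from the names; the values below just
// spell out the defaults explicitly):
//
//     ck_tile::stream_config cfg{};
//     cfg.time_kernel_ = true; // measure and report kernel time
//     cfg.cold_niters_ = 3;    // warm-up launches, excluded from timing
//     cfg.nrepeat_     = 10;   // timed launches to average over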
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/ops/common/tensor_layout.hpp"
## common
This folder is designed not to be included directly by users; e.g., if a user includes `ck_tile/ops/fmha.hpp`, then everything under `common` is also included.
To achieve this, remod.py duplicates the header include paths under `common` into the other modules under `ops/*`. Internal developers (and external users alike) can also include `ck_tile/ops/common.hpp` directly for convenience.
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
// TODO: this folder does not follow the single-namespace rule; needs to be refactored in the future
namespace ck_tile {
namespace tensor_layout {
struct BaseTensorLayout
{
};
namespace gemm {
struct RowMajor : public BaseTensorLayout
{
static constexpr const char* name = "RowMajor";
};
struct ColumnMajor : public BaseTensorLayout
{
static constexpr const char* name = "ColumnMajor";
};
} // namespace gemm
namespace convolution {
// input tensor
// packed NCW/NCHW/NCDHW
struct NCW : public BaseTensorLayout
{
static constexpr const char* name = "NCW";
};
struct NCHW : public BaseTensorLayout
{
static constexpr const char* name = "NCHW";
};
struct NCDHW : public BaseTensorLayout
{
static constexpr const char* name = "NCDHW";
};
// packed GNCW/GNCHW/GNCDHW
struct GNCW : public BaseTensorLayout
{
static constexpr const char* name = "GNCW";
};
struct GNCHW : public BaseTensorLayout
{
static constexpr const char* name = "GNCHW";
};
struct GNCDHW : public BaseTensorLayout
{
static constexpr const char* name = "GNCDHW";
};
// input tensor
// packed NWC/NHWC/NDHWC
struct NWC : public BaseTensorLayout
{
static constexpr const char* name = "NWC";
};
struct NHWC : public BaseTensorLayout
{
static constexpr const char* name = "NHWC";
};
struct NDHWC : public BaseTensorLayout
{
static constexpr const char* name = "NDHWC";
};
// input tensor
// packed GNWC/GNHWC/GNDHWC
struct GNWC : public BaseTensorLayout
{
static constexpr const char* name = "GNWC";
};
struct GNHWC : public BaseTensorLayout
{
static constexpr const char* name = "GNHWC";
};
struct GNDHWC : public BaseTensorLayout
{
static constexpr const char* name = "GNDHWC";
};
// for input bias
struct GC : public BaseTensorLayout
{
static constexpr const char* name = "GC";
};
// input tensor
// packed NWGC/NHWGC/NDHWGC
struct NWGC : public BaseTensorLayout
{
static constexpr const char* name = "NWGC";
};
struct NHWGC : public BaseTensorLayout
{
static constexpr const char* name = "NHWGC";
};
struct NDHWGC : public BaseTensorLayout
{
static constexpr const char* name = "NDHWGC";
};
// input tensor
// strided layout
struct G_NW_C : public BaseTensorLayout
{
static constexpr const char* name = "G_NW_C";
};
struct G_NHW_C : public BaseTensorLayout
{
static constexpr const char* name = "G_NHW_C";
};
struct G_NDHW_C : public BaseTensorLayout
{
static constexpr const char* name = "G_NDHW_C";
};
// for input bias
struct G_C : public BaseTensorLayout
{
static constexpr const char* name = "G_C";
};
// weight tensor
// packed KCX/KCYX/KCZYX
struct KCX : public BaseTensorLayout
{
static constexpr const char* name = "KCX";
};
struct KCYX : public BaseTensorLayout
{
static constexpr const char* name = "KCYX";
};
struct KCZYX : public BaseTensorLayout
{
static constexpr const char* name = "KCZYX";
};
// weight tensor
// packed GKCX/GKCYX/GKCZYX
struct GKCX : public BaseTensorLayout
{
static constexpr const char* name = "GKCX";
};
struct GKCYX : public BaseTensorLayout
{
static constexpr const char* name = "GKCYX";
};
struct GKCZYX : public BaseTensorLayout
{
static constexpr const char* name = "GKCZYX";
};
// weight tensor
// packed KXC/KYXC/KZYXC
struct KXC : public BaseTensorLayout
{
static constexpr const char* name = "KXC";
};
struct KYXC : public BaseTensorLayout
{
static constexpr const char* name = "KYXC";
};
struct KZYXC : public BaseTensorLayout
{
static constexpr const char* name = "KZYXC";
};
// weight tensor
// packed GKXC/GKYXC/GKZYXC
struct GKXC : public BaseTensorLayout
{
static constexpr const char* name = "GKXC";
};
struct GKYXC : public BaseTensorLayout
{
static constexpr const char* name = "GKYXC";
};
struct GKZYXC : public BaseTensorLayout
{
static constexpr const char* name = "GKZYXC";
};
// weight tensor
// packed KXGC/KYXGC/KZYXGC
struct KXGC : public BaseTensorLayout
{
static constexpr const char* name = "KXGC";
};
struct KYXGC : public BaseTensorLayout
{
static constexpr const char* name = "KYXGC";
};
struct KZYXGC : public BaseTensorLayout
{
static constexpr const char* name = "KZYXGC";
};
// weight tensor
// strided
struct G_K_X_C : public BaseTensorLayout
{
static constexpr const char* name = "G_K_X_C";
};
struct G_K_YX_C : public BaseTensorLayout
{
static constexpr const char* name = "G_K_YX_C";
};
struct G_K_ZYX_C : public BaseTensorLayout
{
static constexpr const char* name = "G_K_ZYX_C";
};
// output tensor
// packed NKW/NKHW/NKDHW
struct NKW : public BaseTensorLayout
{
static constexpr const char* name = "NKW";
};
struct NKHW : public BaseTensorLayout
{
static constexpr const char* name = "NKHW";
};
struct NKDHW : public BaseTensorLayout
{
static constexpr const char* name = "NKDHW";
};
// output tensor
// packed GNKW/GNKHW/GNKDHW
struct GNKW : public BaseTensorLayout
{
static constexpr const char* name = "GNKW";
};
struct GNKHW : public BaseTensorLayout
{
static constexpr const char* name = "GNKHW";
};
struct GNKDHW : public BaseTensorLayout
{
static constexpr const char* name = "GNKDHW";
};
// output tensor
// packed NWK/NHWK/NDHWK
struct NWK : public BaseTensorLayout
{
static constexpr const char* name = "NWK";
};
struct NHWK : public BaseTensorLayout
{
static constexpr const char* name = "NHWK";
};
struct NDHWK : public BaseTensorLayout
{
static constexpr const char* name = "NDHWK";
};
// output tensor
// packed GNWK/GNHWK/GNDHWK
struct GNWK : public BaseTensorLayout
{
static constexpr const char* name = "GNWK";
};
struct GNHWK : public BaseTensorLayout
{
static constexpr const char* name = "GNHWK";
};
struct GNDHWK : public BaseTensorLayout
{
static constexpr const char* name = "GNDHWK";
};
// output tensor
// packed NWGK/NHWGK/NDHWGK
struct NWGK : public BaseTensorLayout
{
static constexpr const char* name = "NWGK";
};
struct NHWGK : public BaseTensorLayout
{
static constexpr const char* name = "NHWGK";
};
struct NDHWGK : public BaseTensorLayout
{
static constexpr const char* name = "NDHWGK";
};
// output tensor
// strided layout
struct G_NW_K : public BaseTensorLayout
{
static constexpr const char* name = "G_NW_K";
};
struct G_NHW_K : public BaseTensorLayout
{
static constexpr const char* name = "G_NHW_K";
};
struct G_NDHW_K : public BaseTensorLayout
{
static constexpr const char* name = "G_NDHW_K";
};
// for output bias
struct G_K : public BaseTensorLayout
{
static constexpr const char* name = "G_K";
};
// K-reduced output tensor (packed)
struct GNW : public BaseTensorLayout
{
static constexpr const char* name = "GNW";
};
struct GNHW : public BaseTensorLayout
{
static constexpr const char* name = "GNHW";
};
struct GNDHW : public BaseTensorLayout
{
static constexpr const char* name = "GNDHW";
};
// K-reduced output tensor (packed)
struct NWG : public BaseTensorLayout
{
static constexpr const char* name = "NWG";
};
struct NHWG : public BaseTensorLayout
{
static constexpr const char* name = "NHWG";
};
struct NDHWG : public BaseTensorLayout
{
static constexpr const char* name = "NDHWG";
};
// K-reduced output tensor (strided)
struct G_NW : public BaseTensorLayout
{
static constexpr const char* name = "G_NW";
};
struct G_NHW : public BaseTensorLayout
{
static constexpr const char* name = "G_NHW";
};
struct G_NDHW : public BaseTensorLayout
{
static constexpr const char* name = "G_NDHW";
};
} // namespace convolution
template <
    typename Layout,
    typename std::enable_if<std::is_base_of<BaseTensorLayout, Layout>::value, bool>::type = false>
std::ostream& operator<<(std::ostream& os, const Layout&)
{
    os << Layout::name;
    return os;
}
} // namespace tensor_layout
} // namespace ck_tile
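// Usage sketch for the stream operator above (found via ADL; prints the static
// `name` of any layout tag derived from BaseTensorLayout; assumes <iostream>):
//
//     std::cout << ck_tile::tensor_layout::gemm::RowMajor{} << ", "
//               << ck_tile::tensor_layout::convolution::NHWGC{} << '\n';
//     // output: RowMajor, NHWGC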
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/ops/epilogue/default_2d_epilogue.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
// this epilogue just stores out an M*N matrix, row major
template <typename AccDataType_, typename ODataType_, bool kPadM_, bool kPadN_>
struct Default2DEpilogueProblem
{
    using AccDataType           = remove_cvref_t<AccDataType_>;
    using ODataType             = remove_cvref_t<ODataType_>;
    static constexpr bool kPadM = kPadM_;
    static constexpr bool kPadN = kPadN_;
};

template <typename Problem_, typename Policy_ = void>
struct Default2DEpilogue
{
    using Problem               = remove_cvref_t<Problem_>;
    using AccDataType           = remove_cvref_t<typename Problem::AccDataType>;
    using ODataType             = remove_cvref_t<typename Problem::ODataType>;
    static constexpr bool kPadM = Problem::kPadM;
    static constexpr bool kPadN = Problem::kPadN;

    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return 0; }

    // TODO: this function assumes the store-out vector size is the same as the
    // OAccTile last-dimension size. How do we fix this?
    template <typename ODramWindowTmp, typename OAccTile>
    CK_TILE_DEVICE auto operator()(ODramWindowTmp& o_dram_window_tmp, const OAccTile& o_acc_tile)
    {
        // TODO: this is ugly
        if constexpr(kPadM || kPadN)
        {
            store_tile_raw(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
            buffer_store_fence();
        }
        else
        {
            store_tile(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
        }
    }
};
} // namespace ck_tile
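// Wiring sketch (assumes ck_tile::half_t as the stored type): with either pad
// flag set, operator() takes the raw-store path plus a buffer fence; otherwise
// it uses the plain store_tile.
//
//     using Problem  = ck_tile::Default2DEpilogueProblem<float, ck_tile::half_t,
//                                                        /*kPadM_*/ true, /*kPadN_*/ true>;
//     using Epilogue = ck_tile::Default2DEpilogue<Problem>;
//     static_assert(Epilogue::GetSmemSize() == 0, "this epilogue needs no extra LDS");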
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/ops/fmha/block/block_masking.hpp"
#include "ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp"
#include "ck_tile/ops/fmha/kernel/fmha_fwd_tile_partitioner.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_enum.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_default_policy.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_default_policy.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_fp8.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs_default_policy.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp"
#include "ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp"
#include "ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
enum struct GenericAttentionMaskEnum
{
    NO_MASK = 0,

    // the enums below could be causal or sliding-window masks
    MASK_FROM_TOP_LEFT     = 1,
    MASK_FROM_BOTTOM_RIGHT = 2,

    // this enum may not be used by xformers/FA, since it's hard to
    // specify a left/right window in the varlen case; kept here for
    // debugging purposes
    MASK_GENERIC,
};
// clang-format off
/* generic Attention Mask Coordinate
use x(horizontal axis), y(vertical axis) to describe mask.
top-left corner is origin
x=1/y=5(top-left) x=4/y=5(botm-r) x=6/y=5 x=8/y=5(no mask)
1 * * * * * * * 1 1 1 1 * * * * 1 1 1 1 1 1 * * 1 1 1 1 1 1 1 1
1 1 * * * * * * 1 1 1 1 1 * * * 1 1 1 1 1 1 1 * 1 1 1 1 1 1 1 1
1 1 1 * * * * * 1 1 1 1 1 1 * * 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 * * * * 1 1 1 1 1 1 1 * 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 * * * 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
l=7,-1/r=0(tl) l=7,-1/r=0(br)
x=1/y=2 x=4/y=2 x=6/y=2 x=8/y=2
1 * * * * * * * 1 1 1 1 * * * * 1 1 1 1 1 1 * * 1 1 1 1 1 1 1 1
1 1 * * * * * * 1 1 1 1 1 * * * 1 1 1 1 1 1 1 * 1 1 1 1 1 1 1 1
* 1 1 * * * * * * 1 1 1 1 1 * * * 1 1 1 1 1 1 1 * 1 1 1 1 1 1 1
* * 1 1 * * * * * * 1 1 1 1 1 * * * 1 1 1 1 1 1 * * 1 1 1 1 1 1
* * * 1 1 * * * * * * 1 1 1 1 1 * * * 1 1 1 1 1 * * * 1 1 1 1 1
l=1/r=0(tl) l=1/r=3(tl) l=1/r=5(tl) l=1/r=7(tl)
l=4/r=0(br) l=4/r=2(br) l=4/r=4(br)
x=4/y=-1 x=6/y=-1 x=8/y=-1
* * 1 1 * * * * * * 1 1 1 1 * * * * 1 1 1 1 1 1
* * * 1 1 * * * * * * 1 1 1 1 * * * * 1 1 1 1 1
* * * * 1 1 * * * * * * 1 1 1 1 * * * * 1 1 1 1
* * * * * 1 1 * * * * * * 1 1 1 * * * * * 1 1 1
* * * * * * 1 1 * * * * * * 1 1 * * * * * * 1 1
x=-2/y=5 x=1/y=5(top-left) x=0/y=5(botm-r)
* * * * * * * * 1 * * * * * * *
* * * * * * * * 1 1 * * 1 * * *
* * * * * * * * 1 1 1 * 1 1 * *
1 * * * * * * * 1 1 1 1 1 1 1 *
1 1 * * * * * * 1 1 1 1 1 1 1 1
Validations:
x + y > 1 (x + y >= 2)
Note:
y = seq_q, x = 1 -> top-left
y = seq_q, x = seq_k - seq_q + 1 -> bottom-right
y < seq_q, x < seq_k -> local-attn
y = seq_q, x = seq_k -> no mask
*/
namespace impl {
template <bool IsMasking_, bool IsLocal_> struct MaskName;
template<> struct MaskName<false, false> { static constexpr const char * name = "mn"; };
template<> struct MaskName<false, true> { static constexpr const char * name = "mn"; };
template<> struct MaskName<true, false> { static constexpr const char * name = "mc"; };
template<> struct MaskName<true, true> { static constexpr const char * name = "mg"; };
}
// clang-format on
template <bool IsMasking_ = true, bool IsLocal_ = false>
struct GenericAttentionMask
{
    static constexpr bool IsMasking = IsMasking_; // false will disable masking
    static constexpr bool IsLocal   = IsLocal_;   // if true, upper/lower area could have mask,
                                                  // else only upper-right could have mask

    static constexpr const char* name = impl::MaskName<IsMasking, IsLocal>::name;

    CK_TILE_HOST_DEVICE GenericAttentionMask(index_t y_total_, index_t x_total_)
        : GenericAttentionMask(0, 0, y_total_, x_total_)
    {
    }

    CK_TILE_HOST_DEVICE
    GenericAttentionMask(index_t y_, index_t x_, index_t y_total_, index_t x_total_)
        : y(y_), x(x_), y_total(y_total_), x_total(x_total_)
    {
    }

    template <typename MaskCoordinates>
    CK_TILE_HOST_DEVICE GenericAttentionMask(const MaskCoordinates& mask_coord)
        : y(mask_coord.at(number<0>{})),
          x(mask_coord.at(number<1>{})),
          y_total(mask_coord.at(number<2>{})),
          x_total(mask_coord.at(number<3>{}))
    {
    }

    // to get the loop length along the X axis, return index [start, end), end-start=length
    // use this if you need to loop over the X axis tile by tile (like a k-seqlen loop)
    // TODO: x_end could still be negative, so end-start could be negative (needs a check)
    template <index_t YTile, index_t XTile>
    CK_TILE_HOST_DEVICE constexpr auto
    GetTileRangeAlongX(index_t i_y, number<YTile>, number<XTile>) const
    {
        if constexpr(!IsMasking)
        {
            return ck_tile::make_tuple(0, x_total);
        }
        else
        {
            // get the tile start/end range assuming we loop along X tile by tile
            index_t x_start = [&]() {
                if constexpr(IsLocal)
                {
                    index_t tmp = max(-y + i_y + 1, 0);
                    return (tmp / XTile) * XTile; // round to tile aligned
                }
                else
                {
                    return 0;
                }
            }();

            // TODO: end could be negative; we skip the clamp here and let the caller check
            // ... in which case end-start is negative
            index_t x_end = [&]() {
                index_t tmp = min(i_y + YTile - 1 + x, x_total);
                return ((tmp + XTile - 1) / XTile) * XTile;
            }();

            return ck_tile::make_tuple(x_start, x_end);
        }
    }

    // per-pixel check whether (i_y, i_x) is out of bounds; if true, the value
    // needs masking (e.g. set to -INF)
    CK_TILE_HOST_DEVICE constexpr auto IsOutOfBound(index_t i_y, index_t i_x) const
    {
        if constexpr(!IsMasking)
        {
            return i_x >= x_total;
        }
        else
        {
            // no need to do min/max here, since i_x will never be < 0 or >= x_total
            index_t x_start = -y + i_y + 1;
            index_t x_end   = min(i_y + x, x_total);

            if constexpr(IsLocal)
            {
                return i_x < x_start || i_x >= x_end;
            }
            else
            {
                return i_x >= x_end;
            }
        }
    }

    // if the current tile is at the edge, a per-pixel mask check is needed;
    // otherwise no per-pixel check is required.
    // Attention! assumes the index passed to this function is within the range of
    // GetTileRangeAlongX(); can be used as a fast path to decide whether to do the
    // per-pixel check
    template <index_t TileHeight, index_t TileWidth>
    CK_TILE_HOST_DEVICE constexpr auto
    IsEdgeTile(index_t i_tile_top, index_t i_tile_left, number<TileHeight>, number<TileWidth>) const
    {
        if constexpr(IsLocal)
        {
            // check top-right corner > x or bottom-left corner < y
            index_t i_tile_right  = i_tile_left + TileWidth;
            index_t i_tile_bottom = i_tile_top + TileHeight;
            index_t x_end         = min(i_tile_top + x, x_total);

            bool top_right_edge          = i_tile_right > (i_tile_top + x);
            bool bottom_left_edge        = i_tile_bottom > (i_tile_left + y);
            bool is_partial_out_of_bound = i_tile_right > x_end; // only consider right-pad for now

            return top_right_edge || bottom_left_edge || is_partial_out_of_bound;
        }
        else
        {
            // only need to check top-right corner > x
            index_t i_tile_right = i_tile_left + TileWidth;
            index_t x_end        = min(i_tile_top + x, x_total);

            bool top_right_edge = i_tile_right > x_end;
            return top_right_edge;
        }
    }

    private:
    index_t y, x;
    index_t y_total, x_total;
};
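// Typical fast-path pattern built from the two checks above: run the cheap
// per-tile IsEdgeTile() first and only do per-element IsOutOfBound() masking
// on boundary tiles (tile origins i_tile_top/i_tile_left, tile sizes kM0/kN0,
// and the -INF fill below are illustrative names, not fixed API):
//
//     if(mask.IsEdgeTile(i_tile_top, i_tile_left, number<kM0>{}, number<kN0>{}))
//     {
//         // per-element: if(mask.IsOutOfBound(i_y, i_x))
//         //                  s(i_y, i_x) = -numeric<float>::infinity();
//     }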
// clang-format off
namespace impl {
template <bool IsMasking_> struct SimplifiedMaskName;
template<> struct SimplifiedMaskName<false> { static constexpr const char * name = "nomask"; };
template<> struct SimplifiedMaskName<true> { static constexpr const char * name = "mask"; };
}
// clang-format on
// this version only has 2 variations: masking and non-masking
// this is more friendly to codegen (e.g. fewer kernels need to be generated)
// ... with the trade-off of potentially more instructions in causal mode
template <bool IsMasking_ = true>
struct SimplifiedGenericAttentionMask
{
    static constexpr bool IsMasking = IsMasking_; // false will disable masking

    static constexpr const char* name = impl::SimplifiedMaskName<IsMasking>::name;

    CK_TILE_HOST_DEVICE SimplifiedGenericAttentionMask(index_t y_total_, index_t x_total_)
        : SimplifiedGenericAttentionMask(0, 0, y_total_, x_total_)
    {
    }

    CK_TILE_HOST_DEVICE
    SimplifiedGenericAttentionMask(index_t y_, index_t x_, index_t y_total_, index_t x_total_)
        : y(y_), x(x_), y_total(y_total_), x_total(x_total_)
    {
    }

    template <typename MaskCoordinates>
    CK_TILE_HOST_DEVICE SimplifiedGenericAttentionMask(const MaskCoordinates& mask_coord)
        : y(mask_coord.at(number<0>{})),
          x(mask_coord.at(number<1>{})),
          y_total(mask_coord.at(number<2>{})),
          x_total(mask_coord.at(number<3>{}))
    {
    }

    // to get the loop length along the X axis, return index [start, end), end-start=length
    // use this if you need to loop over the X axis tile by tile (like a k-seqlen loop)
    // TODO: x_end could still be negative, so end-start could be negative (needs a check)
    template <index_t YTile, index_t XTile>
    CK_TILE_HOST_DEVICE constexpr auto
    GetTileRangeAlongX(index_t i_y, number<YTile>, number<XTile>) const
    {
        if constexpr(!IsMasking)
        {
            return ck_tile::make_tuple(0, x_total);
        }
        else
        {
            // get the tile start/end range assuming we loop along X tile by tile
            index_t x_start = [&]() {
                index_t tmp = max(-y + i_y + 1, 0);
                return (tmp / XTile) * XTile; // round to tile aligned
            }();

            // TODO: end could be negative; we skip the clamp here and let the caller check
            // ... in which case end-start is negative
            index_t x_end = [&]() {
                index_t tmp = min(i_y + YTile - 1 + x, x_total);
                return ((tmp + XTile - 1) / XTile) * XTile;
            }();

            return ck_tile::make_tuple(x_start, x_end);
        }
    }

    // per-pixel check whether (i_y, i_x) is out of bounds; if true, the value
    // needs masking (e.g. set to -INF)
    CK_TILE_HOST_DEVICE constexpr auto IsOutOfBound(index_t i_y, index_t i_x) const
    {
        if constexpr(!IsMasking)
        {
            // the only case that needs the following compare is under kPadSeqLenK
            // ... for a non-masking kernel.
            return i_x >= x_total;
        }
        else
        {
            index_t x_start = -y + i_y + 1;          // this could be negative, but it's fine
            index_t x_end   = min(i_y + x, x_total); // need min in case x is padded

            return i_x < x_start || i_x >= x_end;
        }
    }

    // if the current tile is at the edge, a per-pixel mask check is needed;
    // otherwise no per-pixel check is required.
    // Attention! assumes the index passed to this function is within the range of
    // GetTileRangeAlongX(); can be used as a fast path to decide whether to do the
    // per-pixel check
    template <index_t TileHeight, index_t TileWidth>
    CK_TILE_HOST_DEVICE constexpr auto
    IsEdgeTile(index_t i_y, index_t i_x, number<TileHeight>, number<TileWidth>) const
    {
        if constexpr(!IsMasking)
        {
            // the only case that needs the following compare is under kPadSeqLenK
            // ... for a non-masking kernel.
            // return (i_x < x_total) && ((i_x + TileWidth) > x_total);
            // TODO: no need to check the beginning
            return (i_x + TileWidth) > x_total;
        }
        else
        {
            // check top-right corner > x or bottom-left corner < y
            index_t i_x_end = i_x + TileWidth;
            index_t i_y_end = i_y + TileHeight;

            // index_t x_end = min(i_y + x, x_total);
            bool top_right_edge   = i_x_end > min(i_y + x, x_total); // consider right pad
            bool bottom_left_edge = i_y_end > (i_x + y);
            // bool is_partial_out_of_bound = i_x_end > x_end; // only consider right-pad for now

            return top_right_edge || bottom_left_edge;
        }
    }

    private:
    index_t y, x;
    index_t y_total, x_total;
};
// TODO: prefer to use this function in host code
// converts from the FA-style left/right window sizes to our generic coordinates
// left_size < 0 && right_size == 0 is the normal causal mask
// local attention has left_size >= 0 or right_size >= 0
CK_TILE_HOST_DEVICE constexpr auto
make_generic_attention_mask_coordinates_from_lr_window(index_t left_size,
                                                       index_t right_size,
                                                       index_t y_total,
                                                       index_t x_total,
                                                       bool is_top_left = true)
{
    // TODO: below should all use sgpr arithmetic
    index_t left_size_tmp  = is_top_left ? y_total - 1 : x_total - 1;
    index_t right_size_tmp = is_top_left ? x_total - 1 : y_total - 1;

    left_size  = left_size < 0 ? left_size_tmp : left_size;
    right_size = right_size < 0 ? right_size_tmp : right_size;

    index_t x_tmp = is_top_left ? 0 : x_total - y_total;
    index_t y_tmp = is_top_left ? 0 : y_total - x_total;

    index_t x = 1 + right_size + x_tmp;
    index_t y = 1 + left_size + y_tmp;

    return ck_tile::make_tuple(y, x, y_total, x_total);
}
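// Worked example of the mapping above, matching the Note in the coordinate
// diagram (seq_q = 5, seq_k = 8):
//
//     // causal from the top-left corner: left < 0 picks left_size_tmp = 4,
//     // so y = 1 + 4 + 0 = 5 (= seq_q) and x = 1 + 0 + 0 = 1
//     auto tl = make_generic_attention_mask_coordinates_from_lr_window(-1, 0, 5, 8, true);
//
//     // same causal mask anchored bottom-right: left_size_tmp = 7, x_tmp = 3,
//     // y_tmp = -3, so y = 1 + 7 - 3 = 5 and x = 1 + 0 + 3 = 4 (= seq_k - seq_q + 1)
//     auto br = make_generic_attention_mask_coordinates_from_lr_window(-1, 0, 5, 8, false);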
template <typename MaskType>
CK_TILE_HOST_DEVICE constexpr auto
make_generic_attention_mask_from_lr_window(index_t left_size,
                                           index_t right_size,
                                           index_t y_total,
                                           index_t x_total,
                                           bool is_top_left = true)
{
    auto r = make_generic_attention_mask_coordinates_from_lr_window(
        left_size, right_size, y_total, x_total, is_top_left);

    return MaskType{r.at(ck_tile::number<0>{}), r.at(ck_tile::number<1>{}), y_total, x_total};
}
} // namespace ck_tile
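// End-to-end sketch (tile sizes and sequence lengths are illustrative): build a
// causal mask and query the k-tile loop bounds for the query-row block starting
// at i_y = 0 with 128x128 tiles.
//
//     const ck_tile::index_t seq_q = 1024, seq_k = 1024;
//     using Mask = ck_tile::SimplifiedGenericAttentionMask<true>;
//     auto mask  = ck_tile::make_generic_attention_mask_from_lr_window<Mask>(
//         /*left_size=*/-1, /*right_size=*/0, seq_q, seq_k, /*is_top_left=*/true);
//     auto r = mask.GetTileRangeAlongX(0, ck_tile::number<128>{}, ck_tile::number<128>{});
//     // k-tiles to visit: [r.at(number<0>{}), r.at(number<1>{}))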