Commit 43c22b57 authored by Jianfeng yan's avatar Jianfeng yan
Browse files

threadwise_copy_v6r1/v6r2/v6r3 using space-filling curve start to work

parent a9b1061c
......@@ -5,7 +5,8 @@
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "cluster_descriptor.hpp"
#include "threadwise_tensor_slice_transfer_v6r1.hpp"
// #include "threadwise_tensor_slice_transfer_v6r1.hpp"
#include "threadwise_tensor_slice_transfer_v6r1_sfcurve.hpp"
namespace ck {
......
......@@ -5,7 +5,8 @@
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "cluster_descriptor.hpp"
#include "threadwise_tensor_slice_transfer_v6r2.hpp"
// #include "threadwise_tensor_slice_transfer_v6r2.hpp"
#include "threadwise_tensor_slice_transfer_v6r2_sfcurve.hpp"
namespace ck {
......
......@@ -5,7 +5,8 @@
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "cluster_descriptor.hpp"
#include "threadwise_tensor_slice_transfer_v6r3.hpp"
// #include "threadwise_tensor_slice_transfer_v6r3.hpp"
#include "threadwise_tensor_slice_transfer_v6r3_sfcurve.hpp"
namespace ck {
......
......@@ -7,7 +7,7 @@
#include "tensor_descriptor_helper.hpp"
#include "blockwise_gemm_xdlops.hpp"
#include "blockwise_tensor_slice_transfer_v4r1.hpp"
#include "threadwise_tensor_slice_transfer_using_space_filling_curve.hpp"
#include "threadwise_tensor_slice_transfer_sfcurve.hpp"
#include "gridwise_gemm_pipeline_v1.hpp"
namespace ck {
......
......@@ -38,8 +38,6 @@ struct ThreadwiseTensorSliceTransfer_v1r3_using_space_filling_curve
using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));
__device__ constexpr ThreadwiseTensorSliceTransfer_v1r3_using_space_filling_curve(
const DstDesc& dst_desc,
const Index& dst_slice_origin_idx,
......@@ -166,8 +164,6 @@ struct ThreadwiseTensorSliceTransfer_v1r3_using_space_filling_curve
__device__ static constexpr auto GetDstCoordinateResetStep()
{
constexpr auto I0 = Number<0>{};
constexpr auto dst_scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
......@@ -177,7 +173,7 @@ struct ThreadwiseTensorSliceTransfer_v1r3_using_space_filling_curve
constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();
constexpr auto reset_step =
SpaceFillingCurve::GetStepBetween(Number<num_accesses - 1>{}, I0);
SpaceFillingCurve::GetStepBetween(Number<num_accesses - 1>{}, Number<0>{});
return reset_step;
}
......
#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R1_SFCURVE_HPP
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R1_SFCURVE_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_space_filling_curve.hpp"
namespace ck {
// Do following things to avoid "alloca" in LLVM-IR, which would cause scratch memory
// and sometimes useless instructions:
// 1. Don't save a reference to tensor descriptor in class, pass in tensor descriptor as argument
// instead
// 2. Don't construct a new tensor coordinate everytime when using it, update and reuse the same
// tensor coordinate instead
// 3. Don't use a pointer to VGPR buffer, use vector instead
// Assume:
// 1. src_desc and dst_desc are not known at compile-time
// 2. SrcBuffer and DstBuffer are DynamicBuffer
// 3. src_slice_origin and dst_slice_origin are not known at compile-time,
template <typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename ElementwiseOperation,
typename SliceLengths,
typename DimAccessOrder,
index_t VectorDim,
index_t ScalarPerVector,
InMemoryDataOperationEnum_t DstInMemOp,
bool SrcResetCoordinateAfterRun,
bool DstResetCoordinateAfterRun>
struct ThreadwiseTensorSliceTransfer_v6r1
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
static constexpr auto I0 = Number<0>{};
__device__ constexpr ThreadwiseTensorSliceTransfer_v6r1(const SrcDesc& src_desc,
const Index& src_slice_origin,
const DstDesc& dst_desc,
const Index& dst_slice_origin,
const ElementwiseOperation& element_op)
: src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)),
dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)),
element_op_(element_op)
{
static_assert(SliceLengths::At(Number<VectorDim>{}) % ScalarPerVector == 0,
"wrong! cannot evenly divide");
}
__device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
{
src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx);
}
__device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
{
dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
}
template <typename SrcBuffer, typename DstBuffer>
__device__ void Run(const SrcDesc& src_desc,
const SrcBuffer& src_buf,
const DstDesc& dst_desc,
DstBuffer& dst_buf)
{
// scalar per access on each dim
// TODO: don't use lambda_scalar_per_access
constexpr auto scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});
using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
DimAccessOrder,
remove_cv_t<decltype(scalar_per_access)>>;
// loop over space-filling curve
constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();
static_for<0, num_accesses, 1>{}([&](auto idx_1d) {
using src_vector_type = vector_type_maker_t<SrcData, ScalarPerVector>;
using src_vector_t = typename src_vector_type::type;
using dst_vector_type = vector_type_maker_t<DstData, ScalarPerVector>;
using dst_vector_t = typename dst_vector_type::type;
const bool is_src_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);
// copy data from src_buf into src_vector_container
auto src_vector_container = src_vector_type{
src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid)};
auto dst_vector_container = dst_vector_type{};
// apply pointwise operation
static_for<0, ScalarPerVector, 1>{}([&](auto i) {
element_op_(dst_vector_container.template AsType<DstData>()(i),
src_vector_container.template AsType<SrcData>()[i]);
});
const bool is_dst_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
// copy data from dst_vector into dst_buf
dst_buf.template Transfer<DstInMemOp, dst_vector_t>(
dst_coord_.GetOffset(),
is_dst_valid,
dst_vector_container.template AsType<dst_vector_t>()[I0]);
// move coordinate
if constexpr(idx_1d.value != num_accesses - 1)
{
constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);
move_tensor_coordinate(
src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step));
move_tensor_coordinate(
dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
}
});
// move coordinate back to slice origin (or not)
if constexpr(SrcResetCoordinateAfterRun)
{
const auto src_reset_step =
make_tensor_coordinate_step(src_desc, GetCoordinateResetStep());
move_tensor_coordinate(src_desc, src_coord_, src_reset_step);
}
if constexpr(DstResetCoordinateAfterRun)
{
const auto dst_reset_step =
make_tensor_coordinate_step(dst_desc, GetCoordinateResetStep());
move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
}
}
__device__ static constexpr auto GetCoordinateResetStep()
{
constexpr auto scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});
using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
DimAccessOrder,
remove_cv_t<decltype(scalar_per_access)>>;
constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();
constexpr auto reset_step =
SpaceFillingCurve::GetStepBetween(Number<num_accesses - 1>{}, Number<0>{});
return reset_step;
}
// src_slice_origin_step_idx need to be known at compile-time, for performance reason
__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
const Index& src_slice_origin_step_idx)
{
// if src coord was not reset by RunRead(), then need to adjust the step here
const auto adjusted_step_idx = SrcResetCoordinateAfterRun
? src_slice_origin_step_idx
: src_slice_origin_step_idx + GetCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);
move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
}
// dst_slice_origin_step_idx need to be known at compile-time, for performance reason
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
const Index& dst_slice_origin_step_idx)
{
// if dst coord was not reset by Run(), then need to adjust the step here
const auto adjusted_step_idx = DstResetCoordinateAfterRun
? dst_slice_origin_step_idx
: dst_slice_origin_step_idx + GetCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);
move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
}
private:
SrcCoord src_coord_;
DstCoord dst_coord_;
const ElementwiseOperation element_op_;
}; // namespace ck
} // namespace ck
#endif
#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R2_SFCURVE_HPP
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R2_SFCURVE_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_space_filling_curve.hpp"
namespace ck {
// Do following things to avoid "alloca" in LLVM-IR, which would cause scratch memory
// and sometimes useless instructions:
// 1. Don't save a reference to tensor descriptor in class, pass in tensor descriptor as argument
// instead
// 2. Don't construct a new tensor coordinate everytime when using it, update and reuse the same
// tensor coordinate instead
// 3. Don't use a pointer to VGPR buffer, use vector instead
// Assume:
// 1. src0_desc and dst_desc are not known at compile-time
// 2. SrcBuffer and DstBuffer are DynamicBuffer
// 3. src_slice_origin and dst_slice_origin are not known at compile-time,
template <typename Src0Data,
typename Src1Data,
typename DstData,
typename Src0Desc,
typename Src1Desc,
typename DstDesc,
typename ElementwiseOperation,
typename SliceLengths,
typename DimAccessOrder,
index_t VectorDim,
index_t ScalarPerVector,
InMemoryDataOperationEnum_t DstInMemOp,
bool Src0ResetCoordinateAfterRun,
bool Src1ResetCoordinateAfterRun,
bool DstResetCoordinateAfterRun>
struct ThreadwiseTensorSliceTransfer_v6r2
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using Src0Coord = decltype(make_tensor_coordinate(Src0Desc{}, Index{}));
using Src1Coord = decltype(make_tensor_coordinate(Src1Desc{}, Index{}));
using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
static constexpr auto I0 = Number<0>{};
__device__ constexpr ThreadwiseTensorSliceTransfer_v6r2(const Src0Desc& src0_desc,
const Index& src0_slice_origin,
const Src1Desc& src1_desc,
const Index& src1_slice_origin,
const DstDesc& dst_desc,
const Index& dst_slice_origin,
const ElementwiseOperation& element_op)
: src0_coord_(make_tensor_coordinate(src0_desc, src0_slice_origin)),
src1_coord_(make_tensor_coordinate(src1_desc, src1_slice_origin)),
dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)),
element_op_(element_op)
{
static_assert(SliceLengths::At(Number<VectorDim>{}) % ScalarPerVector == 0,
"wrong! cannot evenly divide");
}
__device__ void SetSrc0SliceOrigin(const Src0Desc& src0_desc,
const Index& src0_slice_origin_idx)
{
src0_coord_ = make_tensor_coordinate(src0_desc, src0_slice_origin_idx);
}
__device__ void SetSrc1SliceOrigin(const Src1Desc& src1_desc,
const Index& src1_slice_origin_idx)
{
src1_coord_ = make_tensor_coordinate(src1_desc, src1_slice_origin_idx);
}
__device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
{
dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
}
template <typename Src0Buffer, typename Src1Buffer, typename DstBuffer>
__device__ void Run(const Src0Desc& src0_desc,
const Src0Buffer& src0_buf,
const Src1Desc& src1_desc,
const Src1Buffer& src1_buf,
const DstDesc& dst_desc,
DstBuffer& dst_buf)
{
// scalar per access on each dim
// TODO: don't use lambda_scalar_per_access
constexpr auto scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});
using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
DimAccessOrder,
remove_cv_t<decltype(scalar_per_access)>>;
constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();
// loop over space-filling curve
static_for<0, num_accesses, 1>{}([&](auto idx_1d) {
using src0_vector_type = vector_type_maker_t<Src0Data, ScalarPerVector>;
using src0_vector_t = typename src0_vector_type::type;
using src1_vector_type = vector_type_maker_t<Src1Data, ScalarPerVector>;
using src1_vector_t = typename src1_vector_type::type;
using dst_vector_type = vector_type_maker_t<DstData, ScalarPerVector>;
using dst_vector_t = typename dst_vector_type::type;
const bool is_src0_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(src0_desc, src0_coord_);
const bool is_src1_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(src1_desc, src1_coord_);
// copy data from src0_buf into src0_vector_container
auto src0_vector_container = src0_vector_type{
src0_buf.template Get<src0_vector_t>(src0_coord_.GetOffset(), is_src0_valid)};
auto src1_vector_container = src1_vector_type{
src1_buf.template Get<src1_vector_t>(src1_coord_.GetOffset(), is_src1_valid)};
auto dst_vector_container = dst_vector_type{};
// apply pointwise operation
static_for<0, ScalarPerVector, 1>{}([&](auto i) {
element_op_(dst_vector_container.template AsType<DstData>()(i),
src0_vector_container.template AsType<Src0Data>()[i],
src1_vector_container.template AsType<Src1Data>()[i]);
});
const bool is_dst_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
// copy data from dst_vector into dst_buf
dst_buf.template Transfer<DstInMemOp, dst_vector_t>(
dst_coord_.GetOffset(),
is_dst_valid,
dst_vector_container.template AsType<dst_vector_t>()[I0]);
// move coordinate
if constexpr(idx_1d.value != num_accesses - 1)
{
constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);
move_tensor_coordinate(
src0_desc, src0_coord_, make_tensor_coordinate_step(src0_desc, forward_step));
move_tensor_coordinate(
src1_desc, src1_coord_, make_tensor_coordinate_step(src1_desc, forward_step));
move_tensor_coordinate(
dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
}
});
// move coordinate back to slice origin (or not)
if constexpr(Src0ResetCoordinateAfterRun)
{
const auto src0_reset_step =
make_tensor_coordinate_step(src0_desc, GetCoordinateResetStep());
move_tensor_coordinate(src0_desc, src0_coord_, src0_reset_step);
}
if constexpr(Src1ResetCoordinateAfterRun)
{
const auto src1_reset_step =
make_tensor_coordinate_step(src1_desc, GetCoordinateResetStep());
move_tensor_coordinate(src1_desc, src1_coord_, src1_reset_step);
}
if constexpr(DstResetCoordinateAfterRun)
{
const auto dst_reset_step =
make_tensor_coordinate_step(dst_desc, GetCoordinateResetStep());
move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
}
}
__device__ static constexpr auto GetCoordinateResetStep()
{
constexpr auto scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});
using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
DimAccessOrder,
remove_cv_t<decltype(scalar_per_access)>>;
constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();
constexpr auto reset_step =
SpaceFillingCurve::GetStepBetween(Number<num_accesses - 1>{}, Number<0>{});
return reset_step;
}
// src_slice_origin_step_idx need to be known at compile-time, for performance reason
__device__ void MoveSrc0SliceWindow(const Src0Desc& src0_desc,
const Index& src0_slice_origin_step_idx)
{
// if src coord was not reset by RunRead(), then need to adjust the step here
const auto adjusted_step_idx = Src0ResetCoordinateAfterRun
? src0_slice_origin_step_idx
: src0_slice_origin_step_idx + GetCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(src0_desc, adjusted_step_idx);
move_tensor_coordinate(src0_desc, src0_coord_, adjusted_step);
}
// src_slice_origin_step_idx need to be known at compile-time, for performance reason
__device__ void MoveSrc1SliceWindow(const Src1Desc& src1_desc,
const Index& src1_slice_origin_step_idx)
{
// if src coord was not reset by RunRead(), then need to adjust the step here
const auto adjusted_step_idx = Src1ResetCoordinateAfterRun
? src1_slice_origin_step_idx
: src1_slice_origin_step_idx + GetCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(src1_desc, adjusted_step_idx);
move_tensor_coordinate(src1_desc, src1_coord_, adjusted_step);
}
// dst_slice_origin_step_idx need to be known at compile-time, for performance reason
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
const Index& dst_slice_origin_step_idx)
{
// if dst coord was not reset by Run(), then need to adjust the step here
const auto adjusted_step_idx = DstResetCoordinateAfterRun
? dst_slice_origin_step_idx
: dst_slice_origin_step_idx + GetCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);
move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
}
private:
Src0Coord src0_coord_;
Src1Coord src1_coord_;
DstCoord dst_coord_;
const ElementwiseOperation element_op_;
};
} // namespace ck
#endif
#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R3_SPCURVE_HPP
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R3_SPCURVE_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_space_filling_curve.hpp"
namespace ck {
// Do following things to avoid "alloca" in LLVM-IR, which would cause scratch memory
// and sometimes useless instructions:
// 1. Don't save a reference to tensor descriptor in class, pass in tensor descriptor as argument
// instead
// 2. Don't construct a new tensor coordinate everytime when using it, update and reuse the same
// tensor coordinate instead
// 3. Don't use a pointer to VGPR buffer, use vector instead
// Assume:
// 1. src0_desc and dst_desc are not known at compile-time
// 2. SrcBuffer and DstBuffer are DynamicBuffer
// 3. src_slice_origin and dst_slice_origin are not known at compile-time,
template <typename Src0Data,
typename Src1Data,
typename Src2Data,
typename DstData,
typename Src0Desc,
typename Src1Desc,
typename Src2Desc,
typename DstDesc,
typename ElementwiseOperation,
typename SliceLengths,
typename DimAccessOrder,
index_t VectorDim,
index_t ScalarPerVector,
InMemoryDataOperationEnum_t DstInMemOp,
bool Src0ResetCoordinateAfterRun,
bool Src1ResetCoordinateAfterRun,
bool Src2ResetCoordinateAfterRun,
bool DstResetCoordinateAfterRun>
struct ThreadwiseTensorSliceTransfer_v6r3
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using Src0Coord = decltype(make_tensor_coordinate(Src0Desc{}, Index{}));
using Src1Coord = decltype(make_tensor_coordinate(Src1Desc{}, Index{}));
using Src2Coord = decltype(make_tensor_coordinate(Src2Desc{}, Index{}));
using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
static constexpr auto I0 = Number<0>{};
__device__ constexpr ThreadwiseTensorSliceTransfer_v6r3(const Src0Desc& src0_desc,
const Index& src0_slice_origin,
const Src1Desc& src1_desc,
const Index& src1_slice_origin,
const Src2Desc& src2_desc,
const Index& src2_slice_origin,
const DstDesc& dst_desc,
const Index& dst_slice_origin,
const ElementwiseOperation& element_op)
: src0_coord_(make_tensor_coordinate(src0_desc, src0_slice_origin)),
src1_coord_(make_tensor_coordinate(src1_desc, src1_slice_origin)),
src2_coord_(make_tensor_coordinate(src2_desc, src2_slice_origin)),
dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)),
element_op_(element_op)
{
static_assert(SliceLengths::At(Number<VectorDim>{}) % ScalarPerVector == 0,
"wrong! cannot evenly divide");
}
__device__ void SetSrc0SliceOrigin(const Src0Desc& src0_desc,
const Index& src0_slice_origin_idx)
{
src0_coord_ = make_tensor_coordinate(src0_desc, src0_slice_origin_idx);
}
__device__ void SetSrc1SliceOrigin(const Src1Desc& src1_desc,
const Index& src1_slice_origin_idx)
{
src1_coord_ = make_tensor_coordinate(src1_desc, src1_slice_origin_idx);
}
__device__ void SetSrc2SliceOrigin(const Src2Desc& src2_desc,
const Index& src2_slice_origin_idx)
{
src2_coord_ = make_tensor_coordinate(src2_desc, src2_slice_origin_idx);
}
__device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
{
dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
}
template <typename Src0Buffer, typename Src1Buffer, typename Src2Buffer, typename DstBuffer>
__device__ void Run(const Src0Desc& src0_desc,
const Src0Buffer& src0_buf,
const Src1Desc& src1_desc,
const Src1Buffer& src1_buf,
const Src2Desc& src2_desc,
const Src2Buffer& src2_buf,
const DstDesc& dst_desc,
DstBuffer& dst_buf)
{
// scalar per access on each dim
// TODO: don't use lambda_scalar_per_access
constexpr auto scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});
using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
DimAccessOrder,
remove_cv_t<decltype(scalar_per_access)>>;
constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();
// loop over space-filling curve
static_for<0, num_accesses, 1>{}([&](auto idx_1d) {
using src0_vector_type = vector_type_maker_t<Src0Data, ScalarPerVector>;
using src0_vector_t = typename src0_vector_type::type;
using src1_vector_type = vector_type_maker_t<Src1Data, ScalarPerVector>;
using src1_vector_t = typename src1_vector_type::type;
using src2_vector_type = vector_type_maker_t<Src2Data, ScalarPerVector>;
using src2_vector_t = typename src2_vector_type::type;
using dst_vector_type = vector_type_maker_t<DstData, ScalarPerVector>;
using dst_vector_t = typename dst_vector_type::type;
const bool is_src0_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(src0_desc, src0_coord_);
const bool is_src1_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(src1_desc, src1_coord_);
const bool is_src2_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(src2_desc, src2_coord_);
// copy data from src0_buf into src0_vector_container
auto src0_vector_container = src0_vector_type{
src0_buf.template Get<src0_vector_t>(src0_coord_.GetOffset(), is_src0_valid)};
auto src1_vector_container = src1_vector_type{
src1_buf.template Get<src1_vector_t>(src1_coord_.GetOffset(), is_src1_valid)};
auto src2_vector_container = src2_vector_type{
src2_buf.template Get<src2_vector_t>(src2_coord_.GetOffset(), is_src2_valid)};
auto dst_vector_container = dst_vector_type{};
// apply pointwise operation
static_for<0, ScalarPerVector, 1>{}([&](auto i) {
element_op_(dst_vector_container.template AsType<DstData>()(i),
src0_vector_container.template AsType<Src0Data>()[i],
src1_vector_container.template AsType<Src1Data>()[i],
src2_vector_container.template AsType<Src2Data>()[i]);
});
const bool is_dst_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
dst_buf.template Transfer<DstInMemOp, dst_vector_t>(
dst_coord_.GetOffset(),
is_dst_valid,
dst_vector_container.template AsType<dst_vector_t>()[I0]);
// move coordinate
if constexpr(idx_1d.value != num_accesses - 1)
{
constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);
move_tensor_coordinate(
src0_desc, src0_coord_, make_tensor_coordinate_step(src0_desc, forward_step));
move_tensor_coordinate(
src1_desc, src1_coord_, make_tensor_coordinate_step(src1_desc, forward_step));
move_tensor_coordinate(
src2_desc, src2_coord_, make_tensor_coordinate_step(src1_desc, forward_step));
move_tensor_coordinate(
dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
}
});
// move coordinate back to slice origin (or not)
if constexpr(Src0ResetCoordinateAfterRun)
{
const auto src0_reset_step =
make_tensor_coordinate_step(src0_desc, GetCoordinateResetStep());
move_tensor_coordinate(src0_desc, src0_coord_, src0_reset_step);
}
if constexpr(Src1ResetCoordinateAfterRun)
{
const auto src1_reset_step =
make_tensor_coordinate_step(src1_desc, GetCoordinateResetStep());
move_tensor_coordinate(src1_desc, src1_coord_, src1_reset_step);
}
if constexpr(Src2ResetCoordinateAfterRun)
{
const auto src2_reset_step =
make_tensor_coordinate_step(src2_desc, GetCoordinateResetStep());
move_tensor_coordinate(src2_desc, src2_coord_, src2_reset_step);
}
if constexpr(DstResetCoordinateAfterRun)
{
const auto dst_reset_step =
make_tensor_coordinate_step(dst_desc, GetCoordinateResetStep());
move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
}
}
__device__ static constexpr auto GetCoordinateResetStep()
{
constexpr auto scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});
using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
DimAccessOrder,
remove_cv_t<decltype(scalar_per_access)>>;
constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();
constexpr auto reset_step =
SpaceFillingCurve::GetStepBetween(Number<num_accesses - 1>{}, Number<0>{});
return reset_step;
}
// src_slice_origin_step_idx need to be known at compile-time, for performance reason
__device__ void MoveSrc0SliceWindow(const Src0Desc& src0_desc,
const Index& src0_slice_origin_step_idx)
{
// if src coord was not reset by RunRead(), then need to adjust the step here
const auto adjusted_step_idx = Src0ResetCoordinateAfterRun
? src0_slice_origin_step_idx
: src0_slice_origin_step_idx + GetCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(src0_desc, adjusted_step_idx);
move_tensor_coordinate(src0_desc, src0_coord_, adjusted_step);
}
// src_slice_origin_step_idx need to be known at compile-time, for performance reason
__device__ void MoveSrc1SliceWindow(const Src1Desc& src1_desc,
const Index& src1_slice_origin_step_idx)
{
// if src coord was not reset by RunRead(), then need to adjust the step here
const auto adjusted_step_idx = Src1ResetCoordinateAfterRun
? src1_slice_origin_step_idx
: src1_slice_origin_step_idx + GetCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(src1_desc, adjusted_step_idx);
move_tensor_coordinate(src1_desc, src1_coord_, adjusted_step);
}
// src_slice_origin_step_idx need to be known at compile-time, for performance reason
__device__ void MoveSrc2SliceWindow(const Src2Desc& src2_desc,
const Index& src2_slice_origin_step_idx)
{
// if src coord was not reset by RunRead(), then need to adjust the step here
const auto adjusted_step_idx = Src2ResetCoordinateAfterRun
? src2_slice_origin_step_idx
: src2_slice_origin_step_idx + GetCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(src2_desc, adjusted_step_idx);
move_tensor_coordinate(src2_desc, src2_coord_, adjusted_step);
}
// dst_slice_origin_step_idx need to be known at compile-time, for performance reason
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
const Index& dst_slice_origin_step_idx)
{
// if dst coord was not reset by Run(), then need to adjust the step here
const auto adjusted_step_idx = DstResetCoordinateAfterRun
? dst_slice_origin_step_idx
: dst_slice_origin_step_idx + GetCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);
move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
}
private:
Src0Coord src0_coord_;
Src1Coord src1_coord_;
Src2Coord src2_coord_;
DstCoord dst_coord_;
const ElementwiseOperation element_op_;
};
} // namespace ck
#endif
#ifndef TENSOR_SPACE_FILLING_CURVE_HPP
#define TENSOR_SPACE_FILLING_CURVE_HPP
#include "math.hpp"
#include "sequence.hpp"
#include "sequence_helper.hpp"
......@@ -139,3 +142,4 @@ struct SpaceFillingCurve
};
} // namespace ck
#endif
......@@ -211,6 +211,10 @@ int main(int argc, char* argv[])
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
{
memset(out_n_k_ho_wo_device_result.mData.data(), static_cast<OutDataType>(0), out_n_k_ho_wo_device_result.mDesc.GetElementSpace());
out_device_buf.ToDevice(out_n_k_ho_wo_device_result.mData.data());
}
// do GEMM
auto conv = DeviceConvFwdInstance{};
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment