Commit 7e44fd84 authored by Jing Zhang
Browse files

fixed

parent 15965dfc
......@@ -11,7 +11,6 @@
#include "ck/library/utility/host_tensor_generator.hpp"
using F16 = ck::half_t;
using F32 = float;
using ADataType = F16;
using BDataType = F16;
......@@ -21,13 +20,12 @@ using DeviceElementwisePermuteInstance =
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ADataType>,
ck::Tuple<BDataType>,
PassThrough,
4,
3,
1,
3, // NumDim_M
1, // NumDim_N
8,
8,
ck::Sequence<1>,
ck::Sequence<1>>;
ck::Sequence<8>,
ck::Sequence<8>>;
template <typename HostTensorA, typename HostTensorB, typename Functor>
void host_elementwise4D(HostTensorB& B_nhwc,
......@@ -48,10 +46,16 @@ void host_elementwise4D(HostTensorB& B_nhwc,
int main()
{
bool do_verification = true;
bool time_kernel = true;
bool time_kernel = false;
std::vector<std::size_t> nchw = {4, 8, 4, 8};
std::vector<std::size_t> nhwc = {4, 4, 8, 8};
const int N = 16;
const int H = 32;
const int W = 64;
const int C = 128;
std::vector<std::size_t> nchw = {N, C, H, W};
std::vector<std::size_t> nhwc = {N, H, W, C};
Tensor<ADataType> a(nchw);
Tensor<BDataType> b(nhwc);
......@@ -62,28 +66,16 @@ int main()
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a.mData.data());
LogRangeAsType<float>(std::cout << "Tensor a : ", a.mData, ",") << std::endl;
// LogRangeAsType<float>(std::cout << "Tensor a : ", a.mData, ",") << std::endl;
std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
std::array<ck::index_t, 4> ab_lengths;
std::array<ck::index_t, 4> a_strides = {static_cast<int>(nhwc[1] * nhwc[2] * nhwc[3]),
static_cast<int>(nhwc[2]),
1,
static_cast<int>(nhwc[1] * nhwc[2])};
std::array<ck::index_t, 4> b_strides = {static_cast<int>(nhwc[1] * nhwc[2] * nhwc[3]),
static_cast<int>(nhwc[2]*nhwc[3]),
static_cast<int>(nhwc[3]),
1};
// std::cout << "Length: " << ab_lengths << std::endl;
// std::cout << "A stride: " << a_strides << std::endl;
// std::cout << "B stride: " << b_strides << std::endl;
std::array<ck::index_t, 4> ab_lengths{N, H, W, C};
// std::copy(nhwc.begin(), nhwc.end(), ab_lengths.begin());
std::copy(nhwc.begin(), nhwc.end(), ab_lengths.begin());
// std::copy(a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end(), a_strides.begin());
// std::copy(b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end(), b_strides.begin());
std::array<ck::index_t, 4> a_strides = {C * H * W, W, 1, H * W};
std::array<ck::index_t, 4> b_strides = {H * W * C, W * C, C, 1};
auto broadcastPermute = DeviceElementwisePermuteInstance{};
auto argument = broadcastPermute.MakeArgumentPointer(
......@@ -94,6 +86,7 @@ int main()
throw std::runtime_error(
"The runtime parameters seems not supported by the device instance, exiting!");
};
auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
float ave_time =
broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});
......@@ -105,12 +98,13 @@ int main()
if(do_verification)
{
b_device_buf.FromDevice(b.mData.data());
LogRangeAsType<float>(std::cout << "Tensor b : ", b.mData, ",") << std::endl;
// LogRangeAsType<float>(std::cout << "Tensor b : ", b.mData, ",") << std::endl;
Tensor<BDataType> host_b(nhwc);
host_elementwise4D<Tensor<ADataType>, Tensor<BDataType>, PassThrough>(
host_b, a, nchw, PassThrough{});
LogRangeAsType<float>(std::cout << "Host b : ", host_b.mData, ",") << std::endl;
// LogRangeAsType<float>(std::cout << "Host b : ", host_b.mData, ",") << std::endl;
pass &=
ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
}
......
......@@ -20,20 +20,24 @@ namespace device {
template <typename InDataTypeTuple,
typename OutDataTypeTuple,
typename ElementwiseOperation,
index_t NumDim,
index_t NumDim_m,
index_t NumDim_n,
index_t MPerThread,
index_t NPerThread,
typename InScalarPerVectorSeq,
typename OutScalarPerVectorSeq>
struct DeviceElementwise
: public DeviceElementwiseBase<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, NumDim>
struct DeviceElementwise : public DeviceElementwiseBase<InDataTypeTuple,
OutDataTypeTuple,
ElementwiseOperation,
NumDim_m + NumDim_n>
{
static constexpr index_t NumDim = NumDim_m + NumDim_n;
static constexpr int NumInput = InDataTypeTuple::Size();
static constexpr int NumOutput = OutDataTypeTuple::Size();
// const index_t NumDim = NumDim_m + NumDim_n;
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static_assert(NumInput == InScalarPerVectorSeq::Size() &&
NumOutput == OutScalarPerVectorSeq::Size(),
......@@ -67,15 +71,16 @@ struct DeviceElementwise
template <typename Desc_MN>
static auto PadDescriptor_MN_2d(Desc_MN desc_mn, index_t gridSize, index_t blockSize)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
const auto m = desc_mn.GetLength(I0);
const auto n = desc_mn.GetLength(I1);
const index_t loop_step_m = gridSize * blockSize * MPerThread;
const index_t loop_step_m = MPerThread;
const index_t loop_step_n = gridSize * blockSize * NPerThread;
const auto pad_m = math::integer_least_multiple(m, loop_step_m) - m;
const auto pad_n = math::integer_least_multiple(n, loop_step_n) - n;
std::cout << NumDim_m << " m: " << m << " loop_step_m: " << loop_step_m
<< " pad_m: " << pad_m << std::endl;
std::cout << NumDim_n << " n: " << n << " loop_step_n: " << loop_step_n
<< " pad_n: " << pad_n << std::endl;
const auto desc_mn_pad = transform_tensor_descriptor(
desc_mn,
make_tuple(make_right_pad_transform(m, pad_m), make_right_pad_transform(n, pad_n)),
......@@ -137,7 +142,6 @@ struct DeviceElementwise
using OutGrid2dDescTuple = decltype(GenerateInOutGrid2dDescTuple(Number<NumOutput>{}));
using InGrid2dDescTuple = decltype(GenerateInOutGrid2dDescTuple(Number<NumInput>{}));
// using OutGrid2dDescTuple = decltype(GenerateInOutGrid2dDescTuple(Number<NumOutput>{}));
using GridwiseElementwise = GridwiseElementwise_2D<InGrid2dDescTuple,
OutGrid2dDescTuple,
......@@ -165,6 +169,9 @@ struct DeviceElementwise
blockSize_(256),
gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future
{
static_assert(NumDim_m > 0, "");
static_assert(NumDim_n > 0, "");
in_dev_buffers_ = generate_tuple(
[&](auto I) {
using DataType = remove_cvref_t<decltype(InDataTypeTuple{}[I])>;
......@@ -257,29 +264,31 @@ struct DeviceElementwise
const std::array<index_t, NumDim>& strides,
index_t scalarPerVector,
index_t vectorDim) {
std::cout << "scalarPerVector: " << scalarPerVector << std::endl;
std::cout << "stride back: " << strides.back() << std::endl;
std::cout << "len back: " << lengths.back() << std::endl;
std::cout << "NumDim-1: " << NumDim - 1 << std::endl;
std::cout << "stride[nd-1]: " << strides[NumDim - 1] << std::endl;
std::cout << "NumDim_m-1: " << NumDim_m - 1 << std::endl;
std::cout << std::endl;
std::cout << "ISPVV Check 1 starting" << std::endl;
if(strides[vectorDim] == 1 && (lengths[vectorDim] % scalarPerVector == 0 || lengths[vectorDim]%scalarPerVector == lengths[vectorDim]))
// std::cout << "scalarPerVector: " << scalarPerVector << std::endl;
// std::cout << "stride back: " << strides.back() << std::endl;
// std::cout << "len back: " << lengths.back() << std::endl;
// std::cout << "NumDim-1: " << NumDim - 1 << std::endl;
// std::cout << "stride[nd-1]: " << strides[NumDim - 1] << std::endl;
// std::cout << "NumDim_m-1: " << NumDim_m - 1 << std::endl;
// std::cout << std::endl;
// std::cout << "ISPVV Check 1 starting" << std::endl;
if(strides[vectorDim] == 1 &&
(lengths[vectorDim] % scalarPerVector == 0 ||
lengths[vectorDim] % scalarPerVector == lengths[vectorDim]))
{
std::cout << "Check 1 passed" << std::endl;
// std::cout << "Check 1 passed" << std::endl;
return true;
}
std::cout << "Check 1 failed " << std::endl;
// std::cout << "Check 1 failed " << std::endl;
std::cout << "ISPVV Check 2 starting" << std::endl;
std::cout << "strides[vectorDim]: " << strides[vectorDim] << std::endl;
// std::cout << "ISPVV Check 2 starting" << std::endl;
// std::cout << "strides[vectorDim]: " << strides[vectorDim] << std::endl;
if(strides[vectorDim] != 1 && scalarPerVector == strides[vectorDim])
{
std::cout << "Check 2 passed " << std::endl;
// std::cout << "Check 2 passed " << std::endl;
return true;
}
std::cout << "Check 2 failed" << std::endl;
// std::cout << "Check 2 failed" << std::endl;
return false;
};
......@@ -300,16 +309,20 @@ struct DeviceElementwise
bool valid = true;
static_for<0, NumInput, 1>{}([&](auto I) {
std::cout << "running: " << I << std::endl;
if(!IsScalarPerVectorValid(
pArg->lengths_, pArg->inStridesArray_[I.value], InScalarPerVectorSeq::At(I), NumDim_m - 1))
if(!IsScalarPerVectorValid(pArg->lengths_,
pArg->inStridesArray_[I.value],
InScalarPerVectorSeq::At(I),
NumDim_m - 1))
valid = false;
});
std::cout << "valid after loop through input: " << valid << std::endl;
static_for<0, NumOutput, 1>{}([&](auto I) {
std::cout << "running 2: " << I << std::endl;
if(!IsScalarPerVectorValid(
pArg->lengths_, pArg->outStridesArray_[I.value], OutScalarPerVectorSeq::At(I), NumDim - 1))
if(!IsScalarPerVectorValid(pArg->lengths_,
pArg->outStridesArray_[I.value],
OutScalarPerVectorSeq::At(I),
NumDim - 1))
valid = false;
});
std::cout << "valid after loop through output: " << valid << std::endl;
......
......@@ -102,26 +102,20 @@ struct GridwiseElementwise_2D
Number<NumOutput>{});
const index_t blockSize = get_block_size();
const index_t blockPerGrid_m = get_grid_size();
//const index_t blockPerGrid_n = gridDim.y;
// const index_t block_1d = get_block_1d_id();
const index_t blockPerGrid = get_grid_size();
const index_t totalNumThread = blockSize * blockPerGrid;
const auto M = in_grid_2d_desc_tuple[I0].GetLength(I0);
const auto N = in_grid_2d_desc_tuple[I0].GetLength(I1);
const index_t loop_step_m = blockPerGrid_m * blockSize * MPerThread;
const index_t loop_step_n = blockPerGrid_m * blockSize * NPerThread;
const auto loop_step_index_m = make_multi_index(loop_step_m, 0);
const auto loop_step_index_n = make_multi_index(0, loop_step_n);
const index_t loop_step_m = MPerThread;
const index_t loop_step_n = totalNumThread * NPerThread;
const index_t thread_1d_id = get_thread_global_1d_id();
index_t tid_m = thread_1d_id/(N/NPerThread);
index_t tid_n = thread_1d_id%(N/NPerThread);
//index_t tid_m = thread_1d_id;
//index_t tid_n = blockDim.y * blockIdx.y + threadIdx.y;
const auto thread_global_offset = make_multi_index(tid_m* MPerThread, tid_n* NPerThread);
// make_multi_index(thread_global_id_2d[I0] * MPerThread, thread_global_id_2d[I1] *
// NPerThread);
// index_t tid_m = thread_1d_id / (N / NPerThread);
// index_t tid_n = thread_1d_id % (N / NPerThread);
const auto thread_global_offset = make_multi_index(0, thread_1d_id * NPerThread);
auto in_global_load_tuple = generate_tuple(
[&](auto I) {
......@@ -135,10 +129,10 @@ struct GridwiseElementwise_2D
decltype(thread_buffer_desc_mn),
Sequence<MPerThread, NPerThread>, // SliceLengths
Sequence<0, 1>, // DimAccessOrder
1, // SrcVectorDim
0, // SrcVectorDim
InScalarPerVectorSeq::At(I), // ScalarPerVector
1, // SrcScalarStrideInVector
false>{in_grid_2d_desc_tuple[I], thread_global_offset};
true>{in_grid_2d_desc_tuple[I], thread_global_offset};
},
Number<NumInput>{});
......@@ -154,21 +148,21 @@ struct GridwiseElementwise_2D
decltype(out_grid_2d_desc_tuple[I]),
PassThroughOp,
Sequence<MPerThread, NPerThread>, // SliceLengths
Sequence<1, 0>, // DimAccessOrder
0, // SrcVectorDim
OutScalarPerVectorSeq::At(I),
Sequence<0, 1>, // DimAccessOrder
1, // SrcVectorDim
1, // OutScalarPerVectorSeq::At(I),
InMemoryDataOperationEnum::Set,
1,
false>(out_grid_2d_desc_tuple[I], thread_global_offset, PassThroughOp{});
true>(out_grid_2d_desc_tuple[I], thread_global_offset, PassThroughOp{});
},
Number<NumOutput>{});
index_t num_iter_m = M / (loop_step_m);
index_t num_iter_n = N / (loop_step_n);
do
{
index_t num_iter_n = N / (loop_step_n);
do
{
static_for<0, NumInput, 1>{}([&](auto I) {
in_global_load_tuple(I).Run(in_grid_2d_desc_tuple[I],
in_global_buf_tuple[I],
......@@ -177,7 +171,7 @@ struct GridwiseElementwise_2D
in_thread_buf_tuple(I));
in_global_load_tuple(I).MoveSrcSliceWindow(in_grid_2d_desc_tuple[I],
loop_step_index_n);
make_multi_index(0, loop_step_n));
});
static_for<0, MPerThread, 1>{}([&](auto iM) {
......@@ -203,6 +197,9 @@ struct GridwiseElementwise_2D
});
});
// static_for<0, MPerThread * NPerThread, 1>{}(
//[&](auto i) { out_thread_buf_tuple(I0)(i) = 1; });
static_for<0, NumOutput, 1>{}([&](auto I) {
out_global_store_tuple(I).Run(thread_buffer_desc_mn,
make_tuple(I0, I0),
......@@ -211,14 +208,21 @@ struct GridwiseElementwise_2D
out_global_buf_tuple(I));
out_global_store_tuple(I).MoveDstSliceWindow(out_grid_2d_desc_tuple[I],
loop_step_index_n);
make_multi_index(0, loop_step_n));
});
} while(--num_iter_n);
static_for<0, NumInput, 1>{}([&](auto I) {
in_global_load_tuple(I).MoveSrcSliceWindow(in_grid_2d_desc_tuple[I], loop_step_index_m);
in_global_load_tuple(I).MoveSrcSliceWindow(
in_grid_2d_desc_tuple[I],
make_multi_index(loop_step_m, -(N / loop_step_n) * loop_step_n));
});
static_for<0, NumOutput, 1>{}([&](auto I){
out_global_store_tuple(I).MoveDstSliceWindow(out_grid_2d_desc_tuple[I], loop_step_index_m);
static_for<0, NumOutput, 1>{}([&](auto I) {
out_global_store_tuple(I).MoveDstSliceWindow(
out_grid_2d_desc_tuple[I],
make_multi_index(loop_step_m, -(N / loop_step_n) * loop_step_n));
});
} while(--num_iter_m);
}
......
......@@ -149,6 +149,12 @@ struct ThreadwiseTensorSliceTransfer_v1r3
const bool is_dst_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
// if(get_thread_global_1d_id() == 0)
//{
// const index_t dst_off = dst_coord_.GetOffset();
// printf("dst_off: %d\n", dst_off);
//}
// copy data from dst_vector into dst_buf
dst_buf.template Update<DstInMemOp, dst_vector_t>(
dst_coord_.GetOffset(),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment