Commit 212b9299 authored by aska-0096's avatar aska-0096
Browse files

Merge branch 'develop' of...

Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into add_fp16_wmma_conv_instance
parents 06903279 d3adc665
#pragma once
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -39,7 +39,7 @@ __global__ void
const CDEElementwiseOperation c_element_op)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
defined(__gfx940__))
defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
const index_t block_id = get_block_1d_id();
......@@ -548,11 +548,12 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
}
}
hipGetErrorString(
hipMemcpy(arg.p_workspace_,
arg.gemm_desc_kernel_arg_.data(),
arg.gemm_desc_kernel_arg_.size() * sizeof(GemmBiasTransKernelArg),
hipMemcpyHostToDevice));
hipGetErrorString(hipMemcpyWithStream(arg.p_workspace_,
arg.gemm_desc_kernel_arg_.data(),
arg.gemm_desc_kernel_arg_.size() *
sizeof(GemmBiasTransKernelArg),
hipMemcpyHostToDevice,
stream_config.stream_id_));
float ave_time = 0;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -35,7 +35,7 @@ __global__ void
const index_t group_count)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
defined(__gfx940__))
defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
__shared__ uint8_t p_shared[shared_size];
......@@ -406,10 +406,12 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
}
}
hip_check_error(hipMemcpy(arg.p_workspace_,
arg.gemm_kernel_args_.data(),
arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
hipMemcpyHostToDevice));
hip_check_error(
hipMemcpyWithStream(arg.p_workspace_,
arg.gemm_kernel_args_.data(),
arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
hipMemcpyHostToDevice,
stream_config.stream_id_));
float ave_time = 0;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/device_index_pool_bwd.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/stream_utility.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
// output[indices] = input
template <typename DOutDataType,
typename IndexDataType,
typename DInDataType,
ck::index_t InOutVectorSize>
struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDataType, DInDataType>
{
using DInDataType_AutomicAddPreCast =
conditional_t<is_same_v<DInDataType, float> || is_same_v<DInDataType, double>,
DInDataType,
float>;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using UnaryConvert = ck::tensor_operation::element_wise::UnaryConvert;
static constexpr auto I0 = Number<0>{};
template <typename Desc_M>
static auto PadDescriptor_M_1d(Desc_M desc_m, index_t loop_step)
{
const auto m = desc_m.GetLength(I0);
const auto pad = math::integer_least_multiple(m, loop_step) - m;
const auto desc_m_pad =
transform_tensor_descriptor(desc_m,
make_tuple(make_right_pad_transform(m, pad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
return desc_m_pad;
}
static auto MakeDescriptor_M(index_t length, index_t loop_step)
{
const auto desc_m = make_naive_tensor_descriptor_packed(make_tuple(length));
return PadDescriptor_M_1d(desc_m, loop_step);
}
using InOutGrid1dDesc = decltype(MakeDescriptor_M(1, 1));
using GridwisePutElementSet = GridwisePutElement_1D<InOutGrid1dDesc,
DOutDataType,
IndexDataType,
DInDataType,
PassThrough,
InMemoryDataOperationEnum::Set,
InOutVectorSize>;
using GridwisePutElementAtomicAdd = GridwisePutElement_1D<InOutGrid1dDesc,
DOutDataType,
IndexDataType,
DInDataType_AutomicAddPreCast,
PassThrough,
InMemoryDataOperationEnum::AtomicAdd,
InOutVectorSize>;
using GridwiseCasting = GridwiseElementwise_1D<Tuple<InOutGrid1dDesc>,
Tuple<InOutGrid1dDesc>,
Tuple<const DInDataType_AutomicAddPreCast*>,
Tuple<DInDataType*>,
UnaryConvert,
InOutVectorSize,
Sequence<InOutVectorSize>,
Sequence<InOutVectorSize>>;
struct Argument : public BaseArgument
{
Argument(const DOutDataType* p_dout,
const IndexDataType* p_indices,
DInDataType* p_din,
index_t dout_length,
index_t din_length,
const std::vector<ck::index_t>& window_lengths,
const std::vector<ck::index_t>& window_strides)
: p_dout_{p_dout},
p_indices_{p_indices},
p_din_{p_din},
dout_length_raw_{dout_length},
din_length_raw_{din_length},
blockSize_{256},
windowOverlap_{false}
{
for(size_t i = 0; i < window_lengths.size(); ++i)
{
windowOverlap_ |= window_lengths.at(i) > window_strides.at(i);
}
}
const DOutDataType* p_dout_;
const IndexDataType* p_indices_;
DInDataType* p_din_;
index_t dout_length_raw_;
index_t din_length_raw_;
index_t blockSize_;
bool windowOverlap_;
};
struct Invoker : public BaseInvoker
{
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
index_t gridSize = getAvailableComputeUnitCount(stream_config);
index_t loop_step = gridSize * arg.blockSize_ * InOutVectorSize;
InOutGrid1dDesc din_grid_desc = MakeDescriptor_M(arg.din_length_raw_, loop_step);
InOutGrid1dDesc dout_grid_desc = MakeDescriptor_M(arg.dout_length_raw_, loop_step);
if constexpr(is_same_v<DInDataType, float> || is_same_v<DInDataType, double>)
{
hip_check_error(hipMemsetAsync(arg.p_din_,
0,
arg.din_length_raw_ * sizeof(DInDataType),
stream_config.stream_id_));
if(arg.windowOverlap_)
{
const auto put_kernel = kernel_put_element_1d<GridwisePutElementAtomicAdd,
InOutGrid1dDesc,
DOutDataType,
IndexDataType,
DInDataType,
PassThrough>;
return launch_and_time_kernel(stream_config,
put_kernel,
dim3(gridSize),
dim3(arg.blockSize_),
0,
dout_grid_desc,
arg.p_dout_,
arg.p_indices_,
arg.p_din_,
PassThrough{});
}
else
{
const auto put_kernel = kernel_put_element_1d<GridwisePutElementSet,
InOutGrid1dDesc,
DOutDataType,
IndexDataType,
DInDataType,
PassThrough>;
return launch_and_time_kernel(stream_config,
put_kernel,
dim3(gridSize),
dim3(arg.blockSize_),
0,
dout_grid_desc,
arg.p_dout_,
arg.p_indices_,
arg.p_din_,
PassThrough{});
}
}
else
{
if(arg.windowOverlap_)
{
if(arg.p_workspace_ == nullptr)
throw std::runtime_error("wrong! WorkSpace pointer has not been set");
hip_check_error(
hipMemsetAsync(arg.p_workspace_,
0,
arg.din_length_raw_ * sizeof(DInDataType_AutomicAddPreCast),
stream_config.stream_id_));
const auto put_kernel = kernel_put_element_1d<GridwisePutElementAtomicAdd,
InOutGrid1dDesc,
DOutDataType,
IndexDataType,
DInDataType_AutomicAddPreCast,
PassThrough>;
const auto cast_kernel =
kernel_elementwise_1d<GridwiseCasting,
Tuple<InOutGrid1dDesc>,
Tuple<InOutGrid1dDesc>,
Tuple<const DInDataType_AutomicAddPreCast*>,
Tuple<DInDataType*>,
UnaryConvert>;
float elapsed_time = launch_and_time_kernel(
stream_config,
put_kernel,
dim3(gridSize),
dim3(arg.blockSize_),
0,
dout_grid_desc,
arg.p_dout_,
arg.p_indices_,
static_cast<DInDataType_AutomicAddPreCast*>(arg.p_workspace_),
PassThrough{});
elapsed_time += launch_and_time_kernel(
stream_config,
cast_kernel,
dim3(gridSize),
dim3(arg.blockSize_),
0,
ck::make_tuple(din_grid_desc),
ck::make_tuple(din_grid_desc),
static_cast<DInDataType_AutomicAddPreCast*>(arg.p_workspace_),
arg.p_din_,
UnaryConvert{});
return elapsed_time;
}
else
{
const auto put_kernel = kernel_put_element_1d<GridwisePutElementSet,
InOutGrid1dDesc,
DOutDataType,
IndexDataType,
DInDataType,
PassThrough>;
hip_check_error(hipMemsetAsync(arg.p_din_,
0,
arg.din_length_raw_ * sizeof(DInDataType),
stream_config.stream_id_));
return launch_and_time_kernel(stream_config,
put_kernel,
dim3(gridSize),
dim3(arg.blockSize_),
0,
dout_grid_desc,
arg.p_dout_,
arg.p_indices_,
arg.p_din_,
PassThrough{});
}
}
}
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
size_t GetWorkSpaceSize(const BaseArgument* pArg) const override
{
const Argument* pArg_ = dynamic_cast<const Argument*>(pArg);
bool needCast = pArg_->windowOverlap_ &&
!(is_same_v<DInDataType, float> || is_same_v<DInDataType, double>);
if(!needCast)
return 0;
else
return pArg_->din_length_raw_ * sizeof(DInDataType_AutomicAddPreCast);
};
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
if(pArg->din_length_raw_ % InOutVectorSize != 0 ||
pArg->dout_length_raw_ % InOutVectorSize != 0)
{
return false;
}
return true;
}
std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_dout,
const void* p_indices,
void* p_din,
index_t dout_length,
index_t din_length,
std::vector<ck::index_t> window_lengths,
std::vector<ck::index_t> window_strides) override
{
// Assume p_dout, p_indices, p_din are packed memory space, dout_length and din_length are
// physical size of the packed tensor
return std::make_unique<Argument>(static_cast<const DOutDataType*>(p_dout),
static_cast<const IndexDataType*>(p_indices),
static_cast<DInDataType*>(p_din),
dout_length,
din_length,
window_lengths,
window_strides);
}
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/device_put_element.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/stream_utility.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
// output[indices] = input
template <typename InDataType,
typename IndexDataType,
typename OutDataType,
typename ElementwiseOperation,
InMemoryDataOperationEnum MemOp,
ck::index_t InVectorSize>
struct DevicePutElementImpl
: public DevicePutElement<InDataType, IndexDataType, OutDataType, ElementwiseOperation, MemOp>
{
template <typename Desc_M>
static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize)
{
constexpr auto I0 = Number<0>{};
const auto m = desc_m.GetLength(I0);
const index_t loop_step = gridSize * blockSize * InVectorSize;
const auto pad = math::integer_least_multiple(m, loop_step) - m;
const auto desc_m_pad =
transform_tensor_descriptor(desc_m,
make_tuple(make_right_pad_transform(m, pad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
return desc_m_pad;
}
static auto MakeDescriptor_M(index_t length, index_t gridSize, index_t blockSize)
{
const auto desc_m = make_naive_tensor_descriptor_packed(make_tuple(length));
return PadDescriptor_M_1d(desc_m, gridSize, blockSize);
}
using InGrid1dDesc = decltype(MakeDescriptor_M(1, 1, 1));
using GridwisePutElement = GridwisePutElement_1D<InGrid1dDesc,
InDataType,
IndexDataType,
OutDataType,
ElementwiseOperation,
MemOp,
InVectorSize>;
struct Argument : public BaseArgument
{
Argument(const InDataType* p_input,
const IndexDataType* p_indices,
OutDataType* p_output,
index_t input_length,
ElementwiseOperation elementwise_op)
: p_input_{p_input},
p_indices_{p_indices},
p_output_{p_output},
input_length_raw_{input_length},
elementwise_op_{elementwise_op},
blockSize_{256}
{
}
const InDataType* p_input_;
const IndexDataType* p_indices_;
OutDataType* p_output_;
index_t input_length_raw_;
ElementwiseOperation elementwise_op_;
index_t blockSize_;
};
struct Invoker : public BaseInvoker
{
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
index_t gridSize = getAvailableComputeUnitCount(stream_config);
InGrid1dDesc in_grid_desc =
MakeDescriptor_M(arg.input_length_raw_, gridSize, arg.blockSize_);
const auto kernel = kernel_put_element_1d<GridwisePutElement,
InGrid1dDesc,
InDataType,
IndexDataType,
OutDataType,
ElementwiseOperation>;
float elapsed_time = launch_and_time_kernel(stream_config,
kernel,
dim3(gridSize),
dim3(arg.blockSize_),
0,
in_grid_desc,
arg.p_input_,
arg.p_indices_,
arg.p_output_,
arg.elementwise_op_);
return elapsed_time;
}
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
if(pArg->input_length_raw_ % InVectorSize != 0)
{
return false;
}
return true;
}
std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_input,
const void* p_indices,
void* p_output,
index_t input_length,
index_t,
ElementwiseOperation elementwise_op) override
{
return std::make_unique<Argument>(static_cast<const InDataType*>(p_input),
static_cast<const IndexDataType*>(p_indices),
static_cast<OutDataType*>(p_output),
input_length,
elementwise_op);
}
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -38,16 +38,9 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
OutDataType,
InElementwiseOp,
AccElementwiseOp,
Rank>
Rank,
NumReduceDim>
{
static constexpr index_t kRank = Rank;
static constexpr index_t kNumReduceDim = NumReduceDim;
static constexpr index_t kNumInvariantDim = Rank - NumReduceDim;
virtual index_t GetRank() const override { return kRank; }
virtual index_t GetNumReduceDim() const override { return kNumReduceDim; }
static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
static constexpr index_t NumSrcDim = Rank;
......@@ -287,13 +280,13 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
{
if constexpr(InSrcVectorDim == 0)
{
if constexpr(kNumInvariantDim == 0)
if constexpr(NumInvariantDim == 0)
{
return false;
}
else
{
if(arg.inStrides_[kNumInvariantDim - 1] != 1 && InSrcVectorSize != 1)
if(arg.inStrides_[NumInvariantDim - 1] != 1 && InSrcVectorSize != 1)
{
return false;
}
......@@ -316,7 +309,7 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
}
// To improve
if(kNumInvariantDim > 0 && arg.invariant_lowest_length_ % OutDstVectorSize != 0)
if(NumInvariantDim > 0 && arg.invariant_lowest_length_ % OutDstVectorSize != 0)
{
return false;
}
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment