Commit 05ee41c3 authored by Rosty Geyyer

Merge branch 'develop' into lwpck-471

parents 37116c98 ad541ad6
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_f32_f32_rank3_reduce1_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
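For orientation, a caller-side sketch of how such a factory function is typically consumed. This is not part of the commit; it assumes the F32/PassThrough aliases from device_operation_instance_factory.hpp and the usual CK BaseOperator::GetTypeString() interface:

// Sketch only: enumerate the registered rank-3/reduce-1 f32 softmax instances
// and print each kernel's name.
#include <iostream>

void list_softmax_instances()
{
    using namespace ck::tensor_operation::device;
    using namespace ck::tensor_operation::device::instance;

    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>> instances;
    add_device_softmax_f32_f32_rank3_reduce1_instances(instances);

    for(const auto& op : instances)
        std::cout << op->GetTypeString() << std::endl;
}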
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_f32_f32_rank3_reduce2_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_f32_f32_rank3_reduce3_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_f32_f32_rank4_reduce1_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_f32_f32_rank4_reduce2_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_f32_f32_rank4_reduce3_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_f32_f32_rank4_reduce4_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <tuple>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
template <index_t Rank, index_t Reduce>
using device_softmax_f32_f32_instances = std::tuple<
// clang-format off
// InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel
DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4>,
DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 4>,
DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 4>,
DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 16, 1, 4, 4>,
DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 32, 1, 4, 4>,
DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 8, 1, 4, 4>,
DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 4>,
DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 4>,
// Reduction on middle dimensions
// InSrcVectorDim is 0 since we want to coalesce reads on M dimension
DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 8, 4, 0, 1, 1>,
DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 8, 4, 0, 4, 4>
// clang-format on
>;
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
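The per-(rank, reduce) translation units behind the headers above are expected to stamp this tuple out once per (Rank, Reduce) pair. A minimal sketch of one such .cpp, assuming CK's usual add_device_operation_instances helper from the instance library:

// Sketch of the matching .cpp (assumed layout; the body lives inside
// namespace ck::tensor_operation::device::instance):
void add_device_softmax_f32_f32_rank3_reduce1_instances(
    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances)
{
    // Appends every element of the instance tuple to the vector of
    // opaque device-op pointers.
    add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{});
}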
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_i8_i8_rank3_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);
void add_device_softmax_i8_i8_rank4_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_i8_i8_rank3_reduce1_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_i8_i8_rank3_reduce2_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_i8_i8_rank3_reduce3_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_i8_i8_rank4_reduce1_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_i8_i8_rank4_reduce2_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_i8_i8_rank4_reduce3_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_i8_i8_rank4_reduce4_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <tuple>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
template <index_t Rank, index_t Reduce>
using device_softmax_i8_i8_instances = std::tuple<
// clang-format off
// InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
// fallback kernel
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 16, 1, 1, 1>,
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 16, 1, 16, 16>,
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 4, 64, 1, 16, 1, 16, 16>,
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 16, 1, 16, 16>,
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 32, 1, 16, 16>,
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 64, 1, 16, 16>,
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 16, 1, 16, 16>,
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 32, 1, 16, 16>,
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 64, 1, 16, 16>,
// Reduction on middle dimensions
// InSrcVectorDim is 0 since we want to coalesce reads on M dimension
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 8, 8, 0, 1, 1>,
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 32, 8, 32, 8, 0, 16, 8>
// clang-format on
>;
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <algorithm>
#include <iterator>
#include <type_traits>
#include <utility>
namespace ck {
namespace ranges {
// NOTE: the check_err.hpp diff below constrains its overloads on
// ranges::range_value_t, but no definition appears elsewhere in this commit,
// so a minimal one is assumed here: the element type obtained by dereferencing
// std::begin of the range, stripped of cv/reference qualifiers.
template <typename Range>
using range_value_t =
    std::remove_cv_t<std::remove_reference_t<decltype(*std::begin(std::declval<Range&>()))>>;
template <typename InputRange, typename OutputIterator>
auto copy(InputRange&& range, OutputIterator iter)
-> decltype(std::copy(std::begin(std::forward<InputRange>(range)),
std::end(std::forward<InputRange>(range)),
iter))
{
return std::copy(std::begin(std::forward<InputRange>(range)),
std::end(std::forward<InputRange>(range)),
iter);
}
template <typename T, typename OutputRange>
auto fill(OutputRange&& range, const T& init)
-> std::void_t<decltype(std::fill(std::begin(std::forward<OutputRange>(range)),
std::end(std::forward<OutputRange>(range)),
init))>
{
std::fill(std::begin(std::forward<OutputRange>(range)),
std::end(std::forward<OutputRange>(range)),
init);
}
template <typename InputRange, typename OutputIterator, typename UnaryOperation>
auto transform(InputRange&& range, OutputIterator iter, UnaryOperation unary_op)
-> decltype(std::transform(std::begin(range), std::end(range), iter, unary_op))
{
return std::transform(std::begin(range), std::end(range), iter, unary_op);
}
} // namespace ranges
} // namespace ck
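A quick usage sketch of these helpers; the values are made up, and any types usable with std::begin/std::end work:

#include <array>
#include <vector>

void ranges_demo()
{
    std::array<int, 4> src{1, 2, 3, 4};
    std::vector<int> dst(4);

    ck::ranges::copy(src, dst.begin());                                   // dst = {1, 2, 3, 4}
    ck::ranges::transform(src, dst.begin(), [](int x) { return x * 2; }); // dst = {2, 4, 6, 8}
    ck::ranges::fill(dst, 0);                                             // dst = {0, 0, 0, 0}
}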
@@ -15,18 +15,22 @@
 #include "ck/ck.hpp"
 #include "ck/utility/data_type.hpp"
 #include "ck/utility/span.hpp"
 #include "ck/utility/type.hpp"
 #include "ck/host_utility/io.hpp"
+#include "ck/library/utility/ranges.hpp"
 namespace ck {
 namespace utils {
-template <typename T>
-typename std::enable_if<std::is_floating_point<T>::value && !std::is_same<T, half_t>::value,
-bool>::type
-check_err(const std::vector<T>& out,
-const std::vector<T>& ref,
+template <typename Range, typename RefRange>
+typename std::enable_if<
+std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
+std::is_floating_point_v<ranges::range_value_t<Range>> &&
+!std::is_same_v<ranges::range_value_t<Range>, half_t>,
+bool>::type
+check_err(const Range& out,
+const RefRange& ref,
 const std::string& msg = "Error: Incorrect results!",
 double rtol = 1e-5,
 double atol = 3e-6)
@@ -44,15 +48,17 @@ check_err(const std::vector<T>& out,
 double max_err = std::numeric_limits<double>::min();
 for(std::size_t i = 0; i < ref.size(); ++i)
 {
-err = std::abs(out[i] - ref[i]);
-if(err > atol + rtol * std::abs(ref[i]) || !std::isfinite(out[i]) || !std::isfinite(ref[i]))
+const double o = *std::next(std::begin(out), i);
+const double r = *std::next(std::begin(ref), i);
+err = std::abs(o - r);
+if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
 {
 max_err = err > max_err ? err : max_err;
 err_count++;
 if(err_count < 5)
 {
 std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
-<< "] != ref[" << i << "]: " << out[i] << " != " << ref[i] << std::endl;
+<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
 }
 res = false;
 }
@@ -64,10 +70,13 @@ check_err(const std::vector<T>& out,
 return res;
 }
-template <typename T>
-typename std::enable_if<std::is_same<T, bhalf_t>::value, bool>::type
-check_err(const std::vector<T>& out,
-const std::vector<T>& ref,
+template <typename Range, typename RefRange>
+typename std::enable_if<
+std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
+std::is_same_v<ranges::range_value_t<Range>, bhalf_t>,
+bool>::type
+check_err(const Range& out,
+const RefRange& ref,
 const std::string& msg = "Error: Incorrect results!",
 double rtol = 1e-3,
 double atol = 1e-3)
@@ -86,9 +95,9 @@ check_err(const std::vector<T>& out,
 double max_err = std::numeric_limits<float>::min();
 for(std::size_t i = 0; i < ref.size(); ++i)
 {
-double o = type_convert<float>(out[i]);
-double r = type_convert<float>(ref[i]);
-err = std::abs(o - r);
+const double o = type_convert<float>(*std::next(std::begin(out), i));
+const double r = type_convert<float>(*std::next(std::begin(ref), i));
+err = std::abs(o - r);
 if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
 {
 max_err = err > max_err ? err : max_err;
@@ -108,10 +117,13 @@ check_err(const std::vector<T>& out,
 return res;
 }
-template <typename T>
-typename std::enable_if<std::is_same_v<T, half_t>, bool>::type
-check_err(span<const T> out,
-span<const T> ref,
+template <typename Range, typename RefRange>
+typename std::enable_if<
+std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
+std::is_same_v<ranges::range_value_t<Range>, half_t>,
+bool>::type
+check_err(const Range& out,
+const RefRange& ref,
 const std::string& msg = "Error: Incorrect results!",
 double rtol = 1e-3,
 double atol = 1e-3)
@@ -126,12 +138,12 @@ check_err(span<const T> out,
 bool res{true};
 int err_count = 0;
 double err = 0;
-double max_err = std::numeric_limits<T>::min();
+double max_err = std::numeric_limits<ranges::range_value_t<Range>>::min();
 for(std::size_t i = 0; i < ref.size(); ++i)
 {
-double o = type_convert<float>(out[i]);
-double r = type_convert<float>(ref[i]);
-err = std::abs(o - r);
+const double o = type_convert<float>(*std::next(std::begin(out), i));
+const double r = type_convert<float>(*std::next(std::begin(ref), i));
+err = std::abs(o - r);
 if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
 {
 max_err = err > max_err ? err : max_err;
@@ -151,26 +163,17 @@ check_err(span<const T> out,
 return res;
 }
-template <typename T>
-typename std::enable_if<std::is_same<T, half_t>::value, bool>::type
-check_err(const std::vector<T>& out,
-const std::vector<T>& ref,
-const std::string& msg = "Error: Incorrect results!",
-double rtol = 1e-3,
-double atol = 1e-3)
-{
-return check_err(span<const T>{out}, span<const T>{ref}, msg, rtol, atol);
-}
-template <typename T>
-std::enable_if_t<(std::is_integral_v<T> && !std::is_same_v<T, bhalf_t>)
+template <typename Range, typename RefRange>
+std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
+std::is_integral_v<ranges::range_value_t<Range>> &&
+!std::is_same_v<ranges::range_value_t<Range>, bhalf_t>)
 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
-|| std::is_same_v<T, int4_t>
+|| std::is_same_v<ranges::range_value_t<Range>, int4_t>
 #endif
 ,
 bool>
-check_err(const std::vector<T>& out,
-const std::vector<T>& ref,
+check_err(const Range& out,
+const RefRange& ref,
 const std::string& msg = "Error: Incorrect results!",
 double = 0,
 double atol = 0)
@@ -188,9 +191,9 @@ check_err(const std::vector<T>& out,
 int64_t max_err = std::numeric_limits<int64_t>::min();
 for(std::size_t i = 0; i < ref.size(); ++i)
 {
-int64_t o = out[i];
-int64_t r = ref[i];
-err = std::abs(o - r);
+const int64_t o = *std::next(std::begin(out), i);
+const int64_t r = *std::next(std::begin(ref), i);
+err = std::abs(o - r);
 if(err > atol)
 {
......
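The net effect of the check_err changes: each overload now accepts any pair of ranges with matching element types rather than only std::vector, so mixed container kinds compare directly. A hedged usage sketch with made-up data, assuming ck::span from ck/utility/span.hpp:

#include <vector>

void check_demo(ck::span<const float> device_out)
{
    const std::vector<float> ref{0.1f, 0.2f, 0.7f};
    // span vs. vector now resolves to the floating-point overload,
    // since both ranges have element type float.
    const bool ok = ck::utils::check_err(device_out, ref, "softmax mismatch");
    (void)ok;
}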