Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
05ee41c3
Commit
05ee41c3
authored
Nov 30, 2022
by
Rosty Geyyer
Browse files
Merge branch 'develop' into lwpck-471
parents
37116c98
ad541ad6
Changes
436
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
505 additions
and
43 deletions
+505
-43
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp
...softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp
+22
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp
...softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp
+22
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp
...softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp
+22
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp
...softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp
+22
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp
...softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp
+22
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp
...softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp
+22
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp
...softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp
+22
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp
...ance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp
+38
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp
...on_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp
+22
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
...u/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
+22
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
...u/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
+22
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
...u/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
+22
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
+22
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
+22
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
+22
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
+22
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp
...stance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp
+40
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp
...peration_instance/gpu/softmax/device_softmax_instance.hpp
+8
-0
library/include/ck/library/utility/algorithm.hpp
library/include/ck/library/utility/algorithm.hpp
+43
-0
library/include/ck/library/utility/check_err.hpp
library/include/ck/library/utility/check_err.hpp
+46
-43
No files found.
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <vector>

#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Adds the F32 softmax instances for Rank = 3, NumReduceDim = 1 to `instances`.
// Declaration only; the definition is not visible in this header.
void add_device_softmax_f32_f32_rank3_reduce1_instances(
    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <vector>

#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Adds the F32 softmax instances for Rank = 3, NumReduceDim = 2 to `instances`.
// Declaration only; the definition is not visible in this header.
void add_device_softmax_f32_f32_rank3_reduce2_instances(
    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <vector>

#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Adds the F32 softmax instances for Rank = 3, NumReduceDim = 3 to `instances`.
// Declaration only; the definition is not visible in this header.
void add_device_softmax_f32_f32_rank3_reduce3_instances(
    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <vector>

#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Adds the F32 softmax instances for Rank = 4, NumReduceDim = 1 to `instances`.
// Declaration only; the definition is not visible in this header.
void add_device_softmax_f32_f32_rank4_reduce1_instances(
    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <vector>

#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Adds the F32 softmax instances for Rank = 4, NumReduceDim = 2 to `instances`.
// Declaration only; the definition is not visible in this header.
void add_device_softmax_f32_f32_rank4_reduce2_instances(
    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <vector>

#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Adds the F32 softmax instances for Rank = 4, NumReduceDim = 3 to `instances`.
// Declaration only; the definition is not visible in this header.
void add_device_softmax_f32_f32_rank4_reduce3_instances(
    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <vector>

#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Adds the F32 softmax instances for Rank = 4, NumReduceDim = 4 to `instances`.
// Declaration only; the definition is not visible in this header.
void add_device_softmax_f32_f32_rank4_reduce4_instances(
    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

// Fix: this header lacked an include guard. An alias template may not be
// redefined, so including this header twice in one TU was ill-formed; every
// sibling softmax instance header already uses `#pragma once`.
#pragma once

#include <tuple>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Tuple of F32 DeviceSoftmaxImpl configurations for the given Rank and number
// of reduced dimensions. Each entry varies the thread-cluster / thread-slice
// shape and vector access width.
template <index_t Rank, index_t Reduce>
using device_softmax_f32_f32_instances = std::tuple<
    // clang-format off
    //               InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
    // fallback kernel (scalar access: InSrcVectorSize = OutDstVectorSize = 1)
    DeviceSoftmaxImpl<F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1,  8, 1, 1, 1>,
    DeviceSoftmaxImpl<F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1,  8, 1, 4, 4>,
    DeviceSoftmaxImpl<F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 4, 64, 1,  8, 1, 4, 4>,
    DeviceSoftmaxImpl<F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1,  8, 1, 4, 4>,
    DeviceSoftmaxImpl<F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 16, 1, 4, 4>,
    DeviceSoftmaxImpl<F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 32, 1, 4, 4>,
    DeviceSoftmaxImpl<F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1,  8, 1, 4, 4>,
    DeviceSoftmaxImpl<F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 4>,
    DeviceSoftmaxImpl<F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 4>,
    // Reduction on middle dimensions
    // InSrcVectorDim is 0 since we want to coalesce reads on M dimension
    DeviceSoftmaxImpl<F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 8, 4, 0, 1, 1>,
    DeviceSoftmaxImpl<F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 8, 4, 0, 4, 4>
    // clang-format on
    >;

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

// Fix: <vector> was used below but never included (the f32 counterpart
// headers include it explicitly).
#include <vector>

#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Adds the int8 softmax instances for Rank = 3 to `instances`.
// Fix: the original declarations in this *i8* header used
// DeviceSoftmaxPtr<F16, F32, F16, ...>, which contradicts the function names
// and every sibling i8 header (all use I8, F32, I8) — almost certainly a
// copy-paste from the f16 header. NOTE(review): confirm against the matching
// .cpp definitions before merging.
void add_device_softmax_i8_i8_rank3_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);

// Adds the int8 softmax instances for Rank = 4 to `instances`.
void add_device_softmax_i8_i8_rank4_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <vector>

#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Adds the int8 softmax instances for Rank = 3, NumReduceDim = 1 to `instances`.
// Declaration only; the definition is not visible in this header.
void add_device_softmax_i8_i8_rank3_reduce1_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <vector>

#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Adds the int8 softmax instances for Rank = 3, NumReduceDim = 2 to `instances`.
// Declaration only; the definition is not visible in this header.
void add_device_softmax_i8_i8_rank3_reduce2_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <vector>

#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Adds the int8 softmax instances for Rank = 3, NumReduceDim = 3 to `instances`.
// Declaration only; the definition is not visible in this header.
void add_device_softmax_i8_i8_rank3_reduce3_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <vector>

#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Adds the int8 softmax instances for Rank = 4, NumReduceDim = 1 to `instances`.
// Declaration only; the definition is not visible in this header.
void add_device_softmax_i8_i8_rank4_reduce1_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <vector>

#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Adds the int8 softmax instances for Rank = 4, NumReduceDim = 2 to `instances`.
// Declaration only; the definition is not visible in this header.
void add_device_softmax_i8_i8_rank4_reduce2_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <vector>

#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Adds the int8 softmax instances for Rank = 4, NumReduceDim = 3 to `instances`.
// Declaration only; the definition is not visible in this header.
void add_device_softmax_i8_i8_rank4_reduce3_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <vector>

#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Adds the int8 softmax instances for Rank = 4, NumReduceDim = 4 to `instances`.
// Declaration only; the definition is not visible in this header.
void add_device_softmax_i8_i8_rank4_reduce4_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

// Fix: this header lacked an include guard. An alias template may not be
// redefined, so including this header twice in one TU was ill-formed; every
// sibling softmax instance header already uses `#pragma once`.
#pragma once

#include <tuple>

#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Tuple of int8 (F32 accumulation) DeviceSoftmaxImpl configurations for the
// given Rank and number of reduced dimensions. Each entry varies the
// thread-cluster / thread-slice shape and vector access width.
template <index_t Rank, index_t Reduce>
using device_softmax_i8_i8_instances = std::tuple<
    // clang-format off
    //               InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
    // fallback kernel (scalar access: InSrcVectorSize = OutDstVectorSize = 1)
    DeviceSoftmaxImpl<I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 16, 1, 1, 1>,
    DeviceSoftmaxImpl<I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 16, 1, 16, 16>,
    DeviceSoftmaxImpl<I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 4, 64, 1, 16, 1, 16, 16>,
    DeviceSoftmaxImpl<I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 16, 1, 16, 16>,
    DeviceSoftmaxImpl<I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 32, 1, 16, 16>,
    DeviceSoftmaxImpl<I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 64, 1, 16, 16>,
    DeviceSoftmaxImpl<I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 16, 1, 16, 16>,
    DeviceSoftmaxImpl<I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 32, 1, 16, 16>,
    DeviceSoftmaxImpl<I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 64, 1, 16, 16>,
    // Reduction on middle dimensions
    // InSrcVectorDim is 0 since we want to coalesce reads on M dimension
    DeviceSoftmaxImpl<I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 8, 8, 0, 1, 1>,
    DeviceSoftmaxImpl<I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 32, 8, 32, 8, 0, 16, 8>
    // clang-format on
    >;

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

// Umbrella header: pulls in every per-datatype softmax instance header.
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp"
library/include/ck/library/utility/algorithm.hpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <algorithm>
#include <iterator>
#include <type_traits>
#include <utility>

namespace ck {
namespace ranges {

/// Range-based wrapper around std::copy: copies every element of `range`
/// into the output sequence starting at `iter`. Returns the iterator past
/// the last written element. SFINAE-disabled for non-iterable arguments via
/// the trailing decltype.
template <typename InputRange, typename OutputIterator>
auto copy(InputRange&& range, OutputIterator iter)
    -> decltype(std::copy(std::begin(std::forward<InputRange>(range)),
                          std::end(std::forward<InputRange>(range)),
                          iter))
{
    return std::copy(std::begin(std::forward<InputRange>(range)),
                     std::end(std::forward<InputRange>(range)),
                     iter);
}

/// Range-based wrapper around std::fill: assigns `init` to every element of
/// `range`. Returns void (std::void_t collapses the SFINAE expression).
template <typename T, typename OutputRange>
auto fill(OutputRange&& range, const T& init)
    -> std::void_t<decltype(std::fill(std::begin(std::forward<OutputRange>(range)),
                                      std::end(std::forward<OutputRange>(range)),
                                      init))>
{
    std::fill(std::begin(std::forward<OutputRange>(range)),
              std::end(std::forward<OutputRange>(range)),
              init);
}

/// Range-based wrapper around std::transform: applies `unary_op` to each
/// element of `range`, writing results starting at `iter`. Returns the
/// iterator past the last written element.
/// Fix: forward `range` into std::begin/std::end like copy() and fill() do,
/// so the SFINAE constraint and value category handling are consistent
/// across all three wrappers (the original used the plain lvalue `range`
/// only here).
template <typename InputRange, typename OutputIterator, typename UnaryOperation>
auto transform(InputRange&& range, OutputIterator iter, UnaryOperation unary_op)
    -> decltype(std::transform(std::begin(std::forward<InputRange>(range)),
                               std::end(std::forward<InputRange>(range)),
                               iter,
                               unary_op))
{
    return std::transform(std::begin(std::forward<InputRange>(range)),
                          std::end(std::forward<InputRange>(range)),
                          iter,
                          unary_op);
}

} // namespace ranges
} // namespace ck
library/include/ck/library/utility/check_err.hpp
View file @
05ee41c3
...
@@ -15,18 +15,22 @@
...
@@ -15,18 +15,22 @@
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/utility/span.hpp"
#include "ck/utility/type.hpp"
#include "ck/utility/type.hpp"
#include "ck/host_utility/io.hpp"
#include "ck/host_utility/io.hpp"
#include "ck/library/utility/ranges.hpp"
namespace
ck
{
namespace
ck
{
namespace
utils
{
namespace
utils
{
template
<
typename
T
>
template
<
typename
Range
,
typename
RefRange
>
typename
std
::
enable_if
<
std
::
is_floating_point
<
T
>::
value
&&
!
std
::
is_same
<
T
,
half_t
>::
value
,
typename
std
::
enable_if
<
bool
>::
type
std
::
is_same_v
<
ranges
::
range_value_t
<
Range
>
,
ranges
::
range_value_t
<
RefRange
>>
&&
check_err
(
const
std
::
vector
<
T
>&
out
,
std
::
is_floating_point_v
<
ranges
::
range_value_t
<
Range
>>
&&
const
std
::
vector
<
T
>&
ref
,
!
std
::
is_same_v
<
ranges
::
range_value_t
<
Range
>
,
half_t
>
,
bool
>::
type
check_err
(
const
Range
&
out
,
const
RefRange
&
ref
,
const
std
::
string
&
msg
=
"Error: Incorrect results!"
,
const
std
::
string
&
msg
=
"Error: Incorrect results!"
,
double
rtol
=
1e-5
,
double
rtol
=
1e-5
,
double
atol
=
3e-6
)
double
atol
=
3e-6
)
...
@@ -44,15 +48,17 @@ check_err(const std::vector<T>& out,
...
@@ -44,15 +48,17 @@ check_err(const std::vector<T>& out,
double
max_err
=
std
::
numeric_limits
<
double
>::
min
();
double
max_err
=
std
::
numeric_limits
<
double
>::
min
();
for
(
std
::
size_t
i
=
0
;
i
<
ref
.
size
();
++
i
)
for
(
std
::
size_t
i
=
0
;
i
<
ref
.
size
();
++
i
)
{
{
err
=
std
::
abs
(
out
[
i
]
-
ref
[
i
]);
const
double
o
=
*
std
::
next
(
std
::
begin
(
out
),
i
);
if
(
err
>
atol
+
rtol
*
std
::
abs
(
ref
[
i
])
||
!
std
::
isfinite
(
out
[
i
])
||
!
std
::
isfinite
(
ref
[
i
]))
const
double
r
=
*
std
::
next
(
std
::
begin
(
ref
),
i
);
err
=
std
::
abs
(
o
-
r
);
if
(
err
>
atol
+
rtol
*
std
::
abs
(
r
)
||
!
std
::
isfinite
(
o
)
||
!
std
::
isfinite
(
r
))
{
{
max_err
=
err
>
max_err
?
err
:
max_err
;
max_err
=
err
>
max_err
?
err
:
max_err
;
err_count
++
;
err_count
++
;
if
(
err_count
<
5
)
if
(
err_count
<
5
)
{
{
std
::
cerr
<<
msg
<<
std
::
setw
(
12
)
<<
std
::
setprecision
(
7
)
<<
" out["
<<
i
std
::
cerr
<<
msg
<<
std
::
setw
(
12
)
<<
std
::
setprecision
(
7
)
<<
" out["
<<
i
<<
"] != ref["
<<
i
<<
"]: "
<<
o
ut
[
i
]
<<
" != "
<<
r
ef
[
i
]
<<
std
::
endl
;
<<
"] != ref["
<<
i
<<
"]: "
<<
o
<<
" != "
<<
r
<<
std
::
endl
;
}
}
res
=
false
;
res
=
false
;
}
}
...
@@ -64,10 +70,13 @@ check_err(const std::vector<T>& out,
...
@@ -64,10 +70,13 @@ check_err(const std::vector<T>& out,
return
res
;
return
res
;
}
}
template
<
typename
T
>
template
<
typename
Range
,
typename
RefRange
>
typename
std
::
enable_if
<
std
::
is_same
<
T
,
bhalf_t
>::
value
,
bool
>::
type
typename
std
::
enable_if
<
check_err
(
const
std
::
vector
<
T
>&
out
,
std
::
is_same_v
<
ranges
::
range_value_t
<
Range
>
,
ranges
::
range_value_t
<
RefRange
>>
&&
const
std
::
vector
<
T
>&
ref
,
std
::
is_same_v
<
ranges
::
range_value_t
<
Range
>
,
bhalf_t
>
,
bool
>::
type
check_err
(
const
Range
&
out
,
const
RefRange
&
ref
,
const
std
::
string
&
msg
=
"Error: Incorrect results!"
,
const
std
::
string
&
msg
=
"Error: Incorrect results!"
,
double
rtol
=
1e-3
,
double
rtol
=
1e-3
,
double
atol
=
1e-3
)
double
atol
=
1e-3
)
...
@@ -86,9 +95,9 @@ check_err(const std::vector<T>& out,
...
@@ -86,9 +95,9 @@ check_err(const std::vector<T>& out,
double
max_err
=
std
::
numeric_limits
<
float
>::
min
();
double
max_err
=
std
::
numeric_limits
<
float
>::
min
();
for
(
std
::
size_t
i
=
0
;
i
<
ref
.
size
();
++
i
)
for
(
std
::
size_t
i
=
0
;
i
<
ref
.
size
();
++
i
)
{
{
double
o
=
type_convert
<
float
>
(
out
[
i
]
);
const
double
o
=
type_convert
<
float
>
(
*
std
::
next
(
std
::
begin
(
out
),
i
)
);
double
r
=
type_convert
<
float
>
(
ref
[
i
]
);
const
double
r
=
type_convert
<
float
>
(
*
std
::
next
(
std
::
begin
(
ref
),
i
)
);
err
=
std
::
abs
(
o
-
r
);
err
=
std
::
abs
(
o
-
r
);
if
(
err
>
atol
+
rtol
*
std
::
abs
(
r
)
||
!
std
::
isfinite
(
o
)
||
!
std
::
isfinite
(
r
))
if
(
err
>
atol
+
rtol
*
std
::
abs
(
r
)
||
!
std
::
isfinite
(
o
)
||
!
std
::
isfinite
(
r
))
{
{
max_err
=
err
>
max_err
?
err
:
max_err
;
max_err
=
err
>
max_err
?
err
:
max_err
;
...
@@ -108,10 +117,13 @@ check_err(const std::vector<T>& out,
...
@@ -108,10 +117,13 @@ check_err(const std::vector<T>& out,
return
res
;
return
res
;
}
}
template
<
typename
T
>
template
<
typename
Range
,
typename
RefRange
>
typename
std
::
enable_if
<
std
::
is_same_v
<
T
,
half_t
>
,
bool
>::
type
typename
std
::
enable_if
<
check_err
(
span
<
const
T
>
out
,
std
::
is_same_v
<
ranges
::
range_value_t
<
Range
>
,
ranges
::
range_value_t
<
RefRange
>>
&&
span
<
const
T
>
ref
,
std
::
is_same_v
<
ranges
::
range_value_t
<
Range
>
,
half_t
>
,
bool
>::
type
check_err
(
const
Range
&
out
,
const
RefRange
&
ref
,
const
std
::
string
&
msg
=
"Error: Incorrect results!"
,
const
std
::
string
&
msg
=
"Error: Incorrect results!"
,
double
rtol
=
1e-3
,
double
rtol
=
1e-3
,
double
atol
=
1e-3
)
double
atol
=
1e-3
)
...
@@ -126,12 +138,12 @@ check_err(span<const T> out,
...
@@ -126,12 +138,12 @@ check_err(span<const T> out,
bool
res
{
true
};
bool
res
{
true
};
int
err_count
=
0
;
int
err_count
=
0
;
double
err
=
0
;
double
err
=
0
;
double
max_err
=
std
::
numeric_limits
<
T
>::
min
();
double
max_err
=
std
::
numeric_limits
<
ranges
::
range_value_t
<
Range
>
>::
min
();
for
(
std
::
size_t
i
=
0
;
i
<
ref
.
size
();
++
i
)
for
(
std
::
size_t
i
=
0
;
i
<
ref
.
size
();
++
i
)
{
{
double
o
=
type_convert
<
float
>
(
out
[
i
]
);
const
double
o
=
type_convert
<
float
>
(
*
std
::
next
(
std
::
begin
(
out
),
i
)
);
double
r
=
type_convert
<
float
>
(
ref
[
i
]
);
const
double
r
=
type_convert
<
float
>
(
*
std
::
next
(
std
::
begin
(
ref
),
i
)
);
err
=
std
::
abs
(
o
-
r
);
err
=
std
::
abs
(
o
-
r
);
if
(
err
>
atol
+
rtol
*
std
::
abs
(
r
)
||
!
std
::
isfinite
(
o
)
||
!
std
::
isfinite
(
r
))
if
(
err
>
atol
+
rtol
*
std
::
abs
(
r
)
||
!
std
::
isfinite
(
o
)
||
!
std
::
isfinite
(
r
))
{
{
max_err
=
err
>
max_err
?
err
:
max_err
;
max_err
=
err
>
max_err
?
err
:
max_err
;
...
@@ -151,26 +163,17 @@ check_err(span<const T> out,
...
@@ -151,26 +163,17 @@ check_err(span<const T> out,
return
res
;
return
res
;
}
}
template
<
typename
T
>
template
<
typename
Range
,
typename
RefRange
>
typename
std
::
enable_if
<
std
::
is_same
<
T
,
half_t
>::
value
,
bool
>::
type
std
::
enable_if_t
<
(
std
::
is_same_v
<
ranges
::
range_value_t
<
Range
>
,
ranges
::
range_value_t
<
RefRange
>>
&&
check_err
(
const
std
::
vector
<
T
>&
out
,
std
::
is_integral_v
<
ranges
::
range_value_t
<
Range
>>
&&
const
std
::
vector
<
T
>&
ref
,
!
std
::
is_same_v
<
ranges
::
range_value_t
<
Range
>
,
bhalf_t
>
)
const
std
::
string
&
msg
=
"Error: Incorrect results!"
,
double
rtol
=
1e-3
,
double
atol
=
1e-3
)
{
return
check_err
(
span
<
const
T
>
{
out
},
span
<
const
T
>
{
ref
},
msg
,
rtol
,
atol
);
}
template
<
typename
T
>
std
::
enable_if_t
<
(
std
::
is_integral_v
<
T
>
&&
!
std
::
is_same_v
<
T
,
bhalf_t
>
)
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
||
std
::
is_same_v
<
T
,
int4_t
>
||
std
::
is_same_v
<
ranges
::
range_value_t
<
Range
>
,
int4_t
>
#endif
#endif
,
,
bool
>
bool
>
check_err
(
const
std
::
vector
<
T
>
&
out
,
check_err
(
const
Range
&
out
,
const
std
::
vector
<
T
>
&
ref
,
const
RefRange
&
ref
,
const
std
::
string
&
msg
=
"Error: Incorrect results!"
,
const
std
::
string
&
msg
=
"Error: Incorrect results!"
,
double
=
0
,
double
=
0
,
double
atol
=
0
)
double
atol
=
0
)
...
@@ -188,9 +191,9 @@ check_err(const std::vector<T>& out,
...
@@ -188,9 +191,9 @@ check_err(const std::vector<T>& out,
int64_t
max_err
=
std
::
numeric_limits
<
int64_t
>::
min
();
int64_t
max_err
=
std
::
numeric_limits
<
int64_t
>::
min
();
for
(
std
::
size_t
i
=
0
;
i
<
ref
.
size
();
++
i
)
for
(
std
::
size_t
i
=
0
;
i
<
ref
.
size
();
++
i
)
{
{
int64_t
o
=
out
[
i
]
;
const
int64_t
o
=
*
std
::
next
(
std
::
begin
(
out
),
i
)
;
int64_t
r
=
ref
[
i
]
;
const
int64_t
r
=
*
std
::
next
(
std
::
begin
(
ref
),
i
)
;
err
=
std
::
abs
(
o
-
r
);
err
=
std
::
abs
(
o
-
r
);
if
(
err
>
atol
)
if
(
err
>
atol
)
{
{
...
...
Prev
1
…
9
10
11
12
13
14
15
16
17
…
22
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment