Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
72e0c1c5
Commit
72e0c1c5
authored
Jun 19, 2023
by
Rostyslav Geyyer
Browse files
Merge branch 'develop' into lwpck-739
parents
898866e0
f0c620c4
Changes
103
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
321 additions
and
344 deletions
+321
-344
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
..._instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
+0
-40
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
...softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
...softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
...softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
...softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
...softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
...softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
...softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
...on_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
+0
-40
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
...u/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
+0
-27
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
...u/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
+0
-27
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
...u/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
+0
-27
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
+0
-27
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
+0
-27
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
+0
-27
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
+0
-27
profiler/include/profiler/profile_softmax_impl.hpp
profiler/include/profiler/profile_softmax_impl.hpp
+19
-11
profiler/src/profile_softmax.cpp
profiler/src/profile_softmax.cpp
+155
-36
test/data_type/CMakeLists.txt
test/data_type/CMakeLists.txt
+3
-0
test/data_type/fp8.cpp
test/data_type/fp8.cpp
+123
-0
No files found.
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
deleted
100644 → 0
View file @
898866e0
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_softmax_f32_f32_rank3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
3
>>&
instances
)
{
add_device_softmax_f32_f32_rank3_reduce1_instances
(
instances
);
add_device_softmax_f32_f32_rank3_reduce2_instances
(
instances
);
add_device_softmax_f32_f32_rank3_reduce3_instances
(
instances
);
}
void
add_device_softmax_f32_f32_rank4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
4
>>&
instances
)
{
add_device_softmax_f32_f32_rank4_reduce1_instances
(
instances
);
add_device_softmax_f32_f32_rank4_reduce2_instances
(
instances
);
add_device_softmax_f32_f32_rank4_reduce3_instances
(
instances
);
add_device_softmax_f32_f32_rank4_reduce4_instances
(
instances
);
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
View file @
72e0c1c5
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_f32_f32_rank3_reduce1_instances
(
void
add_device_softmax_f32_f32_rank3_reduce1_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
3
,
1
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_generic_instance
<
3
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
3
,
1
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
View file @
72e0c1c5
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_f32_f32_rank3_reduce2_instances
(
void
add_device_softmax_f32_f32_rank3_reduce2_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
3
,
2
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
2
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_generic_instance
<
3
,
2
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
3
,
2
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
View file @
72e0c1c5
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_f32_f32_rank3_reduce3_instances
(
void
add_device_softmax_f32_f32_rank3_reduce3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
3
,
3
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
3
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_generic_instance
<
3
,
3
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
3
,
3
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
View file @
72e0c1c5
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f32_f32_rank4_reduce1_instances
(
void
add_device_softmax_f32_f32_rank4_reduce1_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
4
,
1
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_generic_instance
<
4
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
4
,
1
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
View file @
72e0c1c5
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f32_f32_rank4_reduce2_instances
(
void
add_device_softmax_f32_f32_rank4_reduce2_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
4
,
2
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
2
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_generic_instance
<
4
,
2
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
4
,
2
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
View file @
72e0c1c5
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f32_f32_rank4_reduce3_instances
(
void
add_device_softmax_f32_f32_rank4_reduce3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
4
,
3
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
3
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_generic_instance
<
4
,
3
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
4
,
3
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
View file @
72e0c1c5
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f32_f32_rank4_reduce4_instances
(
void
add_device_softmax_f32_f32_rank4_reduce4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
4
,
4
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
4
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_generic_instance
<
4
,
4
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
4
,
4
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
deleted
100644 → 0
View file @
898866e0
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_softmax_i8_i8_rank3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
3
>>&
instances
)
{
add_device_softmax_i8_i8_rank3_reduce1_instances
(
instances
);
add_device_softmax_i8_i8_rank3_reduce2_instances
(
instances
);
add_device_softmax_i8_i8_rank3_reduce3_instances
(
instances
);
}
void
add_device_softmax_i8_i8_rank4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
4
>>&
instances
)
{
add_device_softmax_i8_i8_rank4_reduce1_instances
(
instances
);
add_device_softmax_i8_i8_rank4_reduce2_instances
(
instances
);
add_device_softmax_i8_i8_rank4_reduce3_instances
(
instances
);
add_device_softmax_i8_i8_rank4_reduce4_instances
(
instances
);
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
deleted
100644 → 0
View file @
898866e0
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_i8_i8_rank3_reduce1_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_i8_i8_instances
<
RANK
,
1
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
deleted
100644 → 0
View file @
898866e0
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_i8_i8_rank3_reduce2_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_i8_i8_instances
<
RANK
,
2
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
deleted
100644 → 0
View file @
898866e0
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_i8_i8_rank3_reduce3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_i8_i8_instances
<
RANK
,
3
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
deleted
100644 → 0
View file @
898866e0
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_i8_i8_rank4_reduce1_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_i8_i8_instances
<
RANK
,
1
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
deleted
100644 → 0
View file @
898866e0
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_i8_i8_rank4_reduce2_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_i8_i8_instances
<
RANK
,
2
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
deleted
100644 → 0
View file @
898866e0
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_i8_i8_rank4_reduce3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_i8_i8_instances
<
RANK
,
3
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
deleted
100644 → 0
View file @
898866e0
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_i8_i8_rank4_reduce4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_i8_i8_instances
<
RANK
,
4
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
profiler/include/profiler/profile_softmax_impl.hpp
View file @
72e0c1c5
...
@@ -40,7 +40,11 @@ template <> std::string type_to_string<int8_t>() { return "int8"; }
...
@@ -40,7 +40,11 @@ template <> std::string type_to_string<int8_t>() { return "int8"; }
template
<
>
std
::
string
type_to_string
<
int32_t
>
()
{
return
"int32"
;
}
template
<
>
std
::
string
type_to_string
<
int32_t
>
()
{
return
"int32"
;
}
// clang-format on
// clang-format on
template
<
typename
InDataType
,
typename
AccDataType
,
typename
OutDataType
,
index_t
Rank
>
template
<
typename
InDataType
,
typename
AccDataType
,
typename
OutDataType
,
index_t
Rank
,
index_t
NumReduceDim
>
bool
profile_softmax_impl
(
int
do_verification
,
bool
profile_softmax_impl
(
int
do_verification
,
int
init_method
,
int
init_method
,
bool
do_log
,
bool
do_log
,
...
@@ -54,7 +58,13 @@ bool profile_softmax_impl(int do_verification,
...
@@ -54,7 +58,13 @@ bool profile_softmax_impl(int do_verification,
if
(
Rank
!=
in_length
.
size
())
if
(
Rank
!=
in_length
.
size
())
{
{
throw
std
::
runtime_error
(
"Input tensor rank is different from template argument Rank!"
);
throw
std
::
runtime_error
(
"Input tensor rank is different from template argument Rank!"
);
}
};
if
(
NumReduceDim
!=
reduce_dims
.
size
())
{
throw
std
::
runtime_error
(
"Input reduce_dims rank is different from template argument NumReduceDim!"
);
};
Tensor
<
InDataType
>
in
=
in_strides
.
empty
()
?
Tensor
<
InDataType
>
(
in_length
)
Tensor
<
InDataType
>
in
=
in_strides
.
empty
()
?
Tensor
<
InDataType
>
(
in_length
)
:
Tensor
<
InDataType
>
(
in_length
,
in_strides
);
:
Tensor
<
InDataType
>
(
in_length
,
in_strides
);
...
@@ -92,8 +102,13 @@ bool profile_softmax_impl(int do_verification,
...
@@ -92,8 +102,13 @@ bool profile_softmax_impl(int do_verification,
// add device softmax instances
// add device softmax instances
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
DeviceOp
=
tensor_operation
::
device
::
using
DeviceOp
=
tensor_operation
::
device
::
DeviceSoftmax
<
InDataType
,
DeviceSoftmax
<
InDataType
,
AccDataType
,
OutDataType
,
PassThrough
,
PassThrough
,
Rank
>
;
AccDataType
,
OutDataType
,
PassThrough
,
PassThrough
,
Rank
,
NumReduceDim
>
;
// get device op instances
// get device op instances
const
auto
instances
=
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
const
auto
instances
=
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
...
@@ -112,13 +127,6 @@ bool profile_softmax_impl(int do_verification,
...
@@ -112,13 +127,6 @@ bool profile_softmax_impl(int do_verification,
for
(
auto
&
inst_ptr
:
instances
)
for
(
auto
&
inst_ptr
:
instances
)
{
{
// Is this user's responsibility to check if problem mismatches kernel instance (ie. rank 3
// problem to rank 4 kernel) other than invoking IsSupportedArgument()?
if
(
!
(
inst_ptr
->
GetNumReduceDim
()
==
static_cast
<
index_t
>
(
reduce_dims
.
size
())))
{
continue
;
}
auto
argument_ptr
=
inst_ptr
->
MakeArgumentPointer
(
in_tensor_lengths
,
auto
argument_ptr
=
inst_ptr
->
MakeArgumentPointer
(
in_tensor_lengths
,
in_tensor_strides
,
in_tensor_strides
,
reduce_dims
,
reduce_dims
,
...
...
profiler/src/profile_softmax.cpp
View file @
72e0c1c5
...
@@ -92,27 +92,76 @@ int profile_softmax(int argc, char* argv[])
...
@@ -92,27 +92,76 @@ int profile_softmax(int argc, char* argv[])
{
{
if
(
data_type
==
SoftmaxDataType
::
F16_F16
)
if
(
data_type
==
SoftmaxDataType
::
F16_F16
)
{
{
ck
::
profiler
::
profile_softmax_impl
<
ck
::
half_t
,
float
,
ck
::
half_t
,
3
>
(
do_verification
,
if
(
reduce
.
size
()
==
1
)
init_method
,
ck
::
profiler
::
profile_softmax_impl
<
ck
::
half_t
,
float
,
ck
::
half_t
,
3
,
1
>
(
do_log
,
do_verification
,
time_kernel
,
init_method
,
length
,
do_log
,
stride
,
time_kernel
,
reduce
,
length
,
double
(
alpha
),
stride
,
double
(
beta
));
reduce
,
double
(
alpha
),
double
(
beta
));
else
if
(
reduce
.
size
()
==
2
)
ck
::
profiler
::
profile_softmax_impl
<
ck
::
half_t
,
float
,
ck
::
half_t
,
3
,
2
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
,
stride
,
reduce
,
double
(
alpha
),
double
(
beta
));
else
if
(
reduce
.
size
()
==
3
)
ck
::
profiler
::
profile_softmax_impl
<
ck
::
half_t
,
float
,
ck
::
half_t
,
3
,
3
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
,
stride
,
reduce
,
double
(
alpha
),
double
(
beta
));
else
throw
std
::
runtime_error
(
"invalid number of dimensions to reduce"
);
}
}
else
if
(
data_type
==
SoftmaxDataType
::
F32_F32
)
else
if
(
data_type
==
SoftmaxDataType
::
F32_F32
)
{
{
ck
::
profiler
::
profile_softmax_impl
<
float
,
float
,
float
,
3
>
(
do_verification
,
if
(
reduce
.
size
()
==
1
)
init_method
,
ck
::
profiler
::
profile_softmax_impl
<
float
,
float
,
float
,
3
,
1
>
(
do_verification
,
do_log
,
init_method
,
time_kernel
,
do_log
,
length
,
time_kernel
,
stride
,
length
,
reduce
,
stride
,
double
(
alpha
),
reduce
,
double
(
beta
));
double
(
alpha
),
double
(
beta
));
else
if
(
reduce
.
size
()
==
2
)
ck
::
profiler
::
profile_softmax_impl
<
float
,
float
,
float
,
3
,
2
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
,
stride
,
reduce
,
double
(
alpha
),
double
(
beta
));
else
if
(
reduce
.
size
()
==
3
)
ck
::
profiler
::
profile_softmax_impl
<
float
,
float
,
float
,
3
,
3
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
,
stride
,
reduce
,
double
(
alpha
),
double
(
beta
));
else
throw
std
::
runtime_error
(
"invalid number of dimensions to reduce"
);
}
}
else
else
{
{
...
@@ -124,27 +173,97 @@ int profile_softmax(int argc, char* argv[])
...
@@ -124,27 +173,97 @@ int profile_softmax(int argc, char* argv[])
{
{
if
(
data_type
==
SoftmaxDataType
::
F16_F16
)
if
(
data_type
==
SoftmaxDataType
::
F16_F16
)
{
{
ck
::
profiler
::
profile_softmax_impl
<
ck
::
half_t
,
float
,
ck
::
half_t
,
4
>
(
do_verification
,
if
(
reduce
.
size
()
==
1
)
init_method
,
ck
::
profiler
::
profile_softmax_impl
<
ck
::
half_t
,
float
,
ck
::
half_t
,
4
,
1
>
(
do_log
,
do_verification
,
time_kernel
,
init_method
,
length
,
do_log
,
stride
,
time_kernel
,
reduce
,
length
,
double
(
alpha
),
stride
,
double
(
beta
));
reduce
,
double
(
alpha
),
double
(
beta
));
else
if
(
reduce
.
size
()
==
2
)
ck
::
profiler
::
profile_softmax_impl
<
ck
::
half_t
,
float
,
ck
::
half_t
,
4
,
2
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
,
stride
,
reduce
,
double
(
alpha
),
double
(
beta
));
else
if
(
reduce
.
size
()
==
3
)
ck
::
profiler
::
profile_softmax_impl
<
ck
::
half_t
,
float
,
ck
::
half_t
,
4
,
3
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
,
stride
,
reduce
,
double
(
alpha
),
double
(
beta
));
else
if
(
reduce
.
size
()
==
4
)
ck
::
profiler
::
profile_softmax_impl
<
ck
::
half_t
,
float
,
ck
::
half_t
,
4
,
4
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
,
stride
,
reduce
,
double
(
alpha
),
double
(
beta
));
else
throw
std
::
runtime_error
(
"invalid number of dimensions to reduce"
);
}
}
else
if
(
data_type
==
SoftmaxDataType
::
F32_F32
)
else
if
(
data_type
==
SoftmaxDataType
::
F32_F32
)
{
{
ck
::
profiler
::
profile_softmax_impl
<
float
,
float
,
float
,
4
>
(
do_verification
,
if
(
reduce
.
size
()
==
1
)
init_method
,
ck
::
profiler
::
profile_softmax_impl
<
float
,
float
,
float
,
4
,
1
>
(
do_verification
,
do_log
,
init_method
,
time_kernel
,
do_log
,
length
,
time_kernel
,
stride
,
length
,
reduce
,
stride
,
double
(
alpha
),
reduce
,
double
(
beta
));
double
(
alpha
),
double
(
beta
));
else
if
(
reduce
.
size
()
==
2
)
ck
::
profiler
::
profile_softmax_impl
<
float
,
float
,
float
,
4
,
2
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
,
stride
,
reduce
,
double
(
alpha
),
double
(
beta
));
else
if
(
reduce
.
size
()
==
3
)
ck
::
profiler
::
profile_softmax_impl
<
float
,
float
,
float
,
4
,
3
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
,
stride
,
reduce
,
double
(
alpha
),
double
(
beta
));
else
if
(
reduce
.
size
()
==
4
)
ck
::
profiler
::
profile_softmax_impl
<
float
,
float
,
float
,
4
,
4
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
,
stride
,
reduce
,
double
(
alpha
),
double
(
beta
));
else
throw
std
::
runtime_error
(
"invalid number of dimensions to reduce"
);
}
}
else
else
{
{
...
...
test/data_type/CMakeLists.txt
View file @
72e0c1c5
...
@@ -2,3 +2,6 @@ if (USE_BITINT_EXTENSION_INT4)
...
@@ -2,3 +2,6 @@ if (USE_BITINT_EXTENSION_INT4)
add_gtest_executable
(
test_int4 int4.cpp
)
add_gtest_executable
(
test_int4 int4.cpp
)
target_link_libraries
(
test_int4 PRIVATE utility
)
target_link_libraries
(
test_int4 PRIVATE utility
)
endif
()
endif
()
add_gtest_executable
(
test_fp8 fp8.cpp
)
target_link_libraries
(
test_fp8 PRIVATE utility
)
test/data_type/fp8.cpp
0 → 100644
View file @
72e0c1c5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "ck/utility/data_type.hpp"
#include "ck/utility/type_convert.hpp"
using
ck
::
f8_convert_sr
;
using
ck
::
f8_t
;
using
ck
::
half_t
;
using
ck
::
type_convert
;
TEST
(
FP8
,
NumericLimits
)
{
EXPECT_EQ
(
ck
::
NumericLimits
<
f8_t
>::
Min
(),
0x08
);
EXPECT_EQ
(
ck
::
NumericLimits
<
f8_t
>::
Max
(),
0x77
);
EXPECT_EQ
(
ck
::
NumericLimits
<
f8_t
>::
Lowest
(),
0xF7
);
EXPECT_EQ
(
ck
::
NumericLimits
<
f8_t
>::
QuietNaN
(),
0x80
);
}
// Round-trip float -> f8_t -> float with nearest-rounding conversion and
// check the result stays within an absolute tolerance of the expected value.
TEST(FP8, ConvertFP32Nearest)
{
    // absolute tolerance shared by every comparison below
    const float tolerance = 1e-6;

    // helper: convert to f8_t (nearest rounding) and back to float
    auto round_trip = [](float value) {
        return type_convert<float>(type_convert<f8_t>(value));
    };

    // zero must survive the round trip
    ASSERT_NEAR(0.0f, round_trip(0.0f), tolerance);
    // the minimal positive float must round-trip to within tolerance
    ASSERT_NEAR(std::numeric_limits<float>::min(),
                round_trip(std::numeric_limits<float>::min()),
                tolerance);
    // the maximal f8_t value (240.0) must round-trip exactly
    ASSERT_NEAR(240.0f, round_trip(240.0f), tolerance);
    // the maximal float must be clipped to 240.0 on the way through f8_t
    ASSERT_NEAR(240.0f, round_trip(std::numeric_limits<float>::max()), tolerance);
    // +inf must convert to the quiet-NaN encoding (0x80)
    ASSERT_NEAR(0x80, type_convert<f8_t>(std::numeric_limits<float>::infinity()), tolerance);

    // a representable positive value must survive the round trip
    const float positive_value = 0.0078125f;
    ASSERT_NEAR(positive_value, round_trip(positive_value), tolerance);

    // a representable negative value must survive the round trip
    const float negative_value = -0.0156250f;
    ASSERT_NEAR(negative_value, round_trip(negative_value), tolerance);
}
// Round-trip float -> f8_t -> float using the stochastic-rounding converter
// (f8_convert_sr) and check the result against the expected value.
TEST(FP8, ConvertFP32Stochastic)
{
    // absolute tolerance shared by every comparison below
    const float tolerance = 1e-6;

    // helper: convert to f8_t with stochastic rounding and back to float
    auto round_trip = [](float value) {
        return type_convert<float>(f8_convert_sr<f8_t>(value));
    };

    // zero must survive the round trip
    ASSERT_NEAR(0.0f, round_trip(0.0f), tolerance);
    // the minimal positive float must round-trip to within tolerance
    ASSERT_NEAR(std::numeric_limits<float>::min(),
                round_trip(std::numeric_limits<float>::min()),
                tolerance);
    // the maximal f8_t value (240.0) must round-trip exactly
    ASSERT_NEAR(240.0f, round_trip(240.0f), tolerance);
    // the maximal float must be clipped to 240.0 on the way through f8_t
    ASSERT_NEAR(240.0f, round_trip(std::numeric_limits<float>::max()), tolerance);
    // +inf must convert to the quiet-NaN encoding (0x80)
    ASSERT_NEAR(0x80, f8_convert_sr<f8_t>(std::numeric_limits<float>::infinity()), tolerance);

    // a representable positive value must survive the round trip
    const float positive_value = 0.0078125f;
    ASSERT_NEAR(positive_value, round_trip(positive_value), tolerance);

    // a representable negative value must survive the round trip
    const float negative_value = -0.0156250f;
    ASSERT_NEAR(negative_value, round_trip(negative_value), tolerance);
}
// Round-trip half_t -> f8_t -> half_t with nearest-rounding conversion and
// check the result stays within an absolute tolerance of the expected value.
TEST(FP8, ConvertFP16Nearest)
{
    // absolute tolerance shared by every comparison below
    const float tolerance = 1e-3;

    // helper: convert to f8_t (nearest rounding) and back to half_t
    auto round_trip = [](half_t value) {
        return type_convert<half_t>(type_convert<f8_t>(value));
    };

    // zero must survive the round trip
    ASSERT_NEAR(half_t{0.0}, round_trip(half_t{0.0}), tolerance);
    // the minimal positive half_t must round-trip to within tolerance
    ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
                round_trip(ck::NumericLimits<half_t>::Min()),
                tolerance);
    // the maximal f8_t value (240.0) must round-trip exactly
    ASSERT_NEAR(half_t{240.0}, round_trip(half_t{240.0}), tolerance);
    // the maximal half_t must be clipped to 240.0 on the way through f8_t
    ASSERT_NEAR(half_t{240.0}, round_trip(ck::NumericLimits<half_t>::Max()), tolerance);
    // a quiet-NaN half_t must convert to the f8 quiet-NaN encoding (0x80)
    ASSERT_NEAR(0x80, type_convert<f8_t>(ck::NumericLimits<half_t>::QuietNaN()), tolerance);

    // a representable positive value must survive the round trip
    const half_t positive_value = half_t{0.0078125};
    ASSERT_NEAR(positive_value, round_trip(positive_value), tolerance);

    // a representable negative value must survive the round trip
    const half_t negative_value = half_t{-0.0156250};
    ASSERT_NEAR(negative_value, round_trip(negative_value), tolerance);
}
// Round-trip half_t -> f8_t -> half_t using the stochastic-rounding converter
// (f8_convert_sr) and check the result against the expected value.
TEST(FP8, ConvertFP16Stochastic)
{
    // absolute tolerance shared by every comparison below
    const float tolerance = 1e-3;

    // helper: convert to f8_t with stochastic rounding and back to half_t
    auto round_trip = [](half_t value) {
        return type_convert<half_t>(f8_convert_sr<f8_t>(value));
    };

    // zero must survive the round trip
    ASSERT_NEAR(half_t{0.0}, round_trip(half_t{0.0}), tolerance);
    // the minimal positive half_t must round-trip to within tolerance
    ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
                round_trip(ck::NumericLimits<half_t>::Min()),
                tolerance);
    // the maximal f8_t value (240.0) must round-trip exactly
    ASSERT_NEAR(half_t{240.0}, round_trip(half_t{240.0}), tolerance);
    // the maximal half_t must be clipped to 240.0 on the way through f8_t
    ASSERT_NEAR(half_t{240.0}, round_trip(ck::NumericLimits<half_t>::Max()), tolerance);
    // a quiet-NaN half_t must convert to the f8 quiet-NaN encoding (0x80)
    ASSERT_NEAR(0x80, f8_convert_sr<f8_t>(ck::NumericLimits<half_t>::QuietNaN()), tolerance);

    // a representable positive value must survive the round trip
    const half_t positive_value = half_t{0.0078125};
    ASSERT_NEAR(positive_value, round_trip(positive_value), tolerance);

    // a representable negative value must survive the round trip
    const half_t negative_value = half_t{-0.0156250};
    ASSERT_NEAR(negative_value, round_trip(negative_value), tolerance);
}
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment