Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
4100d1d8
Commit
4100d1d8
authored
Aug 23, 2023
by
Alan Turner
Browse files
Merge remote-tracking branch 'origin/develop' into migx-flash-attn
parents
48717006
c8a8385f
Changes
609
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
236 additions
and
359 deletions
+236
-359
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
...softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
..._instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
+0
-40
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
...softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
...softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
...softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
...softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
...softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
...softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
...softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
...on_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
+0
-40
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
...u/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
+0
-27
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
...u/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
+0
-27
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
...u/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
+0
-27
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
+0
-27
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
+0
-27
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
+0
-27
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
+0
-27
library/src/utility/device_memory.cpp
library/src/utility/device_memory.cpp
+41
-4
profiler/README.md
profiler/README.md
+81
-0
profiler/include/profiler/profile_gemm_splitk_impl.hpp
profiler/include/profiler/profile_gemm_splitk_impl.hpp
+90
-54
No files found.
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
View file @
4100d1d8
...
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f16_f16_rank4_reduce4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
4
,
4
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
RANK
,
4
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_generic_instance
<
4
,
4
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
4
,
4
>
{});
}
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
deleted
100644 → 0
View file @
48717006
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_softmax_f32_f32_rank3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
3
>>&
instances
)
{
add_device_softmax_f32_f32_rank3_reduce1_instances
(
instances
);
add_device_softmax_f32_f32_rank3_reduce2_instances
(
instances
);
add_device_softmax_f32_f32_rank3_reduce3_instances
(
instances
);
}
void
add_device_softmax_f32_f32_rank4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
4
>>&
instances
)
{
add_device_softmax_f32_f32_rank4_reduce1_instances
(
instances
);
add_device_softmax_f32_f32_rank4_reduce2_instances
(
instances
);
add_device_softmax_f32_f32_rank4_reduce3_instances
(
instances
);
add_device_softmax_f32_f32_rank4_reduce4_instances
(
instances
);
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
View file @
4100d1d8
...
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_f32_f32_rank3_reduce1_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
3
,
1
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_generic_instance
<
3
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
3
,
1
>
{});
}
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
View file @
4100d1d8
...
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_f32_f32_rank3_reduce2_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
3
,
2
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
2
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_generic_instance
<
3
,
2
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
3
,
2
>
{});
}
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
View file @
4100d1d8
...
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_f32_f32_rank3_reduce3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
3
,
3
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
3
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_generic_instance
<
3
,
3
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
3
,
3
>
{});
}
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
View file @
4100d1d8
...
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f32_f32_rank4_reduce1_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
4
,
1
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_generic_instance
<
4
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
4
,
1
>
{});
}
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
View file @
4100d1d8
...
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f32_f32_rank4_reduce2_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
4
,
2
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
2
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_generic_instance
<
4
,
2
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
4
,
2
>
{});
}
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
View file @
4100d1d8
...
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f32_f32_rank4_reduce3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
4
,
3
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
3
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_generic_instance
<
4
,
3
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
4
,
3
>
{});
}
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
View file @
4100d1d8
...
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f32_f32_rank4_reduce4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
4
,
4
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
4
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_generic_instance
<
4
,
4
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
4
,
4
>
{});
}
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
deleted
100644 → 0
View file @
48717006
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_softmax_i8_i8_rank3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
3
>>&
instances
)
{
add_device_softmax_i8_i8_rank3_reduce1_instances
(
instances
);
add_device_softmax_i8_i8_rank3_reduce2_instances
(
instances
);
add_device_softmax_i8_i8_rank3_reduce3_instances
(
instances
);
}
void
add_device_softmax_i8_i8_rank4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
4
>>&
instances
)
{
add_device_softmax_i8_i8_rank4_reduce1_instances
(
instances
);
add_device_softmax_i8_i8_rank4_reduce2_instances
(
instances
);
add_device_softmax_i8_i8_rank4_reduce3_instances
(
instances
);
add_device_softmax_i8_i8_rank4_reduce4_instances
(
instances
);
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
deleted
100644 → 0
View file @
48717006
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_i8_i8_rank3_reduce1_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_i8_i8_instances
<
RANK
,
1
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
deleted
100644 → 0
View file @
48717006
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_i8_i8_rank3_reduce2_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_i8_i8_instances
<
RANK
,
2
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
deleted
100644 → 0
View file @
48717006
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_i8_i8_rank3_reduce3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_i8_i8_instances
<
RANK
,
3
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
deleted
100644 → 0
View file @
48717006
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_i8_i8_rank4_reduce1_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_i8_i8_instances
<
RANK
,
1
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
deleted
100644 → 0
View file @
48717006
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_i8_i8_rank4_reduce2_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_i8_i8_instances
<
RANK
,
2
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
deleted
100644 → 0
View file @
48717006
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_i8_i8_rank4_reduce3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_i8_i8_instances
<
RANK
,
3
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
deleted
100644 → 0
View file @
48717006
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_i8_i8_rank4_reduce4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_i8_i8_instances
<
RANK
,
4
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/utility/device_memory.cpp
View file @
4100d1d8
...
...
@@ -10,20 +10,57 @@ DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
hip_check_error
(
hipMalloc
(
static_cast
<
void
**>
(
&
mpDeviceBuf
),
mMemSize
));
}
void
DeviceMem
::
Realloc
(
std
::
size_t
mem_size
)
{
if
(
mpDeviceBuf
)
{
hip_check_error
(
hipFree
(
mpDeviceBuf
));
}
mMemSize
=
mem_size
;
hip_check_error
(
hipMalloc
(
static_cast
<
void
**>
(
&
mpDeviceBuf
),
mMemSize
));
}
void
*
DeviceMem
::
GetDeviceBuffer
()
const
{
return
mpDeviceBuf
;
}
std
::
size_t
DeviceMem
::
GetBufferSize
()
const
{
return
mMemSize
;
}
void
DeviceMem
::
ToDevice
(
const
void
*
p
)
const
{
hip_check_error
(
hipMemcpy
(
mpDeviceBuf
,
const_cast
<
void
*>
(
p
),
mMemSize
,
hipMemcpyHostToDevice
));
if
(
mpDeviceBuf
)
{
hip_check_error
(
hipMemcpy
(
mpDeviceBuf
,
const_cast
<
void
*>
(
p
),
mMemSize
,
hipMemcpyHostToDevice
));
}
else
{
throw
std
::
runtime_error
(
"ToDevice with an empty pointer"
);
}
}
void
DeviceMem
::
FromDevice
(
void
*
p
)
const
{
hip_check_error
(
hipMemcpy
(
p
,
mpDeviceBuf
,
mMemSize
,
hipMemcpyDeviceToHost
));
if
(
mpDeviceBuf
)
{
hip_check_error
(
hipMemcpy
(
p
,
mpDeviceBuf
,
mMemSize
,
hipMemcpyDeviceToHost
));
}
else
{
throw
std
::
runtime_error
(
"FromDevice with an empty pointer"
);
}
}
void
DeviceMem
::
SetZero
()
const
{
hip_check_error
(
hipMemset
(
mpDeviceBuf
,
0
,
mMemSize
));
}
void
DeviceMem
::
SetZero
()
const
{
if
(
mpDeviceBuf
)
{
hip_check_error
(
hipMemset
(
mpDeviceBuf
,
0
,
mMemSize
));
}
}
DeviceMem
::~
DeviceMem
()
{
hip_check_error
(
hipFree
(
mpDeviceBuf
));
}
DeviceMem
::~
DeviceMem
()
{
if
(
mpDeviceBuf
)
{
hip_check_error
(
hipFree
(
mpDeviceBuf
));
}
}
profiler/README.md
View file @
4100d1d8
...
...
@@ -102,4 +102,85 @@ arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2}
arg.e_grid_desc_m_n_
{
4096, 4096
}
....
Best Perf: 58.0306 ms, 37.8942 TFlops, 27.7545 GB/s
## Profile grouped convolution backward data kernels
```
bash
# arg1: tensor operation (grouped_conv_bwd_data: Grouped Convolution Backward Data)
# arg2: data type (0: Output fp32, Weight fp32, Input fp32
# 1: Output fp16, Weight fp16, Input fp16
# 2: Output bf16, Weight bf16, Input bf16
# arg3: tensor layout (0: Output[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Input[G, N, Ho, Wo, K]
# 1: Output[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Input[N, Ho, Wo, G, K])
# arg4: verification (0: no, 1: yes)
# arg5: initialization (0: no init, 1: integer value, 2: decimal value)
# arg6: print tensor value (0: no; 1: yes)
# arg7: time kernel (0: no, 1: yes)
# Following arguments (depending on number of spatial dims):
# Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)
# G, N, K, C,
# <filter spatial dimensions>, (ie Y, X for 2D)
# <input image spatial dimensions>, (ie Hi, Wi for 2D)
# <strides>, (ie Sy, Sx for 2D)
# <dilations>, (ie Dy, Dx for 2D)
# <left padding>, (ie LeftPy, LeftPx for 2D)
# <right padding>, (ie RightPy, RightPx for 2D)
################ op datatype layout verify init log time Ndims G N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx
./bin/ckProfiler grouped_conv_bwd_data 1 0 1 1 0 1 2 32 4 192 192 3 3 28 28 1 1 1 1 1 1 1 1
```
Result (MI100, FP16, GNHWC_GKYXC_GNHWK)
```
out: dim 5, lengths {32, 4, 192, 28, 28}, strides {602112, 150528, 1, 5376, 192}
wei: dim 5, lengths {32, 192, 192, 3, 3}, strides {331776, 1728, 1, 576, 192}
in: dim 5, lengths {32, 4, 192, 28, 28}, strides {602112, 150528, 1, 5376, 192}
....
Best configuration parameters:
name: DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 2, Default, 32, 32, 2, 4, 8, 4, 1, 1>
avg_time: 0.768321
tflops: 86.6679
GB/s: 127.947
```
## Profile grouped convolution backward weight kernels
```bash
# arg1: tensor operation (grouped_conv_bwd_weight: Grouped Convolution Backward Weight)
# arg2: data type (0: Input fp32, Weight fp32, Output fp32
# 1: Input fp16, Weight fp16, Output fp16
# 2: Input bf16, Weight fp32, Output bf16)
# arg3: tensor layout (0: Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, N, K, Ho, Wo]
# 1: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]
# 2: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K]
# arg4: verification (0: no, 1: yes)
# arg5: initialization (0: no init, 1: integer value, 2: decimal value)
# arg6: print tensor value (0: no; 1: yes)
# arg7: time kernel (0: no, 1: yes)
# Following arguments (depending on number of spatial dims):
# Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)
# G, N, K, C,
# <filter spatial dimensions>, (ie Y, X for 2D)
# <input image spatial dimensions>, (ie Hi, Wi for 2D)
# <strides>, (ie Sy, Sx for 2D)
# <dilations>, (ie Dy, Dx for 2D)
# <left padding>, (ie LeftPy, LeftPx for 2D)
# <right padding>, (ie RightPy, RightPx for 2D)
# SplitK
################ op datatype layout verify init log time Ndims G N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx SplitK
./bin/ckProfiler grouped_conv_bwd_weight 1 0 1 1 0 1 2 32 256 256 512 3 3 28 28 1 1 1 1 1 0 0 0 1
```
Result (MI100, FP16, GNHWC_GKYXC_GNHWK)
```
input: dim 5, lengths {32, 512, 1024, 28, 28}, strides {411041792, 802816, 1, 28672, 1024}
weight: dim 5, lengths {32, 512, 1024, 3, 3}, strides {4718592, 9216, 1, 3072, 1024}
output: dim 5, lengths {32, 512, 512, 26, 26}, strides {177209344, 346112, 1, 13312, 512}
....
Best configuration parameters:
name: DeviceGroupedConvBwdWeight_Xdl_CShuffle<256, 256, 128, 4, Default, 8, 4, 2, 8, 4, 8, 2, 1, 1, 8>
avg_time: 68.5216
tflops: 95.337
GB/s: 69.2301
```
Note: This kernel use atomic add, this will cause output buffer to be accumulated multiple times, causing verification failure. To work around it, do not use CK's own timer and do verification at the same time.
profiler/include/profiler/profile_gemm_splitk_impl.hpp
View file @
4100d1d8
...
...
@@ -94,7 +94,6 @@ bool profile_gemm_splitk_impl(int do_verification,
a_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
b_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
c_device_buf
.
SetZero
();
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceGemmSplitK
<
ALayout
,
BLayout
,
...
...
@@ -136,77 +135,114 @@ bool profile_gemm_splitk_impl(int do_verification,
float
best_ave_time
=
0
;
float
best_tflops
=
0
;
float
best_gb_per_sec
=
0
;
float
best_kbatch
=
0
;
// profile device GEMM instances
for
(
auto
&
op_ptr
:
op_ptrs
)
{
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
static_cast
<
ADataType
*>
(
a_device_buf
.
GetDeviceBuffer
()),
static_cast
<
BDataType
*>
(
b_device_buf
.
GetDeviceBuffer
()),
static_cast
<
CDataType
*>
(
c_device_buf
.
GetDeviceBuffer
()),
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
,
a_element_op
,
b_element_op
,
c_element_op
,
KBatch
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
std
::
vector
<
int
>
kbatch_list
=
{
1
,
2
,
4
,
8
,
12
,
16
,
20
,
24
,
32
,
36
,
40
,
60
,
64
,
72
,
80
,
88
,
96
,
128
,
144
,
160
,
176
,
192
,
256
};
if
(
KBatch
>
0
)
{
// re-init C to zero before profiling next kernel
c_device_buf
.
SetZero
();
kbatch_list
=
{
KBatch
};
}
for
(
std
::
size_t
i
=
0
;
i
<
kbatch_list
.
size
();
i
++
)
{
auto
kbatch_curr
=
kbatch_list
[
i
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
static_cast
<
ADataType
*>
(
a_device_buf
.
GetDeviceBuffer
()),
static_cast
<
BDataType
*>
(
b_device_buf
.
GetDeviceBuffer
()),
static_cast
<
CDataType
*>
(
c_device_buf
.
GetDeviceBuffer
()),
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
,
a_element_op
,
b_element_op
,
c_element_op
,
kbatch_curr
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
// re-init C to zero before profiling next kernel
c_device_buf
.
SetZero
();
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
M
*
N
*
K
;
if
(
do_verification
)
{
c_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
pass
=
pass
&
ck
::
utils
::
check_err
(
c_m_n_device_result
,
c_m_n_host_result
);
if
(
do_log
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"a : "
,
a_m_k
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"b: "
,
b_k_n
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"c_host : "
,
c_m_n_host_result
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"c_device: "
,
c_m_n_device_result
.
mData
,
","
)
<<
std
::
endl
;
}
}
std
::
size_t
num_btype
=
sizeof
(
ADataType
)
*
M
*
K
+
sizeof
(
BDataType
)
*
K
*
N
+
sizeof
(
CDataType
)
*
M
*
N
;
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
M
*
N
*
K
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
op_name
<<
std
::
endl
;
std
::
size_t
num_btype
=
sizeof
(
ADataType
)
*
M
*
K
+
sizeof
(
BDataType
)
*
K
*
N
+
sizeof
(
CDataType
)
*
M
*
N
;
if
(
tflops
>
best_tflops
)
{
best_op_name
=
op_name
;
best_tflops
=
tflops
;
best_ave_time
=
ave_time
;
best_gb_per_sec
=
gb_per_sec
;
}
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
if
(
do_verification
)
{
c_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
pass
=
pass
&
ck
::
utils
::
check_err
(
c_m_n_device_result
,
c_m_n_host_result
);
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
op_name
<<
", KBatch "
<<
kbatch_curr
<<
std
::
endl
;
if
(
do_log
)
// set softer tolerances for fp8
if
constexpr
(
is_same_v
<
ADataType
,
f8_t
>
||
is_same_v
<
BDataType
,
f8_t
>
||
is_same_v
<
CDataType
,
f8_t
>
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"a : "
,
a_m_k
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"b: "
,
b_k_n
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"c_host : "
,
c_m_n_host_result
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"c_device: "
,
c_m_n_device_result
.
mData
,
","
)
<<
std
::
endl
;
std
::
string
msg
=
"Error: Incorrect results!"
;
double
rtol
=
1e-1
;
double
atol
=
1e-1
;
pass
=
pass
&
ck
::
utils
::
check_err
(
c_m_n_device_result
,
c_m_n_host_result
,
msg
,
rtol
,
atol
);
}
else
{
pass
=
pass
&
ck
::
utils
::
check_err
(
c_m_n_device_result
,
c_m_n_host_result
);
}
if
(
tflops
>
best_tflops
)
{
best_op_name
=
op_name
;
best_tflops
=
tflops
;
best_ave_time
=
ave_time
;
best_gb_per_sec
=
gb_per_sec
;
best_kbatch
=
kbatch_curr
;
}
}
}
else
{
std
::
cout
<<
op_ptr
->
GetTypeString
()
<<
" does not support this problem"
<<
std
::
endl
;
else
{
std
::
cout
<<
op_ptr
->
GetTypeString
()
<<
" does not support this problem"
<<
std
::
endl
;
}
}
}
...
...
@@ -246,7 +282,7 @@ bool profile_gemm_splitk_impl(int do_verification,
}
std
::
cout
<<
" M = "
<<
M
<<
" N = "
<<
N
<<
" K = "
<<
K
<<
" StrideA = "
<<
StrideA
<<
" StrideB = "
<<
StrideB
<<
" StrideC = "
<<
StrideC
<<
" KBatch = "
<<
KB
atch
<<
" StrideB = "
<<
StrideB
<<
" StrideC = "
<<
StrideC
<<
" KBatch = "
<<
best_kb
atch
<<
" : "
<<
best_ave_time
<<
" ms, "
<<
best_tflops
<<
" TFlops, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_op_name
<<
std
::
endl
;
...
...
Prev
1
…
23
24
25
26
27
28
29
30
31
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment