Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
1b616990
Commit
1b616990
authored
Feb 05, 2025
by
aska-0096
Browse files
Merge branch 'develop' of
https://github.com/ROCm/composable_kernel
into update_cka8w8_uc
parents
af30d6b6
800cf897
Changes
553
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
685 additions
and
129 deletions
+685
-129
include/ck_tile/core/utility/amd_address_space.hpp
include/ck_tile/core/utility/amd_address_space.hpp
+0
-37
include/ck_tile/core/utility/type_traits.hpp
include/ck_tile/core/utility/type_traits.hpp
+18
-0
include/ck_tile/core/utility/unary_element_function.hpp
include/ck_tile/core/utility/unary_element_function.hpp
+9
-7
include/ck_tile/host.hpp
include/ck_tile/host.hpp
+2
-1
include/ck_tile/host/check_err.hpp
include/ck_tile/host/check_err.hpp
+112
-2
include/ck_tile/host/convolution_host_tensor_descriptor_helper.hpp
...k_tile/host/convolution_host_tensor_descriptor_helper.hpp
+27
-57
include/ck_tile/host/host_tensor.hpp
include/ck_tile/host/host_tensor.hpp
+40
-1
include/ck_tile/host/reference/reference_batched_transpose.hpp
...de/ck_tile/host/reference/reference_batched_transpose.hpp
+59
-0
include/ck_tile/host/reference/reference_fused_moe.hpp
include/ck_tile/host/reference/reference_fused_moe.hpp
+25
-16
include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp
include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp
+30
-4
include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp
include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp
+1
-1
include/ck_tile/ops/batched_transpose.hpp
include/ck_tile/ops/batched_transpose.hpp
+11
-0
include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp
...ops/batched_transpose/kernel/batched_transpose_kernel.hpp
+129
-0
include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_pipeline.hpp
...batched_transpose/pipeline/batched_transpose_pipeline.hpp
+52
-0
include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp
...s/batched_transpose/pipeline/batched_transpose_policy.hpp
+44
-0
include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_problem.hpp
.../batched_transpose/pipeline/batched_transpose_problem.hpp
+48
-0
include/ck_tile/ops/common.hpp
include/ck_tile/ops/common.hpp
+1
-1
include/ck_tile/ops/elementwise.hpp
include/ck_tile/ops/elementwise.hpp
+1
-1
include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
.../ck_tile/ops/elementwise/unary_element_wise_operation.hpp
+75
-0
include/ck_tile/ops/epilogue.hpp
include/ck_tile/ops/epilogue.hpp
+1
-1
No files found.
Too many changes to show.
To preserve performance only
553 of 553+
files are displayed.
Plain diff
Email patch
include/ck_tile/core/utility/amd_address_space.hpp
deleted
100644 → 0
View file @
af30d6b6
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core/config.hpp"
// Address Space for AMDGCN
// https://llvm.org/docs/AMDGPUUsage.html#address-space
namespace
ck_tile
{
#define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4)))
/// Reinterprets a pointer qualified with CK_CONSTANT_ADDRESS_SPACE as a plain
/// (unqualified) pointer to the same pointee type.
///
/// @tparam T pointee type
/// @param  p pointer carrying the CK_CONSTANT_ADDRESS_SPACE qualifier
/// @return the same pointer value, typed as `T*`
template <typename T>
__device__ T* cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE* p)
{
    using generic_pointer = T*;

    // "Constant" address space (4) -> "Generic" address space (0).
    // Only a C-style pointer cast seems to compile for this conversion, so the
    // old-style-cast diagnostic is silenced just around the return statement.
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wold-style-cast"
    return (generic_pointer)p; // NOLINT(old-style-cast)
#pragma clang diagnostic pop
}
/// Reinterprets a plain (unqualified) pointer as a pointer qualified with
/// CK_CONSTANT_ADDRESS_SPACE, keeping the pointee type.
///
/// @tparam T pointee type
/// @param  p unqualified pointer
/// @return the same pointer value, typed as `T CK_CONSTANT_ADDRESS_SPACE*`
template <typename T>
__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_address_space(T* p)
{
    using constant_pointer = T CK_CONSTANT_ADDRESS_SPACE*;

    // "Generic" address space (0) -> "Constant" address space (4).
    // Only a C-style pointer cast seems to compile for this conversion, so the
    // old-style-cast diagnostic is silenced just around the return statement.
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wold-style-cast"
    return (constant_pointer)p; // NOLINT(old-style-cast)
#pragma clang diagnostic pop
}
}
// namespace ck_tile
include/ck_tile/core/utility/type_traits.hpp
View file @
1b616990
...
@@ -109,4 +109,22 @@ CK_TILE_HOST_DEVICE PY c_style_pointer_cast(PX p_x)
...
@@ -109,4 +109,22 @@ CK_TILE_HOST_DEVICE PY c_style_pointer_cast(PX p_x)
#pragma clang diagnostic pop
#pragma clang diagnostic pop
}
}
/// Compile-time membership test: `is_any_of<CompareTo, Ts...>::value` is true
/// exactly when `CompareTo` is the same type as at least one of `Ts...`.
/// With an empty candidate list the result is false.
///
/// @tparam CompareTo type being looked up
/// @tparam Rest      candidate types to compare against
template <typename CompareTo, typename... Rest>
struct is_any_of : std::bool_constant<(std::is_same_v<CompareTo, Rest> || ...)>
{
    // The unary right fold over `||` yields false for an empty pack, which
    // covers the "no candidates" case without a separate specialization.
};
}
// namespace ck_tile
}
// namespace ck_tile
include/ck_tile/core/utility/unary_element_function.hpp
View file @
1b616990
...
@@ -51,16 +51,18 @@ struct composes<F>
...
@@ -51,16 +51,18 @@ struct composes<F>
template
<
typename
...
Ts
>
template
<
typename
...
Ts
>
__host__
__device__
composes
(
Ts
&&
...)
->
composes
<
remove_cvref_t
<
Ts
>
...
>
;
__host__
__device__
composes
(
Ts
&&
...)
->
composes
<
remove_cvref_t
<
Ts
>
...
>
;
template
<
typename
To
>
template
<
typename
SaturateType
>
struct
saturates
struct
saturates
{
{
template
<
typename
From
>
// NOTE: this function does not return SaturateType value
CK_TILE_HOST_DEVICE
constexpr
auto
operator
()(
const
From
&
from
)
const
// it is the user's responsibility to do a further cast or not
->
std
::
enable_if_t
<
std
::
is_arithmetic_v
<
From
>
,
From
>
template
<
typename
AccType
>
CK_TILE_HOST_DEVICE
constexpr
auto
operator
()(
const
AccType
&
a_
)
const
->
std
::
enable_if_t
<
std
::
is_arithmetic_v
<
AccType
>
,
AccType
>
{
{
return
clamp
(
from
,
return
clamp
(
a_
,
type_convert
<
From
>
(
numeric
<
To
>::
lowest
()),
type_convert
<
AccType
>
(
numeric
<
SaturateType
>::
lowest
()),
type_convert
<
From
>
(
numeric
<
To
>::
max
()));
type_convert
<
AccType
>
(
numeric
<
SaturateType
>::
max
()));
}
}
};
};
...
...
include/ck_tile/host.hpp
View file @
1b616990
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
5
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
...
@@ -34,3 +34,4 @@
...
@@ -34,3 +34,4 @@
#include "ck_tile/host/reference/reference_topk.hpp"
#include "ck_tile/host/reference/reference_topk.hpp"
#include "ck_tile/host/stream_config.hpp"
#include "ck_tile/host/stream_config.hpp"
#include "ck_tile/host/timer.hpp"
#include "ck_tile/host/timer.hpp"
#include "ck_tile/host/reference/reference_batched_transpose.hpp"
include/ck_tile/host/check_err.hpp
View file @
1b616990
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
5
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
...
@@ -18,6 +18,112 @@
...
@@ -18,6 +18,112 @@
namespace
ck_tile
{
namespace
ck_tile
{
/// Derives a relative error threshold from the data types involved in a computation.
///
/// For each of the three types the term `2^(-numeric_traits<T>::mant) * 0.5` is
/// formed (`mant` is presumably the mantissa bit count of T - confirm against
/// numeric_traits); the accumulator term is additionally scaled by
/// `number_of_accumulations`. The largest of the three terms is returned.
/// If any of the three types is an integer type (int8_t / int32_t / int) the
/// threshold is 0.
///
/// @tparam ComputeDataType type used for the computation
/// @tparam OutDataType     type of the output
/// @tparam AccDataType     type of the accumulator (defaults to ComputeDataType)
/// @param  number_of_accumulations multiplier applied to the accumulator term
/// @return the relative threshold as a double
template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
double get_relative_threshold(const int number_of_accumulations = 1)
{
    using F8   = ck_tile::fp8_t;
    using F16  = ck_tile::half_t;
    using BF16 = ck_tile::bf16_t;
    using F32  = float;
    using I8   = int8_t;
    using I32  = int32_t;

    // Reject unsupported types up front; all three checks are unconditional.
    static_assert(is_any_of<ComputeDataType, F8, F16, BF16, F32, I8, I32, int>::value,
                  "Warning: Unhandled ComputeDataType for setting up the relative threshold!");
    static_assert(is_any_of<OutDataType, F8, F16, BF16, F32, I8, I32, int>::value,
                  "Warning: Unhandled OutDataType for setting up the relative threshold!");
    static_assert(is_any_of<AccDataType, F8, F16, BF16, F32, I8, I32, int>::value,
                  "Warning: Unhandled AccDataType for setting up the relative threshold!");

    constexpr bool any_integer_type = is_any_of<ComputeDataType, I8, I32, int>::value ||
                                      is_any_of<OutDataType, I8, I32, int>::value ||
                                      is_any_of<AccDataType, I8, I32, int>::value;

    if constexpr(any_integer_type)
    {
        // Integer types in any role yield a zero threshold.
        return 0;
    }
    else
    {
        const double compute_term = std::pow(2, -numeric_traits<ComputeDataType>::mant) * 0.5;
        const double output_term  = std::pow(2, -numeric_traits<OutDataType>::mant) * 0.5;
        const double acc_term =
            std::pow(2, -numeric_traits<AccDataType>::mant) * 0.5 * number_of_accumulations;

        const double compute_or_output = std::max(compute_term, output_term);
        return std::max(acc_term, compute_or_output);
    }
}
/// Derives an absolute error threshold from the data types involved in a
/// computation and the largest magnitude expected in the result.
///
/// With `expo = log2(|max_possible_num|)`, each of the three types contributes
/// `2^(expo - numeric_traits<T>::mant) * 0.5` (`mant` is presumably the mantissa
/// bit count of T - confirm against numeric_traits); the accumulator term is
/// additionally scaled by `number_of_accumulations`. The largest of the three
/// terms is returned. If any of the three types is an integer type
/// (int8_t / int32_t / int) the threshold is 0.
///
/// @tparam ComputeDataType type used for the computation
/// @tparam OutDataType     type of the output
/// @tparam AccDataType     type of the accumulator (defaults to ComputeDataType)
/// @param  max_possible_num        value whose magnitude sets the exponent of every term
/// @param  number_of_accumulations multiplier applied to the accumulator term
/// @return the absolute threshold as a double
template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
double get_absolute_threshold(const double max_possible_num, const int number_of_accumulations = 1)
{
    using F8   = ck_tile::fp8_t;
    using F16  = ck_tile::half_t;
    using BF16 = ck_tile::bf16_t;
    using F32  = float;
    using I8   = int8_t;
    using I32  = int32_t;

    // Reject unsupported types up front; all three checks are unconditional.
    static_assert(is_any_of<ComputeDataType, F8, F16, BF16, F32, I8, I32, int>::value,
                  "Warning: Unhandled ComputeDataType for setting up the absolute threshold!");
    static_assert(is_any_of<OutDataType, F8, F16, BF16, F32, I8, I32, int>::value,
                  "Warning: Unhandled OutDataType for setting up the absolute threshold!");
    static_assert(is_any_of<AccDataType, F8, F16, BF16, F32, I8, I32, int>::value,
                  "Warning: Unhandled AccDataType for setting up the absolute threshold!");

    // Evaluated before the type dispatch, on every path.
    auto expo = std::log2(std::abs(max_possible_num));

    constexpr bool any_integer_type = is_any_of<ComputeDataType, I8, I32, int>::value ||
                                      is_any_of<OutDataType, I8, I32, int>::value ||
                                      is_any_of<AccDataType, I8, I32, int>::value;

    if constexpr(any_integer_type)
    {
        // Integer types in any role yield a zero threshold.
        return 0;
    }
    else
    {
        const double compute_term =
            std::pow(2, expo - numeric_traits<ComputeDataType>::mant) * 0.5;
        const double output_term = std::pow(2, expo - numeric_traits<OutDataType>::mant) * 0.5;
        const double acc_term =
            std::pow(2, expo - numeric_traits<AccDataType>::mant) * 0.5 * number_of_accumulations;

        const double compute_or_output = std::max(compute_term, output_term);
        return std::max(acc_term, compute_or_output);
    }
}
template
<
typename
T
>
template
<
typename
T
>
std
::
ostream
&
operator
<<
(
std
::
ostream
&
os
,
const
std
::
vector
<
T
>&
v
)
std
::
ostream
&
operator
<<
(
std
::
ostream
&
os
,
const
std
::
vector
<
T
>&
v
)
{
{
...
@@ -337,7 +443,11 @@ std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_val
...
@@ -337,7 +443,11 @@ std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_val
}
}
if
(
!
res
)
if
(
!
res
)
{
{
std
::
cerr
<<
std
::
setw
(
12
)
<<
std
::
setprecision
(
7
)
<<
"max err: "
<<
max_err
<<
std
::
endl
;
const
float
error_percent
=
static_cast
<
float
>
(
err_count
)
/
static_cast
<
float
>
(
out
.
size
())
*
100.
f
;
std
::
cerr
<<
"max err: "
<<
max_err
;
std
::
cerr
<<
", number of errors: "
<<
err_count
;
std
::
cerr
<<
", "
<<
error_percent
<<
"% wrong values"
<<
std
::
endl
;
}
}
return
res
;
return
res
;
}
}
...
...
include/ck_tile/host/convolution_host_tensor_descriptor_helper.hpp
View file @
1b616990
...
@@ -14,57 +14,41 @@ namespace detail {
...
@@ -14,57 +14,41 @@ namespace detail {
template
<
typename
OldLayout
>
template
<
typename
OldLayout
>
CK_TILE_HOST
std
::
vector
<
std
::
size_t
>
get_layout_transpose_gnchw_to_old
()
CK_TILE_HOST
std
::
vector
<
std
::
size_t
>
get_layout_transpose_gnchw_to_old
()
{
{
if
constexpr
(
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNCW
>
||
using
namespace
ck_tile
::
tensor_layout
::
convolution
;
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GKCX
>
||
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNKW
>
)
if
constexpr
(
is_any_of
<
OldLayout
,
GNCW
,
GKCX
,
GNKW
>::
value
)
{
{
return
{
0
,
1
,
2
,
3
};
return
{
0
,
1
,
2
,
3
};
}
}
else
if
constexpr
(
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNCHW
>
||
else
if
constexpr
(
is_any_of
<
OldLayout
,
GNCHW
,
GKCYX
,
GNKHW
>::
value
)
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GKCYX
>
||
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNKHW
>
)
{
{
return
{
0
,
1
,
2
,
3
,
4
};
return
{
0
,
1
,
2
,
3
,
4
};
}
}
else
if
constexpr
(
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNCDHW
>
||
else
if
constexpr
(
is_any_of
<
OldLayout
,
GNCDHW
,
GKCZYX
,
GNKDHW
>::
value
)
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GKCZYX
>
||
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNKDHW
>
)
{
{
return
{
0
,
1
,
2
,
3
,
4
,
5
};
return
{
0
,
1
,
2
,
3
,
4
,
5
};
}
}
if
constexpr
(
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNWC
>
||
if
constexpr
(
is_any_of
<
OldLayout
,
GNWC
,
GKXC
,
GNWK
>::
value
)
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GKXC
>
||
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNWK
>
)
{
{
return
{
0
,
1
,
3
,
2
};
return
{
0
,
1
,
3
,
2
};
}
}
else
if
constexpr
(
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNHWC
>
||
else
if
constexpr
(
is_any_of
<
OldLayout
,
GNHWC
,
GKYXC
,
GNHWK
>::
value
)
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GKYXC
>
||
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNHWK
>
)
{
{
return
{
0
,
1
,
4
,
2
,
3
};
return
{
0
,
1
,
4
,
2
,
3
};
}
}
else
if
constexpr
(
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNDHWC
>
||
else
if
constexpr
(
is_any_of
<
OldLayout
,
GNDHWC
,
GKZYXC
,
GNDHWK
>::
value
)
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GKZYXC
>
||
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNDHWK
>
)
{
{
return
{
0
,
1
,
5
,
2
,
3
,
4
};
return
{
0
,
1
,
5
,
2
,
3
,
4
};
}
}
else
if
constexpr
(
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
NWGC
>
||
else
if
constexpr
(
is_any_of
<
OldLayout
,
NWGC
,
KXGC
,
NWGK
>::
value
)
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
KXGC
>
||
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
NWGK
>
)
{
{
return
{
2
,
0
,
3
,
1
};
return
{
2
,
0
,
3
,
1
};
}
}
else
if
constexpr
(
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
NHWGC
>
||
else
if
constexpr
(
is_any_of
<
OldLayout
,
NHWGC
,
KYXGC
,
NHWGK
>::
value
)
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
KYXGC
>
||
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
NHWGK
>
)
{
{
return
{
3
,
0
,
4
,
1
,
2
};
return
{
3
,
0
,
4
,
1
,
2
};
}
}
else
if
constexpr
(
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
NDHWGC
>
||
else
if
constexpr
(
is_any_of
<
OldLayout
,
NDHWGC
,
KZYXGC
,
NDHWGK
>::
value
)
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
KZYXGC
>
||
std
::
is_same_v
<
OldLayout
,
ck_tile
::
tensor_layout
::
convolution
::
NDHWGK
>
)
{
{
return
{
4
,
0
,
5
,
1
,
2
,
3
};
return
{
4
,
0
,
5
,
1
,
2
,
3
};
}
}
...
@@ -83,11 +67,11 @@ template <typename InLayout>
...
@@ -83,11 +67,11 @@ template <typename InLayout>
CK_TILE_HOST
HostTensorDescriptor
CK_TILE_HOST
HostTensorDescriptor
make_input_host_tensor_descriptor_g_n_c_wis_packed
(
const
ck_tile
::
conv
::
ConvParam
&
param
)
make_input_host_tensor_descriptor_g_n_c_wis_packed
(
const
ck_tile
::
conv
::
ConvParam
&
param
)
{
{
using
namespace
ck_tile
::
tensor_layout
::
convolution
;
std
::
vector
<
std
::
size_t
>
physical_lengths
;
std
::
vector
<
std
::
size_t
>
physical_lengths
;
if
constexpr
(
std
::
is_same_v
<
InLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNCW
>
||
if
constexpr
(
is_any_of
<
InLayout
,
GNCW
,
GNCHW
,
GNCDHW
>::
value
)
std
::
is_same_v
<
InLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNCHW
>
||
std
::
is_same_v
<
InLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNCDHW
>
)
{
{
physical_lengths
=
std
::
vector
<
std
::
size_t
>
{
static_cast
<
std
::
size_t
>
(
param
.
G_
),
physical_lengths
=
std
::
vector
<
std
::
size_t
>
{
static_cast
<
std
::
size_t
>
(
param
.
G_
),
static_cast
<
std
::
size_t
>
(
param
.
N_
),
static_cast
<
std
::
size_t
>
(
param
.
N_
),
...
@@ -97,9 +81,7 @@ make_input_host_tensor_descriptor_g_n_c_wis_packed(const ck_tile::conv::ConvPara
...
@@ -97,9 +81,7 @@ make_input_host_tensor_descriptor_g_n_c_wis_packed(const ck_tile::conv::ConvPara
param
.
input_spatial_lengths_
.
begin
(),
param
.
input_spatial_lengths_
.
begin
(),
param
.
input_spatial_lengths_
.
begin
()
+
param
.
num_dim_spatial_
);
param
.
input_spatial_lengths_
.
begin
()
+
param
.
num_dim_spatial_
);
}
}
else
if
constexpr
(
std
::
is_same_v
<
InLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNWC
>
||
else
if
constexpr
(
is_any_of
<
InLayout
,
GNWC
,
GNHWC
,
GNDHWC
>::
value
)
std
::
is_same_v
<
InLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNHWC
>
||
std
::
is_same_v
<
InLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNDHWC
>
)
{
{
physical_lengths
=
std
::
vector
<
std
::
size_t
>
{
static_cast
<
std
::
size_t
>
(
param
.
G_
),
physical_lengths
=
std
::
vector
<
std
::
size_t
>
{
static_cast
<
std
::
size_t
>
(
param
.
G_
),
static_cast
<
std
::
size_t
>
(
param
.
N_
),
static_cast
<
std
::
size_t
>
(
param
.
N_
),
...
@@ -109,9 +91,7 @@ make_input_host_tensor_descriptor_g_n_c_wis_packed(const ck_tile::conv::ConvPara
...
@@ -109,9 +91,7 @@ make_input_host_tensor_descriptor_g_n_c_wis_packed(const ck_tile::conv::ConvPara
param
.
input_spatial_lengths_
.
begin
(),
param
.
input_spatial_lengths_
.
begin
(),
param
.
input_spatial_lengths_
.
begin
()
+
param
.
num_dim_spatial_
);
param
.
input_spatial_lengths_
.
begin
()
+
param
.
num_dim_spatial_
);
}
}
else
if
constexpr
(
std
::
is_same_v
<
InLayout
,
ck_tile
::
tensor_layout
::
convolution
::
NWGC
>
||
else
if
constexpr
(
is_any_of
<
InLayout
,
NWGC
,
NHWGC
,
NDHWGC
>::
value
)
std
::
is_same_v
<
InLayout
,
ck_tile
::
tensor_layout
::
convolution
::
NHWGC
>
||
std
::
is_same_v
<
InLayout
,
ck_tile
::
tensor_layout
::
convolution
::
NDHWGC
>
)
{
{
physical_lengths
=
std
::
vector
<
std
::
size_t
>
{
static_cast
<
std
::
size_t
>
(
param
.
N_
),
physical_lengths
=
std
::
vector
<
std
::
size_t
>
{
static_cast
<
std
::
size_t
>
(
param
.
N_
),
static_cast
<
std
::
size_t
>
(
param
.
G_
),
static_cast
<
std
::
size_t
>
(
param
.
G_
),
...
@@ -139,11 +119,11 @@ template <typename WeiLayout>
...
@@ -139,11 +119,11 @@ template <typename WeiLayout>
CK_TILE_HOST
HostTensorDescriptor
CK_TILE_HOST
HostTensorDescriptor
make_weight_host_tensor_descriptor_g_k_c_xs_packed
(
const
ck_tile
::
conv
::
ConvParam
&
param
)
make_weight_host_tensor_descriptor_g_k_c_xs_packed
(
const
ck_tile
::
conv
::
ConvParam
&
param
)
{
{
using
namespace
ck_tile
::
tensor_layout
::
convolution
;
std
::
vector
<
std
::
size_t
>
physical_lengths
;
std
::
vector
<
std
::
size_t
>
physical_lengths
;
if
constexpr
(
std
::
is_same_v
<
WeiLayout
,
ck_tile
::
tensor_layout
::
convolution
::
KXC
>
||
if
constexpr
(
is_any_of
<
WeiLayout
,
KXC
,
KYXC
,
KZYXC
>::
value
)
std
::
is_same_v
<
WeiLayout
,
ck_tile
::
tensor_layout
::
convolution
::
KYXC
>
||
std
::
is_same_v
<
WeiLayout
,
ck_tile
::
tensor_layout
::
convolution
::
KZYXC
>
)
{
{
if
(
param
.
G_
!=
1
)
if
(
param
.
G_
!=
1
)
{
{
...
@@ -157,9 +137,7 @@ make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck_tile::conv::ConvPara
...
@@ -157,9 +137,7 @@ make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck_tile::conv::ConvPara
param
.
filter_spatial_lengths_
.
begin
(),
param
.
filter_spatial_lengths_
.
begin
(),
param
.
filter_spatial_lengths_
.
begin
()
+
param
.
num_dim_spatial_
);
param
.
filter_spatial_lengths_
.
begin
()
+
param
.
num_dim_spatial_
);
}
}
else
if
constexpr
(
std
::
is_same_v
<
WeiLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GKCX
>
||
else
if
constexpr
(
is_any_of
<
WeiLayout
,
GKCX
,
GKCYX
,
GKCZYX
>::
value
)
std
::
is_same_v
<
WeiLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GKCYX
>
||
std
::
is_same_v
<
WeiLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GKCZYX
>
)
{
{
physical_lengths
=
std
::
vector
<
std
::
size_t
>
{
static_cast
<
std
::
size_t
>
(
param
.
G_
),
physical_lengths
=
std
::
vector
<
std
::
size_t
>
{
static_cast
<
std
::
size_t
>
(
param
.
G_
),
static_cast
<
std
::
size_t
>
(
param
.
K_
),
static_cast
<
std
::
size_t
>
(
param
.
K_
),
...
@@ -169,9 +147,7 @@ make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck_tile::conv::ConvPara
...
@@ -169,9 +147,7 @@ make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck_tile::conv::ConvPara
param
.
filter_spatial_lengths_
.
begin
(),
param
.
filter_spatial_lengths_
.
begin
(),
param
.
filter_spatial_lengths_
.
begin
()
+
param
.
num_dim_spatial_
);
param
.
filter_spatial_lengths_
.
begin
()
+
param
.
num_dim_spatial_
);
}
}
else
if
constexpr
(
std
::
is_same_v
<
WeiLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GKXC
>
||
else
if
constexpr
(
is_any_of
<
WeiLayout
,
GKXC
,
GKYXC
,
GKZYXC
>::
value
)
std
::
is_same_v
<
WeiLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GKYXC
>
||
std
::
is_same_v
<
WeiLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GKZYXC
>
)
{
{
physical_lengths
=
std
::
vector
<
std
::
size_t
>
{
static_cast
<
std
::
size_t
>
(
param
.
G_
),
physical_lengths
=
std
::
vector
<
std
::
size_t
>
{
static_cast
<
std
::
size_t
>
(
param
.
G_
),
static_cast
<
std
::
size_t
>
(
param
.
K_
),
static_cast
<
std
::
size_t
>
(
param
.
K_
),
...
@@ -181,9 +157,7 @@ make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck_tile::conv::ConvPara
...
@@ -181,9 +157,7 @@ make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck_tile::conv::ConvPara
param
.
filter_spatial_lengths_
.
begin
(),
param
.
filter_spatial_lengths_
.
begin
(),
param
.
filter_spatial_lengths_
.
begin
()
+
param
.
num_dim_spatial_
);
param
.
filter_spatial_lengths_
.
begin
()
+
param
.
num_dim_spatial_
);
}
}
else
if
constexpr
(
std
::
is_same_v
<
WeiLayout
,
ck_tile
::
tensor_layout
::
convolution
::
KXGC
>
||
else
if
constexpr
(
is_any_of
<
WeiLayout
,
KXGC
,
KYXGC
,
KZYXGC
>::
value
)
std
::
is_same_v
<
WeiLayout
,
ck_tile
::
tensor_layout
::
convolution
::
KYXGC
>
||
std
::
is_same_v
<
WeiLayout
,
ck_tile
::
tensor_layout
::
convolution
::
KZYXGC
>
)
{
{
physical_lengths
=
std
::
vector
<
std
::
size_t
>
{
static_cast
<
std
::
size_t
>
(
param
.
K_
),
physical_lengths
=
std
::
vector
<
std
::
size_t
>
{
static_cast
<
std
::
size_t
>
(
param
.
K_
),
static_cast
<
std
::
size_t
>
(
param
.
G_
),
static_cast
<
std
::
size_t
>
(
param
.
G_
),
...
@@ -211,11 +185,11 @@ template <typename OutLayout>
...
@@ -211,11 +185,11 @@ template <typename OutLayout>
CK_TILE_HOST
HostTensorDescriptor
CK_TILE_HOST
HostTensorDescriptor
make_output_host_tensor_descriptor_g_n_k_wos_packed
(
const
ck_tile
::
conv
::
ConvParam
&
param
)
make_output_host_tensor_descriptor_g_n_k_wos_packed
(
const
ck_tile
::
conv
::
ConvParam
&
param
)
{
{
using
namespace
ck_tile
::
tensor_layout
::
convolution
;
std
::
vector
<
std
::
size_t
>
physical_lengths
;
std
::
vector
<
std
::
size_t
>
physical_lengths
;
if
constexpr
(
std
::
is_same_v
<
OutLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNKW
>
||
if
constexpr
(
is_any_of
<
OutLayout
,
GNKW
,
GNKHW
,
GNKDHW
>::
value
)
std
::
is_same_v
<
OutLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNKHW
>
||
std
::
is_same_v
<
OutLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNKDHW
>
)
{
{
physical_lengths
=
std
::
vector
<
std
::
size_t
>
{
static_cast
<
std
::
size_t
>
(
param
.
G_
),
physical_lengths
=
std
::
vector
<
std
::
size_t
>
{
static_cast
<
std
::
size_t
>
(
param
.
G_
),
static_cast
<
std
::
size_t
>
(
param
.
N_
),
static_cast
<
std
::
size_t
>
(
param
.
N_
),
...
@@ -226,9 +200,7 @@ make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck_tile::conv::ConvPar
...
@@ -226,9 +200,7 @@ make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck_tile::conv::ConvPar
param
.
output_spatial_lengths_
.
begin
()
+
param
.
num_dim_spatial_
);
param
.
output_spatial_lengths_
.
begin
()
+
param
.
num_dim_spatial_
);
}
}
// separate from legacy code above
// separate from legacy code above
else
if
constexpr
(
std
::
is_same_v
<
OutLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNWK
>
||
else
if
constexpr
(
is_any_of
<
OutLayout
,
GNWK
,
GNHWK
,
GNDHWK
>::
value
)
std
::
is_same_v
<
OutLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNHWK
>
||
std
::
is_same_v
<
OutLayout
,
ck_tile
::
tensor_layout
::
convolution
::
GNDHWK
>
)
{
{
physical_lengths
=
std
::
vector
<
std
::
size_t
>
{
static_cast
<
std
::
size_t
>
(
param
.
G_
),
physical_lengths
=
std
::
vector
<
std
::
size_t
>
{
static_cast
<
std
::
size_t
>
(
param
.
G_
),
static_cast
<
std
::
size_t
>
(
param
.
N_
),
static_cast
<
std
::
size_t
>
(
param
.
N_
),
...
@@ -238,9 +210,7 @@ make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck_tile::conv::ConvPar
...
@@ -238,9 +210,7 @@ make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck_tile::conv::ConvPar
param
.
output_spatial_lengths_
.
begin
(),
param
.
output_spatial_lengths_
.
begin
(),
param
.
output_spatial_lengths_
.
begin
()
+
param
.
num_dim_spatial_
);
param
.
output_spatial_lengths_
.
begin
()
+
param
.
num_dim_spatial_
);
}
}
else
if
constexpr
(
std
::
is_same_v
<
OutLayout
,
ck_tile
::
tensor_layout
::
convolution
::
NWGK
>
||
else
if
constexpr
(
is_any_of
<
OutLayout
,
NWGK
,
NHWGK
,
NDHWGK
>::
value
)
std
::
is_same_v
<
OutLayout
,
ck_tile
::
tensor_layout
::
convolution
::
NHWGK
>
||
std
::
is_same_v
<
OutLayout
,
ck_tile
::
tensor_layout
::
convolution
::
NDHWGK
>
)
{
{
physical_lengths
=
std
::
vector
<
std
::
size_t
>
{
static_cast
<
std
::
size_t
>
(
param
.
N_
),
physical_lengths
=
std
::
vector
<
std
::
size_t
>
{
static_cast
<
std
::
size_t
>
(
param
.
N_
),
static_cast
<
std
::
size_t
>
(
param
.
G_
),
static_cast
<
std
::
size_t
>
(
param
.
G_
),
...
...
include/ck_tile/host/host_tensor.hpp
View file @
1b616990
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
5
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
...
@@ -678,4 +678,43 @@ struct HostTensor
...
@@ -678,4 +678,43 @@ struct HostTensor
Descriptor
mDesc
;
Descriptor
mDesc
;
Data
mData
;
Data
mData
;
};
};
template
<
bool
is_row_major
>
auto
host_tensor_descriptor
(
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
bool_constant
<
is_row_major
>
)
{
using
namespace
ck_tile
::
literals
;
if
constexpr
(
is_row_major
)
{
return
HostTensorDescriptor
({
row
,
col
},
{
stride
,
1
_uz
});
}
else
{
return
HostTensorDescriptor
({
row
,
col
},
{
1
_uz
,
stride
});
}
}
/// Resolves the stride to use for a 2-D tensor.
///
/// A non-zero `stride` is returned unchanged. A `stride` of 0 is replaced by
/// `col` when `is_row_major` is true and by `row` otherwise.
///
/// @tparam is_row_major selects which length substitutes for a zero stride
/// @param  row    first length
/// @param  col    second length
/// @param  stride requested stride; 0 requests the default
/// @return the resolved stride
template <bool is_row_major>
auto get_default_stride(std::size_t row,
                        std::size_t col,
                        std::size_t stride,
                        bool_constant<is_row_major>)
{
    // An explicit stride always wins.
    if(stride != 0)
    {
        return stride;
    }

    if constexpr(is_row_major)
    {
        return col;
    }
    else
    {
        return row;
    }
}
}
// namespace ck_tile
}
// namespace ck_tile
include/ck_tile/host/reference/reference_batched_transpose.hpp
0 → 100644
View file @
1b616990
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <stdexcept>
#include <string>
#include <thread>
namespace
ck_tile
{
/// Host reference for a batched 4-D layout transpose between "NCHW" and "NHWC".
///
/// For every index of the leading dimension of `x`, each element is copied to
/// `y` at the permuted position:
///   - "NCHW" -> "NHWC": y(batch, h, w, c) = x(batch, c, h, w)
///   - "NHWC" -> "NCHW": y(batch, c, h, w) = x(batch, h, w, c)
/// Extents are read from `x.mDesc.get_lengths()`; `y` is expected to already
/// have the matching permuted shape (not checked here).
///
/// @tparam Type       element type of both tensors
/// @param  x          source tensor
/// @param  y          destination tensor, written in place
/// @param  layout_in  layout of `x`, "NCHW" or "NHWC"
/// @param  layout_out layout of `y`, "NHWC" or "NCHW"
/// @throws std::runtime_error if the (layout_in, layout_out) pair is not one of
///         the two supported combinations; previously such a call silently left
///         `y` unmodified.
template <typename Type>
CK_TILE_HOST void reference_batched_transpose(const HostTensor<Type>& x,
                                              HostTensor<Type>& y,
                                              std::string layout_in  = "NCHW",
                                              std::string layout_out = "NHWC")
{
    // Resolve the direction once instead of comparing strings for every batch.
    const bool nchw_to_nhwc = (layout_in == "NCHW" && layout_out == "NHWC");
    const bool nhwc_to_nchw = (layout_in == "NHWC" && layout_out == "NCHW");

    // Fail before dispatching any work so the error surfaces on the calling
    // thread rather than from inside the per-batch functor.
    if(!(nchw_to_nhwc || nhwc_to_nchw))
    {
        throw std::runtime_error("reference_batched_transpose: unsupported layout pair " +
                                 layout_in + " -> " + layout_out);
    }

    const int N = x.mDesc.get_lengths()[0];

    auto f = [&](auto batch) {
        if(nchw_to_nhwc)
        {
            const int C = x.mDesc.get_lengths()[1];
            const int H = x.mDesc.get_lengths()[2];
            const int W = x.mDesc.get_lengths()[3];
            for(int c = 0; c < C; ++c)
            {
                for(int h = 0; h < H; ++h)
                {
                    for(int w = 0; w < W; ++w)
                    {
                        Type v_x          = x(batch, c, h, w);
                        y(batch, h, w, c) = v_x;
                    }
                }
            }
        }
        else // nhwc_to_nchw, guaranteed by the validation above
        {
            const int H = x.mDesc.get_lengths()[1];
            const int W = x.mDesc.get_lengths()[2];
            const int C = x.mDesc.get_lengths()[3];
            for(int h = 0; h < H; ++h)
            {
                for(int w = 0; w < W; ++w)
                {
                    for(int c = 0; c < C; ++c)
                    {
                        Type v_x          = x(batch, h, w, c);
                        y(batch, c, h, w) = v_x;
                    }
                }
            }
        }
    };

    // One functor invocation per batch index; the hardware thread count is
    // handed to the parallel functor as its argument.
    make_ParallelTensorFunctor(f, N)(std::thread::hardware_concurrency());
}
}
// namespace ck_tile
include/ck_tile/host/reference/reference_fused_moe.hpp
View file @
1b616990
...
@@ -73,7 +73,7 @@ void reference_fused_moe(
...
@@ -73,7 +73,7 @@ void reference_fused_moe(
ck_tile
::
index_t
tokens
,
ck_tile
::
index_t
tokens
,
ck_tile
::
index_t
experts
,
ck_tile
::
index_t
experts
,
ck_tile
::
index_t
hidden_size
,
ck_tile
::
index_t
hidden_size
,
ck_tile
::
index_t
intermediate_size
,
// this size is for gate/up
ck_tile
::
index_t
intermediate_size
,
// this size is for gate/up
/down
ck_tile
::
index_t
topk
,
ck_tile
::
index_t
topk
,
ck_tile
::
index_t
gate_only
)
ck_tile
::
index_t
gate_only
)
{
{
...
@@ -82,19 +82,8 @@ void reference_fused_moe(
...
@@ -82,19 +82,8 @@ void reference_fused_moe(
assert
(
sorted_expert_ids_host
.
get_num_of_dimension
()
==
1
);
assert
(
sorted_expert_ids_host
.
get_num_of_dimension
()
==
1
);
assert
(
num_sorted_tiles_host
.
get_element_size
()
==
1
);
assert
(
num_sorted_tiles_host
.
get_element_size
()
==
1
);
ck_tile
::
index_t
num_sorted_tiles
=
num_sorted_tiles_host
.
mData
[
0
]
/
block_m
;
ck_tile
::
index_t
num_sorted_tiles
=
num_sorted_tiles_host
.
mData
[
0
]
/
block_m
;
ck_tile
::
index_t
intermediate_size_0
=
intermediate_size
;
ck_tile
::
index_t
intermediate_size_0
=
intermediate_size
*
(
gate_only
?
1
:
2
);
ck_tile
::
index_t
intermediate_size_1
=
intermediate_size
/
(
gate_only
?
1
:
2
);
ck_tile
::
index_t
intermediate_size_1
=
intermediate_size
;
// TODO: better remove this in the future, or modify the token_id value
auto
get_topk_id
=
[
&
](
ck_tile
::
index_t
token_id_
,
ck_tile
::
index_t
expert_id_
)
{
for
(
ck_tile
::
index_t
i_
=
0
;
i_
<
topk
;
i_
++
)
{
if
(
token_ids_host
(
token_id_
,
i_
)
==
expert_id_
)
return
i_
;
}
throw
std
::
runtime_error
(
"not correct token/expert pair
\n
"
);
return
-
1
;
// TODO: not correct!!
};
ck_tile
::
HostTensor
<
AccDataType
>
out_topk_tokens
({
tokens
,
topk
,
hidden_size
});
ck_tile
::
HostTensor
<
AccDataType
>
out_topk_tokens
({
tokens
,
topk
,
hidden_size
});
...
@@ -105,11 +94,31 @@ void reference_fused_moe(
...
@@ -105,11 +94,31 @@ void reference_fused_moe(
if
(
i_tile
>=
num_sorted_tiles
)
if
(
i_tile
>=
num_sorted_tiles
)
return
;
return
;
ck_tile
::
index_t
i_expert
=
sorted_expert_ids_host
.
mData
[
i_tile
];
ck_tile
::
index_t
i_expert
=
sorted_expert_ids_host
.
mData
[
i_tile
];
ck_tile
::
index_t
i_token
=
sorted_token_ids_host
.
mData
[
i_flatten
];
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
ck_tile
::
index_t
i_token
=
sorted_token_ids_host
.
mData
[
i_flatten
];
ck_tile
::
index_t
i_topk
=
i_token
>>
24
;
i_token
&=
0xffffff
;
if
(
i_token
>=
tokens
)
return
;
(
void
)
token_ids_host
;
#else
// TODO: better remove this in the future, or modify the token_id value
auto
get_topk_id
=
[
&
](
ck_tile
::
index_t
token_id_
,
ck_tile
::
index_t
expert_id_
)
{
for
(
ck_tile
::
index_t
i_
=
0
;
i_
<
topk
;
i_
++
)
{
if
(
token_ids_host
(
token_id_
,
i_
)
==
expert_id_
)
return
i_
;
}
throw
std
::
runtime_error
(
"not correct token/expert pair
\n
"
);
return
-
1
;
// TODO: not correct!!
};
ck_tile
::
index_t
i_token
=
sorted_token_ids_host
.
mData
[
i_flatten
];
if
(
i_token
>=
tokens
)
if
(
i_token
>=
tokens
)
return
;
return
;
ck_tile
::
index_t
i_topk
=
get_topk_id
(
i_token
,
i_expert
);
// TODO: ugly
ck_tile
::
index_t
i_topk
=
get_topk_id
(
i_token
,
i_expert
);
// TODO: ugly
auto
weight
=
sorted_weight_host
.
mData
[
i_flatten
];
#endif
auto
weight
=
sorted_weight_host
.
mData
[
i_flatten
];
ck_tile
::
HostTensor
<
AccDataType
>
acc_0
({
1
,
intermediate_size_0
});
ck_tile
::
HostTensor
<
AccDataType
>
acc_0
({
1
,
intermediate_size_0
});
// first gemm
// first gemm
...
...
include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp
View file @
1b616990
...
@@ -8,16 +8,40 @@
...
@@ -8,16 +8,40 @@
namespace
ck_tile
{
namespace
ck_tile
{
// Note: for simplicity, each functor only care about single M
struct
reference_rmsnorm2d_default_epilogue
{
template
<
typename
OutDataType
,
typename
AccDataType
>
void
operator
()(
int
m
,
HostTensor
<
OutDataType
>&
o
,
const
HostTensor
<
AccDataType
>&
acc
)
{
const
int
N
=
acc
.
mDesc
.
get_lengths
()[
1
];
for
(
int
n
=
0
;
n
<
N
;
++
n
)
{
o
(
m
,
n
)
=
ck_tile
::
type_convert
<
OutDataType
>
(
acc
(
m
,
n
));
}
}
template
<
typename
OutDataType
,
typename
AccDataType
>
auto
operator
()(
int
m
,
const
HostTensor
<
AccDataType
>&
acc
)
{
HostTensor
<
OutDataType
>
o
(
acc
.
get_lengths
(),
acc
.
get_strides
());
operator
()(
m
,
o
,
acc
);
return
o
;
}
};
template
<
typename
XDataType
,
template
<
typename
XDataType
,
typename
GammaDataType
,
typename
GammaDataType
,
typename
ComputeDataType
,
typename
ComputeDataType
,
typename
YDataType
,
typename
YDataType
,
typename
InvRmsDataType
>
typename
InvRmsDataType
,
typename
Epilogue
=
reference_rmsnorm2d_default_epilogue
>
void
reference_rmsnorm2d_fwd
(
const
HostTensor
<
XDataType
>&
x_m_n
,
void
reference_rmsnorm2d_fwd
(
const
HostTensor
<
XDataType
>&
x_m_n
,
const
HostTensor
<
GammaDataType
>&
gamma_n
,
const
HostTensor
<
GammaDataType
>&
gamma_n
,
HostTensor
<
YDataType
>&
y_m_n
,
HostTensor
<
YDataType
>&
y_m_n
,
HostTensor
<
InvRmsDataType
>&
invRms_m
,
HostTensor
<
InvRmsDataType
>&
invRms_m
,
ComputeDataType
epsilon
)
ComputeDataType
epsilon
,
Epilogue
epilogue_functor
=
{})
{
{
auto
rmsnorm2d_fwd_func
=
[
&
](
auto
m
)
{
auto
rmsnorm2d_fwd_func
=
[
&
](
auto
m
)
{
const
int
N
=
x_m_n
.
mDesc
.
get_lengths
()[
1
];
const
int
N
=
x_m_n
.
mDesc
.
get_lengths
()[
1
];
...
@@ -37,13 +61,15 @@ void reference_rmsnorm2d_fwd(const HostTensor<XDataType>& x_m_n,
...
@@ -37,13 +61,15 @@ void reference_rmsnorm2d_fwd(const HostTensor<XDataType>& x_m_n,
if
constexpr
(
!
std
::
is_same_v
<
InvRmsDataType
,
ck_tile
::
null_type
>
)
if
constexpr
(
!
std
::
is_same_v
<
InvRmsDataType
,
ck_tile
::
null_type
>
)
invRms_m
(
m
)
=
ck_tile
::
type_convert
<
InvRmsDataType
>
(
divisor
);
invRms_m
(
m
)
=
ck_tile
::
type_convert
<
InvRmsDataType
>
(
divisor
);
HostTensor
<
ComputeDataType
>
acc
(
x_m_n
.
get_lengths
(),
x_m_n
.
get_strides
());
for
(
int
n
=
0
;
n
<
N
;
++
n
)
for
(
int
n
=
0
;
n
<
N
;
++
n
)
{
{
ComputeDataType
x
=
ck_tile
::
type_convert
<
ComputeDataType
>
(
x_m_n
(
m
,
n
));
ComputeDataType
x
=
ck_tile
::
type_convert
<
ComputeDataType
>
(
x_m_n
(
m
,
n
));
ComputeDataType
gamma
=
ck_tile
::
type_convert
<
ComputeDataType
>
(
gamma_n
(
n
));
ComputeDataType
gamma
=
ck_tile
::
type_convert
<
ComputeDataType
>
(
gamma_n
(
n
));
auto
y
=
x
*
divisor
*
gamma
;
acc
(
m
,
n
)
=
x
*
divisor
*
gamma
;
y_m_n
(
m
,
n
)
=
ck_tile
::
type_convert
<
YDataType
>
(
y
);
}
}
epilogue_functor
(
m
,
y_m_n
,
acc
);
};
};
make_ParallelTensorFunctor
(
rmsnorm2d_fwd_func
,
invRms_m
.
mDesc
.
get_lengths
()[
0
])(
make_ParallelTensorFunctor
(
rmsnorm2d_fwd_func
,
invRms_m
.
mDesc
.
get_lengths
()[
0
])(
...
...
include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp
View file @
1b616990
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
5
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
...
...
include/ck_tile/ops/
welford
.hpp
→
include/ck_tile/ops/
batched_transpose
.hpp
View file @
1b616990
...
@@ -3,8 +3,9 @@
...
@@ -3,8 +3,9 @@
#pragma once
#pragma once
#include "ck_tile/ops/welford/block/block_welford.hpp"
#include "ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp"
#include "ck_tile/ops/welford/block/block_welford_problem.hpp"
#include "ck_tile/ops/batched_transpose/pipeline/batched_transpose_pipeline.hpp"
#include "ck_tile/ops/welford/thread/thread_welford.hpp"
#include "ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp"
#include "ck_tile/ops/batched_transpose/pipeline/batched_transpose_problem.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp
0 → 100644
View file @
1b616990
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/common.hpp"
#include "ck_tile/ops/elementwise.hpp"
#include "ck_tile/host/hip_check_error.hpp"
#include <string>
#include <type_traits>
namespace
ck_tile
{
/// Host-side arguments for BatchedTransposeKernel.
/// `GridSize` derives the launch grid from these values and `MakeKargs` copies the
/// tensor-related fields into the device-side argument struct.
struct BatchedTransposeHostArgs
{
    // Type-erased buffers; the kernel casts them to the problem's InputType.
    const void* p_input;
    void* p_output;

    // Number of matrices; becomes the z extent of the launch grid.
    index_t batch;
    // Input matrix lengths: the input view is (height, width), the output view (width, height).
    index_t height;
    index_t width;

    // index_t dim_blocks;

    // Element offset between consecutive batch entries (applied to both buffers).
    index_t dim_stride;
    // Divisors used by GridSize for the y (height) and x (width) grid extents.
    index_t dim_block_h;
    index_t dim_block_w;
};
/// Kernel wrapper that transposes every (height x width) matrix of a batch.
///
/// Each workgroup handles one (kMPerBlock x kNPerBlock) tile of one batch entry:
/// it builds a tile window on the input view and the matching transposed window on
/// the output view, then hands both to `Pipeline`.
///
/// @tparam Pipeline_ pipeline type; must expose a nested `Problem` providing
///         InputType, kBlockSize, k{M,N}PerBlock, k{M,N}PerThread and kPad{M,N}.
template <typename Pipeline_>
struct BatchedTransposeKernel
{
    using Pipeline = remove_cvref_t<Pipeline_>;
    using Problem  = remove_cvref_t<typename Pipeline::Problem>;

    using Type = typename Problem::InputType;

    /// Device-side arguments (subset of the host arguments needed by the kernel).
    struct BatchedTransposeKargs
    {
        const void* p_input;
        void* p_output;
        index_t batch;
        index_t height;
        index_t width;
        index_t dim_stride;
    };

    using Kargs = BatchedTransposeKargs;
    using Hargs = BatchedTransposeHostArgs;

    /// Launch grid: x covers `width` in steps of `dim_block_w`, y covers `height` in
    /// steps of `dim_block_h` (both rounded up), z is the batch count.
    // NOTE(review): operator() uses blockIdx.x as the M (height) tile index and
    // blockIdx.y as the N (width) tile index, whereas x is sized from width here.
    // The two agree only when both extents match — confirm the intended mapping.
    CK_TILE_HOST static constexpr auto GridSize(const Hargs& h)
    {
        size_t grid_size_x = (h.width + h.dim_block_w - 1) / h.dim_block_w;
        size_t grid_size_y = (h.height + h.dim_block_h - 1) / h.dim_block_h;
        size_t grid_size_z = h.batch;
        return dim3(grid_size_x, grid_size_y, grid_size_z);
    }

    /// Copies the fields the device code needs out of the host arguments.
    CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h)
    {
        Kargs k;
        k.p_input    = h.p_input;
        k.p_output   = h.p_output;
        k.batch      = h.batch;
        k.height     = h.height;
        k.width      = h.width;
        k.dim_stride = h.dim_stride;
        return k;
    }

    /// Threads per workgroup, as defined by the problem description.
    CK_TILE_HOST_DEVICE static constexpr auto BlockSize() { return Problem::kBlockSize; }

    /// Kernel body: transposes the tile owned by this workgroup.
    CK_TILE_DEVICE void operator()(Kargs kargs) const
    {
        static constexpr ck_tile::index_t kMPerBlock = Problem::kMPerBlock;
        static constexpr ck_tile::index_t kNPerBlock = Problem::kNPerBlock;
        static constexpr bool kPadM                  = Problem::kPadM;
        static constexpr bool kPadN                  = Problem::kPadN;

        static constexpr ck_tile::index_t kMPerThread = Problem::kMPerThread;
        static constexpr ck_tile::index_t kNPerThread = Problem::kNPerThread;

        // The pipeline currently copies one element per thread only.
        static_assert(kMPerThread == 1 && kNPerThread == 1);

        // Batch entry handled by this workgroup.
        const auto iDim = blockIdx.z;

        // Input view: (height, width) with strides (width, 1), offset to the batch
        // entry, then wrapped by pad_tensor_view with the block tile and pad flags.
        const auto x_m_n = [&]() {
            const auto x_dram_naive = make_naive_tensor_view<address_space_enum::global>(
                static_cast<const Type*>(kargs.p_input) + iDim * kargs.dim_stride,
                make_tuple(kargs.height, kargs.width),
                make_tuple(kargs.width, 1),
                number<kNPerThread>{}, // TODO thread load value
                number<1>{});

            return pad_tensor_view(x_dram_naive,
                                   make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}),
                                   sequence<kPadM, kPadN>{});
        }();

        // Element offsets of this workgroup's tile along M and N.
        const auto iM = __builtin_amdgcn_readfirstlane(blockIdx.x * kMPerBlock);
        const auto iN = __builtin_amdgcn_readfirstlane(blockIdx.y * kNPerBlock);

        // Output view: (width, height) with strides (height, 1), same batch offset.
        const auto y_n_m = [&]() {
            const auto y_dram_naive = make_naive_tensor_view<address_space_enum::global>(
                static_cast<Type*>(kargs.p_output) + iDim * kargs.dim_stride,
                make_tuple(kargs.width, kargs.height),
                make_tuple(kargs.height, 1),
                number<kMPerThread>{},
                number<1>{});

            return pad_tensor_view(y_dram_naive,
                                   make_tuple(number<kNPerBlock>{}, number<kMPerBlock>{}),
                                   sequence<kPadN, kPadM>{});
        }();

        // iM / iN are already element offsets (block index times tile size), so they
        // are the window origins as-is. Scaling them by the tile size a second time
        // would place every block with index > 0 at blockIdx * tile^2 and skip tiles.
        auto x_block_window =
            make_tile_window(x_m_n,
                             make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}),
                             {static_cast<ck_tile::index_t>(iM), static_cast<ck_tile::index_t>(iN)});

        // The output window uses the same offsets with M and N swapped.
        auto y_block_window =
            make_tile_window(y_n_m,
                             make_tuple(number<kNPerBlock>{}, number<kMPerBlock>{}),
                             {static_cast<ck_tile::index_t>(iN), static_cast<ck_tile::index_t>(iM)});

        Pipeline{}(x_block_window, y_block_window);
    }
};
}
// namespace ck_tile
include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_pipeline.hpp
0 → 100644
View file @
1b616990
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp"
#include <string>
#include <type_traits>
namespace
ck_tile
{
/// Block-level transpose pipeline: loads the input tile with the policy's input
/// distribution, copies it element by element into a tensor that uses the policy's
/// output distribution, and stores that tensor through the output window.
template <typename Problem_, typename Policy_ = BatchedTransposePolicy>
struct BatchedTransposePipeline
{
    // TODO: this kernel only support warp per row
    using Problem   = remove_cvref_t<Problem_>;
    using Policy    = remove_cvref_t<Policy_>;
    using InputType = ck_tile::remove_cvref_t<typename Problem::InputType>;

    // Tile geometry and padding flags forwarded from the problem description.
    static constexpr ck_tile::index_t kMPerBlock = Problem::kMPerBlock;
    static constexpr ck_tile::index_t kNPerBlock = Problem::kNPerBlock;
    static constexpr index_t AlignmentM          = Problem::AlignmentM;
    static constexpr index_t AlignmentN          = Problem::AlignmentN;
    static constexpr bool kPadM                  = Problem::kPadM;
    static constexpr bool kPadN                  = Problem::kPadN;

    /// @param input_window block-level window on the source tensor
    /// @param out_window   block-level window on the destination tensor
    template <typename InputWindow, typename OutputWindow>
    CK_TILE_DEVICE auto operator()(const InputWindow& input_window, OutputWindow& out_window)
    {
        // Attach the per-thread distributions to the block-level windows.
        auto src_window =
            make_tile_window(input_window, Policy::template MakeInputDistribution<Problem>());
        auto dst_window =
            make_tile_window(out_window, Policy::template MakeOutputDistribution<Problem>());

        // Per-thread part of the input tile.
        auto src_tile = load_tile(src_window);

        // Per-thread destination tensor, laid out with the output distribution.
        auto dst_tile = make_static_distributed_tensor<InputType>(
            Policy::template MakeOutputDistribution<Problem>());

        constexpr auto src_spans = decltype(src_tile)::get_distributed_spans();

        // NOTE(review): the same (inner, outer) index tuple addresses both tensors;
        // confirm this remains correct if per-thread sizes larger than 1 are enabled.
        sweep_tile_span(src_spans[number<0>{}], [&](auto outer_idx) {
            sweep_tile_span(src_spans[number<1>{}], [&](auto inner_idx) {
                constexpr auto swapped_idx = make_tuple(inner_idx, outer_idx);
                dst_tile(swapped_idx)      = src_tile(swapped_idx);
            });
        });

        store_tile(dst_window, dst_tile);
    }
};
}
// namespace ck_tile
include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp
0 → 100644
View file @
1b616990
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/softmax.hpp"
#include "ck_tile/ops/topk.hpp"
namespace
ck_tile
{
/// Default policy for the batched transpose pipeline: supplies the thread
/// distributions used for the input tile and for the transposed output tile.
/// Each per-dimension length sequence is (warps per block, threads per warp,
/// elements per thread), taken from the problem description.
struct BatchedTransposePolicy
{
    /// Distribution for the input tile: M lengths first, then N lengths.
    template <typename Problem>
    CK_TILE_HOST_DEVICE static constexpr auto MakeInputDistribution()
    {
        using MLengths =
            sequence<Problem::kMWarpPerBlock, Problem::kMThreadPerWarp, Problem::kMPerThread>;
        using NLengths =
            sequence<Problem::kNWarpPerBlock, Problem::kNThreadPerWarp, Problem::kNPerThread>;

        using Encoding = tile_distribution_encoding<sequence<>,
                                                    tuple<MLengths, NLengths>,
                                                    tuple<sequence<1, 2>, sequence<1, 2>>,
                                                    tuple<sequence<0, 0>, sequence<1, 1>>,
                                                    sequence<1, 2>,
                                                    sequence<2, 2>>;

        return make_static_tile_distribution(Encoding{});
    }

    /// Distribution for the output tile: N lengths first, then M lengths, with the
    /// dimension-selecting sequences swapped (2,1 instead of 1,2) to match.
    template <typename Problem>
    CK_TILE_HOST_DEVICE static constexpr auto MakeOutputDistribution()
    {
        using NLengths =
            sequence<Problem::kNWarpPerBlock, Problem::kNThreadPerWarp, Problem::kNPerThread>;
        using MLengths =
            sequence<Problem::kMWarpPerBlock, Problem::kMThreadPerWarp, Problem::kMPerThread>;

        using Encoding = tile_distribution_encoding<sequence<>,
                                                    tuple<NLengths, MLengths>,
                                                    tuple<sequence<2, 1>, sequence<2, 1>>,
                                                    tuple<sequence<0, 0>, sequence<1, 1>>,
                                                    sequence<2, 1>,
                                                    sequence<2, 2>>;

        return make_static_tile_distribution(Encoding{});
    }
};
}
// namespace ck_tile
include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_problem.hpp
0 → 100644
View file @
1b616990
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include <string>
#include <type_traits>
#define VectorLoadSize 16
namespace
ck_tile
{
/// Compile-time description of a batched transpose problem.
///
/// @tparam InputType_  element type (cv/ref qualifiers are stripped)
/// @tparam BlockTile   Sequence<M, N>: tile handled by one workgroup
/// @tparam WarpTile    Sequence<M, N>: tile handled by one warp
/// @tparam ThreadTile  Sequence<M, N>: tile handled by one thread
/// @tparam kPadM_      padding flag for the M dimension
/// @tparam kPadN_      padding flag for the N dimension
template <typename InputType_,
          typename BlockTile,  // Sequence<...
          typename WarpTile,   // Sequence<...
          typename ThreadTile, // Sequence<...
          bool kPadM_ = true,
          bool kPadN_ = true>
struct BatchedTransposeProblem
{
    using InputType = remove_cvref_t<InputType_>;

    // Tile extents per level, read from the Sequence<M, N> parameters.
    static constexpr index_t kMPerBlock  = BlockTile::at(number<0>{});
    static constexpr index_t kNPerBlock  = BlockTile::at(number<1>{});
    static constexpr index_t kMPerWarp   = WarpTile::at(number<0>{});
    static constexpr index_t kNPerWarp   = WarpTile::at(number<1>{});
    static constexpr index_t kMPerThread = ThreadTile::at(number<0>{});
    static constexpr index_t kNPerThread = ThreadTile::at(number<1>{});

    // How many units of each level fit into the next larger one.
    static constexpr index_t kMThreadPerWarp = kMPerWarp / kMPerThread;
    static constexpr index_t kNThreadPerWarp = kNPerWarp / kNPerThread;
    static constexpr index_t kMWarpPerBlock  = kMPerBlock / kMPerWarp;
    static constexpr index_t kNWarpPerBlock  = kNPerBlock / kNPerWarp;

    // Threads per workgroup: threads per warp times warps per block.
    static constexpr index_t kBlockSize =
        kMThreadPerWarp * kNThreadPerWarp * kMWarpPerBlock * kNWarpPerBlock;

    static constexpr bool kPadM = kPadM_;
    static constexpr bool kPadN = kPadN_;

    // Elements per VectorLoadSize-byte access when the pad flag is set, otherwise 1.
    static constexpr index_t AlignmentM = kPadM ? VectorLoadSize / sizeof(InputType) : 1; // TODO
    static constexpr index_t AlignmentN = kPadN ? VectorLoadSize / sizeof(InputType) : 1;
};
}
// namespace ck_tile
include/ck_tile/ops/common.hpp
View file @
1b616990
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
5
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
...
...
include/ck_tile/ops/elementwise.hpp
View file @
1b616990
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
5
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
...
...
include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
View file @
1b616990
...
@@ -719,7 +719,82 @@ struct Silu
...
@@ -719,7 +719,82 @@ struct Silu
constexpr
T
one
=
type_convert
<
T
>
(
1
);
constexpr
T
one
=
type_convert
<
T
>
(
1
);
y
=
x
*
(
one
/
(
one
+
ck_tile
::
exp
(
-
x
)));
y
=
x
*
(
one
/
(
one
+
ck_tile
::
exp
(
-
x
)));
};
};
// Two-lane float specialization: y[i] = x[i] * rcp(1 + exp(-x[i])) for i = 0, 1.
// Uses the __builtin_amdgcn_rcpf reciprocal builtin in place of the division
// performed by the generic overload.
template <>
CK_TILE_HOST_DEVICE void operator()<fp32x2_t>(fp32x2_t& y, const fp32x2_t& x) const
{
    constexpr auto unit = type_convert<float>(1);

    const auto denom_lo = unit + ck_tile::exp(-x[0]);
    y[0]                = x[0] * __builtin_amdgcn_rcpf(denom_lo);

    const auto denom_hi = unit + ck_tile::exp(-x[1]);
    y[1]                = x[1] * __builtin_amdgcn_rcpf(denom_hi);
}
};
#if 0
// Silu: the formula is not well suited to inline asm (each step depends on the previous one).
// The code is kept here on purpose in case someone wants to try it in the future.
// SiluAsm provides host overloads written in plain C++ and device overloads written
// as inline asm, for float and for fp32x2_t. This block is compiled out by `#if 0`.
struct SiluAsm
{
// Host, scalar: y = x * (1 / (1 + exp(-x))); only float is accepted.
template <typename T>
CK_TILE_HOST void operator()(T& y, T& x) const
{
static_assert(std::is_same_v<T, float>, "Data type is not supported by this operation!");
constexpr T one = type_convert<T>(1);
y = x * (one / (one + ck_tile::exp(-x)));
};
// Device, scalar: same computation expressed as a mul / exp / add / rcp / mul
// instruction sequence; only float is accepted.
template <typename T>
CK_TILE_DEVICE void operator()(T& y, T& x) const
{
static_assert(std::is_same_v<T, float>, "Data type is not supported by this operation!");
const uint32_t log2e_neg_ = 0x3fb8aa3b | 0x80000000; // bit pattern of log2e_v<float> * -1 (sign bit set)
// NOTE: x/y can't be in the same register before the inline asm.
// "+v" for y and "v" for x is not enough, x/y may still be put in the same register,
// so x is copied to a temporary that is passed as a read-write operand.
T tmp = x;
asm volatile("v_mul_f32 %[v_y], %[s_log2e], %[v_x]\n"
"v_exp_f32 %[v_y], %[v_y]\n"
"s_nop 0 ; hazard for exp\n"
"v_add_f32 %[v_y], %[v_y], 1.0\n"
"v_rcp_f32 %[v_y], %[v_y]\n"
"s_nop 0 ; hazard for rcp\n"
"v_mul_f32 %[v_y], %[v_x], %[v_y]\n"
: [v_y] "+v"(y), [v_x] "+v"(tmp)
: [s_log2e] "s"(log2e_neg_)
:);
};
// Host, two lanes: applies the scalar formula to x[0] and x[1] independently.
template <>
CK_TILE_HOST void operator()<fp32x2_t>(fp32x2_t& y, fp32x2_t& x) const
{
constexpr auto one = type_convert<float>(1);
y[0] = x[0] * (one / (one + ck_tile::exp(-x[0])));
y[1] = x[1] * (one / (one + ck_tile::exp(-x[1])));
};
// Device, two lanes: the scalar instruction sequence interleaved for both lanes.
template <>
CK_TILE_DEVICE void operator()<fp32x2_t>(fp32x2_t& y, fp32x2_t& x) const
{
const uint32_t log2e_neg_ = 0x3fb8aa3b | 0x80000000; // bit pattern of log2e_v<float> * -1 (sign bit set)
// NOTE: x/y can't be in the same register before the inline asm.
// NOTE(review): the temporaries are commented out here, unlike the scalar overload,
// and x[0]/x[1] are passed directly as read-write operands — confirm this is intended.
// float tmp0 = x[0], tmp1 = x[1];
asm volatile("v_mul_f32 %[v_y0], %[s_log2e], %[v_x0]\n"
"v_mul_f32 %[v_y1], %[s_log2e], %[v_x1]\n"
"v_exp_f32 %[v_y0], %[v_y0]\n"
"v_exp_f32 %[v_y1], %[v_y1]\n"
"v_add_f32 %[v_y0], %[v_y0], 1.0\n"
"v_add_f32 %[v_y1], %[v_y1], 1.0\n"
"v_rcp_f32 %[v_y0], %[v_y0]\n"
"v_rcp_f32 %[v_y1], %[v_y1]\n"
"v_mul_f32 %[v_y0], %[v_x0], %[v_y0]\n"
"v_mul_f32 %[v_y1], %[v_x1], %[v_y1]\n"
: [v_y0] "+v"(y[0]), [v_y1] "+v"(y[1]), [v_x0] "+v"(x[0]), [v_x1] "+v"(x[1])
: [s_log2e] "s"(log2e_neg_)
:);
};
};
};
#endif
struct
TanH
struct
TanH
{
{
...
...
include/ck_tile/ops/epilogue.hpp
View file @
1b616990
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
5
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
...
...
Prev
1
…
12
13
14
15
16
17
18
19
20
…
28
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment