gaoqiong / composable_kernel_ROCM · Commit 8d2f2f8c

Merge branch 'develop' into ck_tile/gemm_debug_alias

Authored Dec 05, 2024 by coderfeli
Parents: 99c8123f, 4cb3d7d7 · Changes: 91

Showing 20 changed files with 1723 additions and 134 deletions (+1723 −134)
Changed files on this page:

- example/ck_tile/17_grouped_gemm/grouped_gemm.hpp (+53 −0)
- example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc (+191 −0)
- example/ck_tile/17_grouped_gemm/utils.hpp (+38 −0)
- example/ck_tile/CMakeLists.txt (+1 −0)
- include/ck/README.md (+19 −0)
- include/ck/library/utility/host_tensor.hpp (+1 −1)
- include/ck/library/utility/host_tensor_generator.hpp (+25 −6)
- include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp (+1 −2)
- include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp (+6 −6)
- include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp (+5 −5)
- include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp (+4 −4)
- include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp (+6 −6)
- include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp (+5 −5)
- include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp (+1 −2)
- include/ck/utility/amd_buffer_addressing.hpp (+8 −6)
- include/ck/utility/amd_ck_fp8.hpp (+988 −0)
- include/ck/utility/amd_xdlops.hpp (+1 −1)
- include/ck/utility/data_type.hpp (+360 −83)
- include/ck/utility/math_v2.hpp (+2 −2)
- include/ck/utility/random_gen.hpp (+8 −5)
example/ck_tile/17_grouped_gemm/grouped_gemm.hpp (new file, mode 100644)
```cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <string>

#include "ck_tile/core.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"

template <typename DataType>
struct GemmBasicTypeConfig;

template <>
struct GemmBasicTypeConfig<ck_tile::half_t>
{
    using ADataType   = ck_tile::half_t;
    using BDataType   = ck_tile::half_t;
    using CDataType   = ck_tile::half_t;
    using AccDataType = float;
};

using Types = GemmBasicTypeConfig<ck_tile::half_t>;

// Specific type aliases for easy access
using ADataType   = Types::ADataType;
using BDataType   = Types::BDataType;
using AccDataType = Types::AccDataType;
using CDataType   = Types::CDataType;

using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs;

auto create_args(int argc, char* argv[])
{
    ck_tile::ArgParser arg_parser;
    arg_parser.insert("a_layout", "R", "A tensor data layout - Row by default")
        .insert("b_layout", "R", "B tensor data layout - Row by default")
        .insert("c_layout", "R", "C tensor data layout - Row by default")
        .insert("validate", "1", "0. No validation, 1. Validation on CPU")
        .insert("warmup", "10", "number of iterations before benchmarking the kernel")
        .insert("repeat", "100", "number of iterations to benchmark the kernel")
        .insert("group_count", "16", "group count");

    bool result = arg_parser.parse(argc, argv);
    return std::make_tuple(result, arg_parser);
}

std::size_t GetWorkspaceSize(const std::vector<grouped_gemm_kargs>& gemm_descs);

float grouped_gemm_calc(const std::vector<grouped_gemm_kargs>& gemm_descs,
                        const ck_tile::stream_config& s,
                        void* p_workspace_);
```
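For context, a minimal sketch of how this header is typically driven. The `main()` below is an assumption modeled on the other ck_tile examples; only `create_args` and `run_grouped_gemm_example` come from the files in this commit, and the example's real translation unit is not shown on this page.

```cpp
// Hypothetical driver, not part of this diff: the actual main() lives in the
// example's .cpp file, which this page does not display.
#include "grouped_gemm.hpp"
#include "run_grouped_gemm_example.inc"

int main(int argc, char* argv[])
{
    // run_grouped_gemm_example returns the pass flag (1 on success),
    // so negate it to produce a conventional process exit code.
    return !run_grouped_gemm_example(argc, argv);
}
```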
example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc (new file, mode 100644)
```cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

template <typename ALayout, typename BLayout, typename CLayout>
float invoke_gemm(int n_warmup,
                  int n_repeat,
                  int group_count,
                  const std::vector<grouped_gemm_kargs>& args)
{
    ck_tile::DeviceMem gemm_workspace;
    gemm_workspace.Realloc(GetWorkspaceSize(args));

    float ave_time = grouped_gemm<ALayout, BLayout, CLayout>(
        args,
        ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat},
        gemm_workspace.GetDeviceBuffer());

    std::string op_name{"Grouped Gemm"};

    std::size_t flop = 0, num_btype = 0;
    for(int j = 0; j < group_count; ++j)
    {
        flop += std::size_t(2) * args[j].M * args[j].N * args[j].K;

        num_btype += sizeof(ADataType) * args[j].M * args[j].K +
                     sizeof(BDataType) * args[j].K * args[j].N +
                     sizeof(CDataType) * args[j].M * args[j].N;
    }

    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_btype / 1.E6 / ave_time;

    std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
              << gb_per_sec << " GB/s, " << op_name << std::endl;

    return ave_time;
}

template <typename ALayout, typename BLayout, typename CLayout>
int run_grouped_gemm_example_with_layouts(int argc,
                                          char* argv[],
                                          const ALayout a_layout                  = ALayout{},
                                          const BLayout b_layout                  = BLayout{},
                                          [[maybe_unused]] const CLayout c_layout = CLayout{})
{
    auto [result, arg_parser] = create_args(argc, argv);
    if(!result)
    {
        return -1;
    }

    const int group_count = arg_parser.get_int("group_count");
    const int repeat      = arg_parser.get_int("repeat");
    const int warmup      = arg_parser.get_int("warmup");

    std::vector<ck_tile::index_t> Ms;
    std::vector<ck_tile::index_t> Ns;
    std::vector<ck_tile::index_t> Ks;
    std::vector<ck_tile::index_t> stride_As;
    std::vector<ck_tile::index_t> stride_Bs;
    std::vector<ck_tile::index_t> stride_Cs;

    for(int i = 0; i < group_count; i++)
    {
        Ms.push_back(256 + 256 * i);
        Ns.push_back(128 + 128 * i);
        Ks.push_back(128 + 64 * i);

        stride_As.push_back(Ks[i]);
        stride_Bs.push_back(Ks[i]);
        stride_Cs.push_back(Ns[i]);
    }

    std::vector<ck_tile::HostTensor<ADataType>> a_m_k_tensors;
    std::vector<ck_tile::HostTensor<BDataType>> b_k_n_tensors;
    std::vector<ck_tile::HostTensor<CDataType>> c_m_n_tensors;

    a_m_k_tensors.reserve(group_count);
    b_k_n_tensors.reserve(group_count);
    c_m_n_tensors.reserve(group_count);

    std::vector<std::unique_ptr<ck_tile::DeviceMem>> a_m_k_dev_buf;
    std::vector<std::unique_ptr<ck_tile::DeviceMem>> b_k_n_dev_buf;
    std::vector<std::unique_ptr<ck_tile::DeviceMem>> c_m_n_dev_buf;

    a_m_k_dev_buf.reserve(group_count);
    b_k_n_dev_buf.reserve(group_count);
    c_m_n_dev_buf.reserve(group_count);

    std::vector<grouped_gemm_kargs> gemm_descs;
    gemm_descs.reserve(group_count);

    for(int i = 0; i < group_count; ++i)
    {
        const ck_tile::index_t M = Ms[i];
        const ck_tile::index_t N = Ns[i];
        const ck_tile::index_t K = Ks[i];

        stride_As[i] = f_get_default_stride(M, N, stride_As[i], a_layout);
        stride_Bs[i] = f_get_default_stride(K, N, stride_Bs[i], b_layout);
        stride_Cs[i] = f_get_default_stride(M, N, stride_Cs[i], CLayout{});

        a_m_k_tensors.push_back(ck_tile::HostTensor<ADataType>(
            f_host_tensor_descriptor(M, K, stride_As[i], a_layout)));
        b_k_n_tensors.push_back(ck_tile::HostTensor<BDataType>(
            f_host_tensor_descriptor(K, N, stride_Bs[i], b_layout)));
        c_m_n_tensors.push_back(ck_tile::HostTensor<CDataType>(
            f_host_tensor_descriptor(M, N, stride_Cs[i], CLayout{})));

        std::cout << "gemm[" << i << "]"
                  << " a_m_k: " << a_m_k_tensors[i].mDesc
                  << " b_k_n: " << b_k_n_tensors[i].mDesc
                  << " c_m_n: " << c_m_n_tensors[i].mDesc << std::endl;

        ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k_tensors[i]);
        ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n_tensors[i]);

        a_m_k_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
            a_m_k_tensors[i].get_element_space_size_in_bytes()));
        b_k_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
            b_k_n_tensors[i].get_element_space_size_in_bytes()));
        c_m_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
            c_m_n_tensors[i].get_element_space_size_in_bytes()));

        a_m_k_dev_buf[i]->ToDevice(a_m_k_tensors[i].data());
        b_k_n_dev_buf[i]->ToDevice(b_k_n_tensors[i].data());
        c_m_n_dev_buf[i]->SetZero();
        c_m_n_tensors[i].SetZero();

        const void* p_a = a_m_k_dev_buf[i]->GetDeviceBuffer();
        const void* p_b = b_k_n_dev_buf[i]->GetDeviceBuffer();
        void* p_c       = c_m_n_dev_buf[i]->GetDeviceBuffer();

        gemm_descs.push_back(
            {p_a, p_b, p_c, M, N, K, stride_As[i], stride_Bs[i], stride_Cs[i]});
    }

    invoke_gemm<ALayout, BLayout, CLayout>(warmup, repeat, group_count, gemm_descs);

    for(int i = 0; i < group_count; i++)
    {
        c_m_n_dev_buf[i]->FromDevice(c_m_n_tensors[i].data());
    }

    bool pass{true};
    if(arg_parser.get_int("validate"))
    {
        for(int i = 0; i < group_count; ++i)
        {
            ck_tile::HostTensor<CDataType> c_m_n_host_ref(
                f_host_tensor_descriptor(Ms[i], Ns[i], stride_Cs[i], CLayout{}));
            c_m_n_host_ref.SetZero();

            ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
                a_m_k_tensors[i], b_k_n_tensors[i], c_m_n_host_ref);

            pass &= ck_tile::check_err(c_m_n_tensors[i], c_m_n_host_ref);
        }
        std::cout << "The CPU verification result is: " << (pass ? "correct" : "fail")
                  << std::endl;
    }

    return pass;
}

int run_grouped_gemm_example(int argc, char* argv[])
{
    auto [result, arg_parser] = create_args(argc, argv);
    if(!result)
    {
        return -1;
    }

    const std::string a_layout = arg_parser.get_str("a_layout");
    const std::string b_layout = arg_parser.get_str("b_layout");

    using Row = ck_tile::tensor_layout::gemm::RowMajor;
    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;

    if(a_layout == "R" && b_layout == "C")
    {
        return run_grouped_gemm_example_with_layouts(argc, argv, Row{}, Col{}, Row{});
    }
    else if(a_layout == "R" && b_layout == "R")
    {
        return run_grouped_gemm_example_with_layouts(argc, argv, Row{}, Row{}, Row{});
    }
    else
    {
        throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!");
    }
}
```
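The throughput numbers printed by `invoke_gemm` follow the usual 2·M·N·K FLOP model, with `ave_time` measured in milliseconds. A standalone sanity check of that arithmetic for the first group generated above; the kernel time is a hypothetical value, everything else follows the formulas in the file:

```cpp
#include <cstddef>
#include <cstdio>

int main()
{
    const std::size_t M = 256, N = 128, K = 128; // group 0 of the example

    const std::size_t flop  = 2 * M * N * K;                     // 8,388,608
    const std::size_t bytes = 2 * M * K + 2 * K * N + 2 * M * N; // 163,840 (sizeof(half_t) == 2)

    const float ave_time_ms = 0.01f; // hypothetical kernel time

    std::printf("TFlops: %f\n", flop / 1.e9 / ave_time_ms);  // ~0.84
    std::printf("GB/s:   %f\n", bytes / 1.e6 / ave_time_ms); // ~16.4
    return 0;
}
```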
example/ck_tile/17_grouped_gemm/utils.hpp (new file, mode 100644)
```cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

template <typename TLayout>
constexpr auto
f_host_tensor_descriptor(std::size_t row, std::size_t col, std::size_t stride, TLayout layout)
{
    using namespace ck_tile::literals;

    if constexpr(std::is_same_v<decltype(layout), ck_tile::tensor_layout::gemm::RowMajor>)
    {
        return ck_tile::HostTensorDescriptor({row, col}, {stride, 1_uz});
    }
    else
    {
        return ck_tile::HostTensorDescriptor({row, col}, {1_uz, stride});
    }
}

template <typename TLayout>
constexpr auto
f_get_default_stride(std::size_t row, std::size_t col, std::size_t stride, TLayout layout)
{
    if(stride == 0)
    {
        if constexpr(std::is_same_v<decltype(layout), ck_tile::tensor_layout::gemm::RowMajor>)
        {
            return col;
        }
        else
        {
            return row;
        }
    }
    else
        return stride;
}
```
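Concretely, a `stride` of 0 asks for the packed default: the leading dimension becomes the column count for row-major data and the row count for column-major data, and the descriptor then pairs that stride with a unit stride on the contiguous axis. A small check of that behavior, assuming the two helpers above and the ck_tile headers are on the include path:

```cpp
#include <cassert>

int main()
{
    using Row = ck_tile::tensor_layout::gemm::RowMajor;
    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;

    // stride == 0 selects the packed default...
    assert(f_get_default_stride(256, 128, 0, Row{}) == 128); // ld = columns
    assert(f_get_default_stride(256, 128, 0, Col{}) == 256); // ld = rows
    // ...while an explicit stride is passed through untouched.
    assert(f_get_default_stride(256, 128, 512, Row{}) == 512);

    // Row-major 256x128 with stride 128 -> lengths {256, 128}, strides {128, 1}.
    auto desc = f_host_tensor_descriptor(256, 128, 128, Row{});
    (void)desc;
    return 0;
}
```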
example/ck_tile/CMakeLists.txt

```diff
@@ -16,3 +16,4 @@ add_subdirectory(13_moe_sorting)
 add_subdirectory(14_moe_smoothquant)
 add_subdirectory(15_fused_moe)
 add_subdirectory(16_batched_gemm)
+add_subdirectory(17_grouped_gemm)
```
include/ck/README.md (new file, mode 100644)

```markdown
[Back to the main page](../../README.md)

# Composable Kernel supported operations

## Supported device operations

* [Average pooling]()
* [Batched contraction]()
* [Batched gemm]()
* [Batchnorm]()
* [CGEMM]()
* [Contraction]()
* [Convolution]()
* [Image to Column and Column to Image]()
* [Elementwise]()
* [GEMM]()
* [Max pooling]()
* [Reduce]()
* [Normalization]()
* [Permute]()
* [Put]()
* [Softmax]()
```
include/ck/library/utility/host_tensor.hpp

```diff
@@ -326,7 +326,7 @@ struct Tensor
     std::size_t GetElementSpaceSizeInBytes() const { return sizeof(T) * GetElementSpaceSize(); }

-    void SetZero() { ck::ranges::fill<T>(mData, 0); }
+    void SetZero() { ck::ranges::fill<T>(mData, T{0}); }

     template <typename F>
     void ForEach_impl(F&& f, std::vector<size_t>& idx, size_t rank)
```
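The `SetZero()` change looks cosmetic but matters once `T` is a type whose construction from `int` is explicit, as the fp8 types introduced elsewhere in this merge presumably are: `fill<T>(mData, 0)` relies on an implicit `int → T` conversion at the assignment, while `T{0}` constructs the fill value up front. A self-contained illustration of the failure mode; `F8` here is a stand-in with an explicit constructor, not the real ck type:

```cpp
#include <algorithm>
#include <vector>

struct F8 // stand-in for an fp8-like type with only explicit conversions (assumption)
{
    explicit F8(int) {}
};

int main()
{
    std::vector<F8> v(4, F8{0});
    // std::fill(v.begin(), v.end(), 0);   // would not compile: no implicit int -> F8
    std::fill(v.begin(), v.end(), F8{0});  // fine: the value is constructed explicitly
    return 0;
}
```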
include/ck/library/utility/host_tensor_generator.hpp

```diff
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
@@ -37,7 +37,7 @@ struct GeneratorTensor_1<ck::half_t>
     float value = 1.0;

     template <typename... Is>
-    ck::bhalf_t operator()(Is...)
+    ck::half_t operator()(Is...)
     {
         return ck::type_convert<ck::half_t>(value);
     }
@@ -62,7 +62,7 @@ struct GeneratorTensor_1<ck::f8_t>
     float value = 1.0;

     template <typename... Is>
-    ck::bhalf_t operator()(Is...)
+    ck::f8_t operator()(Is...)
     {
         return ck::type_convert<ck::f8_t>(value);
     }
@@ -256,14 +256,33 @@ struct GeneratorTensor_Checkboard
     }
 };

-template <ck::index_t Dim>
+/**
+ * @brief Is used to generate sequential values based on the specified dimension.
+ *
+ * @tparam T The type of the tensor values.
+ * @tparam Dim The specific dimension used for generation.
+ *
+ * GeneratorTensor_Sequential<1>{} will generate the following values for a 3x3 tensor:
+ *
+ *  0 1 2
+ *  0 1 2
+ *  0 1 2
+ *
+ * Essentially, the values generated are logical coordinates of the generated element that
+ * correspond to dimension Dim. E.g. for 2-dimensional tensor and Dim=1, the values are the column
+ * indices.
+ *
+ */
+template <typename T, ck::index_t Dim>
 struct GeneratorTensor_Sequential
 {
     template <typename... Ts>
-    float operator()(Ts... Xs) const
+    T operator()(Ts... Xs) const
     {
         std::array<ck::index_t, sizeof...(Ts)> dims = {{static_cast<ck::index_t>(Xs)...}};
-        return dims[Dim];
+        float tmp = dims[Dim];
+        return ck::type_convert<T>(tmp);
     }
 };
```
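A usage sketch of the now-typed generator, following the doc comment added above. It assumes the generators live in the global namespace, as they are used unqualified elsewhere in CK's examples, and that the patched header's dependencies are on the include path:

```cpp
#include <cstdio>

#include "ck/library/utility/host_tensor_generator.hpp"

int main()
{
    // Dim = 1: every element takes its column index, converted to T.
    GeneratorTensor_Sequential<float, 1> gen{};

    for(int r = 0; r < 3; ++r)
    {
        for(int c = 0; c < 3; ++c)
            std::printf("%.0f ", gen(r, c)); // prints: 0 1 2
        std::printf("\n");
    }
    return 0;
}
```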
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp

```diff
@@ -111,8 +111,7 @@ __global__ void
         [[maybe_unused]] const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
         [[maybe_unused]] const index_t num_k_per_block)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     // offset base pointer for each work-group
     const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge);
     const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
```
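The replacement guard leans on an umbrella macro rather than enumerating every gfx9 target. Presumably, and this is an assumption about ck.hpp, which this page does not show, it is defined along the same lines as the `__gfx94__` macro visible in the amd_xdlops.hpp diff further down:

```cpp
// Assumed definition of the umbrella macro (modeled on __gfx94__ below;
// the real definition lives in a header not shown in this diff):
#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || \
    defined(__gfx941__) || defined(__gfx942__)
#define __gfx9__
#endif
```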
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp

```diff
 #pragma once
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
@@ -603,11 +603,11 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm<ALayout,
             }

-            hipGetErrorString(hipMemcpyWithStream(arg.p_workspace_,
-                                                  arg.gemm_desc_kernel_arg_.data(),
-                                                  arg.gemm_desc_kernel_arg_.size() * sizeof(GemmKernelArg),
-                                                  hipMemcpyHostToDevice,
-                                                  stream_config.stream_id_));
+            hipGetErrorString(hipMemcpyAsync(arg.p_workspace_,
+                                             arg.gemm_desc_kernel_arg_.data(),
+                                             arg.gemm_desc_kernel_arg_.size() * sizeof(GemmKernelArg),
+                                             hipMemcpyHostToDevice,
+                                             stream_config.stream_id_));

             auto launch_kernel = [&](auto has_main_k_block_loop,
                                      auto has_double_tail_k_block_loop) {
```
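This same `hipMemcpyWithStream → hipMemcpyAsync` substitution recurs in several device ops below. The two calls take identical arguments and both order the copy on the given stream; the difference is host-side blocking: `hipMemcpyWithStream` returns only after the copy completes, while `hipMemcpyAsync` merely enqueues it. A minimal standalone sketch of the resulting pattern, with hypothetical names and error handling reduced to asserts:

```cpp
#include <cassert>
#include <hip/hip_runtime.h>

__global__ void consume(const int* p) { (void)p; } // placeholder kernel

int main()
{
    int host_args[16] = {};
    int* dev_args     = nullptr;
    hipStream_t stream;
    assert(hipStreamCreate(&stream) == hipSuccess);
    assert(hipMalloc(&dev_args, sizeof(host_args)) == hipSuccess);

    // Enqueue the copy; unlike hipMemcpyWithStream, the host is not blocked here.
    assert(hipMemcpyAsync(dev_args, host_args, sizeof(host_args),
                          hipMemcpyHostToDevice, stream) == hipSuccess);

    // Work on the same stream is ordered after the copy, so this launch is safe.
    consume<<<1, 1, 0, stream>>>(dev_args);

    assert(hipStreamSynchronize(stream) == hipSuccess);
    assert(hipFree(dev_args) == hipSuccess);
    return 0;
}
```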
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp

```diff
@@ -761,11 +761,11 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
             float time{0.f};

-            hip_check_error(hipMemcpyWithStream(dev_gemm_kargs,
-                                                arg.gemm_kernel_args_.data(),
-                                                arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
-                                                hipMemcpyHostToDevice,
-                                                stream_config.stream_id_));
+            hip_check_error(hipMemcpyAsync(dev_gemm_kargs,
+                                           arg.gemm_kernel_args_.data(),
+                                           arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
+                                           hipMemcpyHostToDevice,
+                                           stream_config.stream_id_));

             auto preprocess = [&]() {
                 hip_check_error(hipMemsetAsync(
```
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp

```diff
@@ -940,10 +940,10 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop
                                      const void* p_host_kernel_args) const
     {
         arg.p_dev_gemm_args_ = p_dev_kernel_args;
-        hip_check_error(hipMemcpy(p_dev_kernel_args,
-                                  p_host_kernel_args,
-                                  GetDeviceKernelArgSize(&arg),
-                                  hipMemcpyHostToDevice));
+        hip_check_error(hipMemcpyAsync(p_dev_kernel_args,
+                                       p_host_kernel_args,
+                                       GetDeviceKernelArgSize(&arg),
+                                       hipMemcpyHostToDevice));
     }

     virtual void SetDeviceKernelArgs(BaseArgument* p_arg,
```
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp

```diff
@@ -557,12 +557,12 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
                 }
             }

-            hipGetErrorString(hipMemcpyWithStream(arg.p_workspace_,
-                                                  arg.gemm_desc_kernel_arg_.data(),
-                                                  arg.gemm_desc_kernel_arg_.size() * sizeof(GemmBiasTransKernelArg),
-                                                  hipMemcpyHostToDevice,
-                                                  stream_config.stream_id_));
+            hipGetErrorString(hipMemcpyAsync(arg.p_workspace_,
+                                             arg.gemm_desc_kernel_arg_.data(),
+                                             arg.gemm_desc_kernel_arg_.size() * sizeof(GemmBiasTransKernelArg),
+                                             hipMemcpyHostToDevice,
+                                             stream_config.stream_id_));

             float ave_time = 0;
```
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp

```diff
@@ -421,11 +421,11 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
             }

-            hip_check_error(hipMemcpyWithStream(arg.p_workspace_,
-                                                arg.gemm_kernel_args_.data(),
-                                                arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
-                                                hipMemcpyHostToDevice,
-                                                stream_config.stream_id_));
+            hip_check_error(hipMemcpyAsync(arg.p_workspace_,
+                                           arg.gemm_kernel_args_.data(),
+                                           arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
+                                           hipMemcpyHostToDevice,
+                                           stream_config.stream_id_));

             float ave_time = 0;
```
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp

```diff
@@ -38,8 +38,7 @@ __global__ void
         // __attribute__((amdgpu_waves_per_eu(1, 1)))
         kernel_gemm_xdl_cshuffle_v3(typename GridwiseGemm::Argument karg)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

     GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
```
include/ck/utility/amd_buffer_addressing.hpp

```diff
@@ -549,8 +549,10 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
         (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
         (is_same<T, bhalf_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
         (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-        (is_same<T, f8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-        (is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+        (is_same<T, f8_fnuz_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+        (is_same<T, bf8_fnuz_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+        (is_same<T, fp8_storage_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
         (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
         "wrong! not implemented");
@@ -843,8 +845,8 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,
 #else
-    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size, coherence>(
-        src_wave_buffer_resource, src_thread_addr_offset, 0);
+    vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
+        src_wave_buffer_resource, src_thread_addr_offset, 0)};
     return src_thread_element_valid ? tmp : vector_t(0);
 #endif
 }
@@ -873,8 +875,8 @@ amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,
     constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

-    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size, coherence>(
-        src_wave_buffer_resource, src_thread_addr_offset, 0);
+    vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
+        src_wave_buffer_resource, src_thread_addr_offset, 0)};
     return src_thread_element_valid ? tmp : vector_t(customized_value);
 }
```
include/ck/utility/amd_ck_fp8.hpp (new file, mode 100644)

Diff collapsed on this page (+988 −0).
include/ck/utility/amd_xdlops.hpp

```diff
@@ -4,7 +4,7 @@
 #pragma once

 namespace ck {

-// Define the common macro for gfx94x models
+// Define the common macro for MI300 models
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
 #define __gfx94__
 #endif
```
include/ck/utility/data_type.hpp

Diff collapsed on this page (+360 −83).
include/ck/utility/math_v2.hpp

```diff
@@ -80,7 +80,7 @@ static inline __host__ bool isnan(half_t x)
     return (xx & 0x7FFF) > 0x7C00;
 };

-static inline __host__ bool isnan(f8_t x) { return (x & 0x80); };
+static inline __host__ bool isnan(f8_t x) { return ck::fp8_is_nan(x); };

 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 static inline __host__ bool isnan(int4_t x)
@@ -531,7 +531,7 @@ static inline __device__ bool isnan(half_t x)
     return (xx & 0x7FFF) > 0x7C00;
 };

-static inline __device__ bool isnan(f8_t x) { return (x & 0x80); };
+static inline __device__ bool isnan(f8_t x) { return ck::fp8_is_nan(x); };

 static inline __device__ half_t sqrt(half_t x)
 {
```
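The old test `(x & 0x80)` is simply "sign bit set", so every negative fp8 value counted as NaN; delegating to `ck::fp8_is_nan` lets the check match whichever encoding `f8_t` actually uses. A sketch of what a correct check has to distinguish. The bit layouts below are the standard FNUZ and OCP e4m3 conventions, not code taken from `ck::fp8_is_nan`:

```cpp
#include <cassert>
#include <cstdint>

// FNUZ e4m3: the single NaN encoding is 0x80 (sign bit set, all other bits zero).
// OCP  e4m3: NaN is s.1111.111, i.e. 0x7F or 0xFF.
inline bool fnuz_e4m3_is_nan(uint8_t x) { return x == 0x80; }
inline bool ocp_e4m3_is_nan(uint8_t x) { return (x & 0x7F) == 0x7F; }

int main()
{
    // (x & 0x80) would flag 0x81, a small negative subnormal, as NaN; it is not.
    assert(fnuz_e4m3_is_nan(0x80) && !fnuz_e4m3_is_nan(0x81));
    assert(ocp_e4m3_is_nan(0x7F) && ocp_e4m3_is_nan(0xFF) && !ocp_e4m3_is_nan(0x80));
    return 0;
}
```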
include/ck/utility/random_gen.hpp

```diff
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

+#include "ck/ck.hpp"
+
 namespace ck {

 // Pseudo random number generator
@@ -23,7 +25,7 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed =
 }

 // version for fp16
-template <typename T, uint32_t seed_t, std::enable_if_t<std::is_same<half_t, T>{}, bool> = false>
+template <typename T, uint32_t seed_t, std::enable_if_t<std::is_same<_Float16, T>{}, bool> = false>
 __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t)
 {
     uint16_t x = *(reinterpret_cast<uint16_t*>(&val));
@@ -38,9 +40,10 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed =
 }

 // return 0 if data is not fp16 or fp32
 template <typename T,
           uint32_t seed_t,
-          std::enable_if_t<!(std::is_same<float, T>{} || std::is_same<half_t, T>{}), bool> = false>
+          std::enable_if_t<!(std::is_same<float, T>{} || std::is_same<_Float16, T>{}),
+                           bool> = false>
 __host__ __device__ uint32_t prand_generator(int id, T val, uint32_t seed = seed_t)
 {
     std::ignore = id;
```
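The `enable_if` dispatch now matches the compiler builtin `_Float16` instead of `half_t`. Assuming `half_t` is the usual CK alias of that builtin, the two are the same type and callers see no behavioral change; matching the builtin just removes this header's dependence on the data-type header, with the newly added `ck/ck.hpp` include supplying the core configuration macros instead. A one-line check of that assumption:

```cpp
#include <type_traits>

#include "ck/utility/data_type.hpp"

// Assumption being relied on: half_t is an alias of the builtin _Float16, so
// dispatching on _Float16 is equivalent for every caller.
static_assert(std::is_same<ck::half_t, _Float16>::value, "half_t == _Float16");
```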
The remaining changed files of this commit continue on pages 2–5.