gaoqiong / composable_kernel_ROCM · Commits

Commit 7977f89d
Authored Nov 07, 2024 by carlushuang

    Merge remote-tracking branch 'origin/develop' into ck_tile/moe_quant

Parents: 45131629, 3599418a
Changes: 17
Showing 17 changed files with 90 additions and 77 deletions (+90 -77)
CMakeLists.txt                                                                                             +4  -4
Jenkinsfile                                                                                                +3  -3
example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp                              +3  -2
example/CMakeLists.txt                                                                                     +4  -4
example/ck_tile/01_fmha/generate.py                                                                        +3  -0
example/ck_tile/02_layernorm2d/generate.py                                                                 +1  -1
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp    +6  -6
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp      +12 -12
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp       +6  -6
include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp                            +12 -12
include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp                                         +4  -4
library/include/ck/library/utility/check_err.hpp                                                           +4  -4
library/src/tensor_operation_instance/gpu/CMakeLists.txt                                                   +5  -5
library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt                                               +0  -5
profiler/include/profiler/profile_pool3d_fwd_impl.hpp                                                      +16 -2
profiler/src/profile_layernorm_fwd.cpp                                                                     +1  -1
test/CMakeLists.txt                                                                                        +6  -6
CMakeLists.txt

@@ -145,20 +145,20 @@ message("hip_version_flat=${hip_VERSION_FLAT}")
 message("checking which targets are supported")
 #In order to build just the CK library (without tests and examples) for all supported GPU targets
-#use -D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
+#use -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
 #the GPU_TARGETS flag will be reset in this case in order to avoid conflicts.
 #
 #In order to build CK along with all tests and examples it should be OK to set GPU_TARGETS to just 1 or 2 similar architectures.
 if(NOT ENABLE_ASAN_PACKAGING)
     if(NOT WIN32 AND ${hip_VERSION_FLAT} LESS 600300000)
         # WORKAROUND: compiler does not yet fully support gfx12 targets, need to fix version above
-        set(CK_GPU_TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
+        set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
    else()
-        set(CK_GPU_TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201")
+        set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201")
    endif()
 else()
     #build CK only for xnack-supported targets when using ASAN
-    set(CK_GPU_TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx940:xnack+;gfx941:xnack+;gfx942:xnack+")
+    set(CK_GPU_TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+")
 endif()
 #if user set GPU_ARCHS on the cmake command line, overwrite default target list with user's list
Jenkinsfile

@@ -1101,11 +1101,11 @@ pipeline {
         agent{ label rocmnode("gfx90a") }
         environment{
             setup_args = """ -DCMAKE_INSTALL_PREFIX=../install \
-                -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" \
+                -DGPU_TARGETS="gfx908;gfx90a;gfx942" \
                 -DCMAKE_CXX_FLAGS=" -O3 " """
             execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
                 cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
-                -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" \
+                -DGPU_TARGETS="gfx908;gfx90a;gfx942" \
                 -DCMAKE_CXX_COMPILER="${build_compiler()}" \
                 -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
         }
@@ -1165,7 +1165,7 @@ pipeline {
             execute_args = """ cmake -D CMAKE_PREFIX_PATH=/opt/rocm \
                 -D CMAKE_CXX_COMPILER="${build_compiler()}" \
                 -D CMAKE_BUILD_TYPE=Release \
-                -D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" \
+                -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" \
                 -D CMAKE_CXX_FLAGS=" -O3 " .. && make -j64 """
         }
         steps{
example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp

@@ -68,7 +68,7 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
 using DeviceReduceInstance =
     ck::tensor_operation::device::DeviceReduceMultiBlock<OutputDataType,
-                                                         OutputDataType,
+                                                         ScaleDataType,
                                                          OutputDataType,
                                                          NumDim,
                                                          NumDim,
@@ -108,7 +108,8 @@ void reference_scale_permute_amax(Tensor<InputDataType>& input,
             host_output_scaled_casted_transposed(m, k) = y1;
             const OutputDataType y_fabs =
                 ck::type_convert<OutputDataType>(ck::math::abs(ck::type_convert<float>(y0)));
-            host_output_amax(0) = ck::math::max(y_fabs, host_output_amax(0));
+            host_output_amax(0) = ck::type_convert<OutputDataType>(ck::math::max(
+                ck::type_convert<float>(y_fabs), ck::type_convert<float>(host_output_amax(0))));
         }
     }
 }
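Note: the reference amax update now round-trips through float: both |y| and the running maximum are converted to float, compared there, and the winner converted back to OutputDataType, so the comparison never runs on the narrow fp8/fp16 output type. A toy, self-contained sketch of the same pattern; the Fp8Like codec below is invented for illustration and is not ck::f8_t:

#include <algorithm>
#include <cmath>
#include <cstdio>

struct Fp8Like // toy 4-fractional-bit stand-in, NOT ck::f8_t
{
    unsigned char bits;
};

float to_float(Fp8Like x) { return x.bits / 16.0f; }                              // toy decode
Fp8Like from_float(float x) { return {static_cast<unsigned char>(x * 16.0f)}; }   // toy encode

int main()
{
    Fp8Like amax = from_float(0.0f);
    const float samples[] = {-3.5f, 1.25f, 2.75f};
    for(float y0 : samples)
    {
        float y_fabs = std::fabs(y0);
        // max in float, then convert back -- mirrors the new reference code.
        amax = from_float(std::max(y_fabs, to_float(amax)));
    }
    std::printf("amax = %g\n", to_float(amax)); // prints 3.5
}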
example/CMakeLists.txt

@@ -85,9 +85,9 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
     #only continue if there are some source files left on the list
     if(FILE_NAME)
         if(FILE_NAME MATCHES "_xdl")
-            list(REMOVE_ITEM EX_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
         elseif(FILE_NAME MATCHES "_wmma")
-            list(REMOVE_ITEM EX_TARGETS gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
         endif()
     endif()
     set_source_files_properties(${FILE_NAME} PROPERTIES LANGUAGE HIP)
     add_executable(${EXAMPLE_NAME} ${FILE_NAME})
@@ -169,9 +169,9 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
     #only continue if there are some source files left on the list
     if(FILE_NAME)
         if(FILE_NAME MATCHES "_xdl")
-            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
         elseif(FILE_NAME MATCHES "_wmma")
-            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
         endif()
     endif()
     set_source_files_properties(${FILE_NAME} PROPERTIES LANGUAGE HIP)
     add_executable(${EXAMPLE_NAME} ${FILE_NAME})
example/ck_tile/01_fmha/generate.py

@@ -47,6 +47,9 @@ def list_blobs(output_file : Optional[str], api_list : List[str], kernel_filter
     assert output_file is not None
     file_path = Path(output_file)
+
+    # create an empty file / drop its contents if it exists
+    open(file_path, "w").close()
     for api in api_list:
         handler = handlers[api][HandlerId.LIST_BLOBS]
         handler(file_path, kernel_filter, receipt, mask_impl)
example/ck_tile/02_layernorm2d/generate.py

@@ -559,7 +559,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
         w_p = Path(self.working_path)
         list_p = w_p / 'layernorm2d_fwd_blobs.txt'
         blobs = self.get_blobs()
-        with list_p.open('a') as list_f:
+        with list_p.open('w') as list_f:
             # api related file
             list_f.write(str(w_p / (self.name_api + ".cpp")) + "\n")
             list_f.write(str(w_p / (self.name_common_header + ".hpp")) + "\n")
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp

@@ -93,12 +93,12 @@ __global__ void
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
     const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
     const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
     const long_index_t e_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
     const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
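Note: this hunk, and the matching ones in the device and grid headers that follow, wrap each per-batch pointer offset in static_cast<long_index_t> before broadcasting it from the first lane. A plausible motivation, not stated in the commit, is overload/width selection: if the offset reaches the broadcast as a 32-bit value, offsets beyond 2^31 elements silently wrap. A minimal host-side sketch of that hazard, with stand-in broadcast functions (not CK's amd_wave_read_first_lane):

#include <cstdint>
#include <cstdio>

using long_index_t = int64_t;

// Stand-ins for a lane-broadcast primitive: one overload traffics in 32 bits,
// the other preserves 64 bits. Which one is chosen depends on the argument type.
int32_t wave_broadcast(int32_t v) { return v; }
int64_t wave_broadcast(int64_t v) { return v; }

int main()
{
    // A batch offset past 2^31 elements, as can occur in large grouped convolutions.
    long_index_t offset = 3'000'000'000LL;

    // If the offset reaches the broadcast as a 32-bit value, it wraps negative.
    long_index_t bad = wave_broadcast(static_cast<int32_t>(offset));
    // Widening first, as the diff does, selects the 64-bit overload and keeps all bits.
    long_index_t good = wave_broadcast(static_cast<long_index_t>(offset));

    std::printf("truncated: %lld  widened: %lld\n", (long long)bad, (long long)good);
}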
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp

@@ -60,12 +60,12 @@ __global__ void
     const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge);
     const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
     const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
     const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
     const long_index_t e_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
@@ -117,12 +117,12 @@ __global__ void
     const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge);
     const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
     const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
     const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
     const long_index_t e_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
     // Pass two lds pointer is the key to tell compiler that ds_read/write
     // operate on different lds chunk at same time without order dependecy
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp

@@ -98,12 +98,12 @@ __global__ void
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
     const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
     const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
     const long_index_t c_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
     const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp

@@ -60,12 +60,12 @@ __global__ void
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
     const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
     const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
     const long_index_t e_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
     const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
@@ -155,12 +155,12 @@ __global__ void
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
     const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
     const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
     const long_index_t e_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
     const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp

@@ -121,10 +121,10 @@ struct GridwiseTensorRearrange
             __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
         // Global Memory
         const index_t a_batch_offset =
-            __builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+            __builtin_amdgcn_readfirstlane(
+                static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
         const index_t c_batch_offset =
-            __builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx));
+            __builtin_amdgcn_readfirstlane(
+                static_cast<long_index_t>(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx)));
         const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_in_global + a_batch_offset, in_grid_desc.GetElementSpaceSize());
library/include/ck/library/utility/check_err.hpp

@@ -24,7 +24,7 @@ namespace ck {
 namespace utils {

 template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
-double get_relative_threshold(const int numberOfAccumulations = 1)
+double get_relative_threshold(const int number_of_accumulations = 1)
 {
     using F8  = ck::f8_t;
     using F16 = ck::half_t;
@@ -79,13 +79,13 @@ double get_relative_threshold(const int numberOfAccumulations = 1)
     }
     else
     {
-        acc_error = std::pow(2, -NumericUtils<AccDataType>::mant) * 0.5 * numberOfAccumulations;
+        acc_error = std::pow(2, -NumericUtils<AccDataType>::mant) * 0.5 * number_of_accumulations;
     }
     return std::max(acc_error, midway_error);
 }

 template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
-double get_absolute_threshold(const double max_possible_num, const int numberOfAccumulations = 1)
+double get_absolute_threshold(const double max_possible_num, const int number_of_accumulations = 1)
 {
     using F8  = ck::f8_t;
     using F16 = ck::half_t;
@@ -142,7 +142,7 @@ double get_absolute_threshold(const double max_possible_num, const int numberOfA
     else
     {
         acc_error =
-            std::pow(2, expo - NumericUtils<AccDataType>::mant) * 0.5 * numberOfAccumulations;
+            std::pow(2, expo - NumericUtils<AccDataType>::mant) * 0.5 * number_of_accumulations;
     }
     return std::max(acc_error, midway_error);
 }
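Note: beyond the snake_case rename, the formula these hunks touch bounds the accumulated rounding error: each of the number_of_accumulations additions contributes at most half a ULP of the accumulator type, i.e. 2^(-mant) * 0.5 per step, so the threshold grows linearly with the accumulation count. A small self-contained sketch of that scaling; mant = 10 is assumed here as a stand-in for NumericUtils<half_t>::mant:

#include <cmath>
#include <cstdio>
#include <initializer_list>

int main()
{
    const int mant = 10; // fp16 mantissa bits (assumption for illustration)
    // One rounding step per accumulation, each bounded by half a ULP.
    for(int number_of_accumulations : {1, 8, 4096})
    {
        double acc_error = std::pow(2, -mant) * 0.5 * number_of_accumulations;
        std::printf("n = %4d -> relative threshold ~ %g\n", number_of_accumulations, acc_error);
    }
}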
library/src/tensor_operation_instance/gpu/CMakeLists.txt

@@ -88,19 +88,19 @@ function(add_instance_library INSTANCE_NAME)
     foreach(source IN LISTS ARGN)
         set(INST_TARGETS ${SUPPORTED_GPU_TARGETS})
         if(source MATCHES "_xdl")
-            list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+            list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
         elseif(source MATCHES "_wmma")
-            list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
+            list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
         elseif(source MATCHES "mha")
-            list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+            list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
         endif()
         #only build the fp8 gemm instances for gfx908/90a if the build argument is set
         if(NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH)
             if(source MATCHES "gemm_xdl_universal" AND source MATCHES "f8")
-                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
             endif()
             if(source MATCHES "gemm_multiply_multiply_f8")
-                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
             endif()
         endif()
         set(offload_targets)
library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt

@@ -27,11 +27,6 @@ rocm_install(FILES ${MHA_HEADERS} DESTINATION include/ck_tile/ops)
 # headers for building lib
 file(COPY ${MHA_HEADERS} DESTINATION ${FMHA_CPP_FOLDER})
-
-# Delete the blob file if it exists to avoid append of old content.
-if(EXISTS ${FMHA_CPP_FOLDER}/blob_list.txt)
-    file(REMOVE ${FMHA_CPP_FOLDER}/blob_list.txt)
-endif()
 set(FMHA_KNOWN_APIS "fwd,fwd_splitkv,fwd_appendkv,bwd")
 # generate a list of kernels, but not actually emit files at config stage
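Note: taken together with the generate.py changes above (the added open(file_path, "w").close() in 01_fmha and the 'a' to 'w' mode switch in 02_layernorm2d), this deletion moves the stale-blob-list cleanup out of CMake: the generators now truncate the list themselves, so the configure step no longer has to delete it. A hedged illustration of the append-vs-truncate difference, transposed to C++ streams:

#include <fstream>

int main()
{
    // Appending across repeated runs accumulates stale entries...
    { std::ofstream f("blob_list.txt", std::ios::app); f << "stale_kernel.cpp\n"; }
    // ...while the default open mode truncates first, like Python's open(path, "w"),
    // so each (re-)generation starts from an empty list.
    { std::ofstream f("blob_list.txt"); f << "fresh_kernel.cpp\n"; }
}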
profiler/include/profiler/profile_pool3d_fwd_impl.hpp

@@ -240,6 +240,19 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams&
     {
         out_device_buf.FromDevice(out_n_c_do_ho_wo_device.mData.data());

+        auto number_of_accumulations = 1;
+        static_assert(ReduceOpId == ck::ReduceTensorOp::AVG ||
+                          ReduceOpId == ck::ReduceTensorOp::MAX,
+                      "Warning: Unhandled ReduceOpId for setting up the number of accumulations!");
+        if constexpr(ReduceOpId == ck::ReduceTensorOp::AVG)
+        {
+            for(size_t i = 0; i < kernel_params.window_spatial_lengths.size(); ++i)
+            {
+                number_of_accumulations *= kernel_params.window_spatial_lengths.at(i);
+            }
+        }
+
         auto absolute_error_threshold = 1.0;
         switch(in_params.init_method)
         {
@@ -250,9 +263,10 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams&
             absolute_error_threshold =
                 ck::utils::get_absolute_threshold<ComputeDataType, OutDataType>(
-                    absolute_error_threshold);
+                    absolute_error_threshold, number_of_accumulations);
             auto relative_error_threshold =
-                ck::utils::get_relative_threshold<ComputeDataType, OutDataType>();
+                ck::utils::get_relative_threshold<ComputeDataType, OutDataType>(
+                    number_of_accumulations);
             bool pass = ck::utils::check_err(out_n_c_do_ho_wo_device.mData,
                                              out_n_c_do_ho_wo_host.mData,
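Note: the added block mirrors the reference math: an AVG pool accumulates one addend per element of the pooling window, so the accumulation count is the product of the window's spatial lengths, which then feeds both threshold helpers above. A tiny sketch of the count, with made-up window lengths rather than values read from kernel_params:

#include <cstdio>
#include <vector>

int main()
{
    // Hypothetical 3D pooling window (D, H, W).
    std::vector<int> window_spatial_lengths{2, 3, 3};

    int number_of_accumulations = 1;
    for(size_t i = 0; i < window_spatial_lengths.size(); ++i)
        number_of_accumulations *= window_spatial_lengths.at(i);

    // 2 * 3 * 3 = 18 additions feed each averaged output element.
    std::printf("number_of_accumulations = %d\n", number_of_accumulations);
}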
profiler/src/profile_layernorm_fwd.cpp

@@ -85,7 +85,7 @@ int profile_layernorm(int argc, char* argv[])
     if(data_type == ck::DataTypeEnum::Half)
     {
-        ck::profiler::profile_layernorm_impl<F16, F16, F16, F32, F16, F32, false, rank>(
+        ck::profiler::profile_layernorm_impl<F16, F16, F16, F32, F16, F16, false, rank>(
             do_verification, init_method, do_log, time_kernel, length);
     }
     else if(data_type == ck::DataTypeEnum::Float)
test/CMakeLists.txt

@@ -64,11 +64,11 @@ function(add_test_executable TEST_NAME)
     #only continue if there are some source files left on the list
     if(ARGN)
         if(ARGN MATCHES "_xdl")
-            list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+            list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
         elseif(ARGN MATCHES "_wmma")
-            list(REMOVE_ITEM TEST_TARGETS gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
+            list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
         elseif(ARGN MATCHES "_smfmac")
-            list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
+            list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
         endif()
     endif()
     set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
     add_executable(${TEST_NAME} ${ARGN})
@@ -141,11 +141,11 @@ function(add_gtest_executable TEST_NAME)
     #only continue if there are some source files left on the list
     if(ARGN)
         if(ARGN MATCHES "_xdl")
-            list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+            list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
         elseif(ARGN MATCHES "_wmma")
-            list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
+            list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
         elseif(ARGN MATCHES "_smfmac")
-            list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
+            list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
         endif()
     endif()
     set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
     add_executable(${TEST_NAME} ${ARGN})