Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
ea5be216
"vscode:/vscode.git/clone" did not exist on "37b359b2bdb1ec094f9edc458f2389b9d483a960"
Commit
ea5be216
authored
Aug 23, 2024
by
Jun Liu
Browse files
Merge branch 'develop' into amd-develop
parents
e2eb0418
25935b57
Changes
168
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1110 additions
and
59 deletions
+1110
-59
CMakeLists.txt
CMakeLists.txt
+23
-12
Jenkinsfile
Jenkinsfile
+37
-6
client_example/07_grouped_convnd_fwd/CMakeLists.txt
client_example/07_grouped_convnd_fwd/CMakeLists.txt
+3
-3
client_example/10_grouped_convnd_bwd_data/CMakeLists.txt
client_example/10_grouped_convnd_bwd_data/CMakeLists.txt
+4
-2
client_example/11_grouped_conv_bwd_weight/CMakeLists.txt
client_example/11_grouped_conv_bwd_weight/CMakeLists.txt
+5
-2
client_example/16_convnd_fwd/CMakeLists.txt
client_example/16_convnd_fwd/CMakeLists.txt
+1
-1
client_example/20_splitk_gemm/CMakeLists.txt
client_example/20_splitk_gemm/CMakeLists.txt
+1
-1
client_example/24_grouped_conv_activation/CMakeLists.txt
client_example/24_grouped_conv_activation/CMakeLists.txt
+20
-4
client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/common.hpp
...activation/grouped_convnd_fwd_convscale_reduce/common.hpp
+834
-0
client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_amax_fp8.cpp
...nd_fwd_convscale_reduce/conv3d_fwd_convscale_amax_fp8.cpp
+58
-0
client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_relu_amax_fp8.cpp
...d_convscale_reduce/conv3d_fwd_convscale_relu_amax_fp8.cpp
+58
-0
client_example/CMakeLists.txt
client_example/CMakeLists.txt
+11
-2
codegen/CMakeLists.txt
codegen/CMakeLists.txt
+2
-2
codegen/test/CMakeLists.txt
codegen/test/CMakeLists.txt
+16
-12
codegen/test/rtc/CMakeLists.txt
codegen/test/rtc/CMakeLists.txt
+0
-2
docs/sphinx/requirements.in
docs/sphinx/requirements.in
+1
-1
docs/sphinx/requirements.txt
docs/sphinx/requirements.txt
+1
-1
example/01_gemm/gemm_xdl_fp8.cpp
example/01_gemm/gemm_xdl_fp8.cpp
+2
-2
example/01_gemm/run_gemm_example.inc
example/01_gemm/run_gemm_example.inc
+5
-5
example/12_reduce/reduce_blockwise.cpp
example/12_reduce/reduce_blockwise.cpp
+28
-1
No files found.
CMakeLists.txt
View file @
ea5be216
...
@@ -62,8 +62,14 @@ if (DTYPES)
...
@@ -62,8 +62,14 @@ if (DTYPES)
endif
()
endif
()
message
(
"DTYPES macro set to
${
DTYPES
}
"
)
message
(
"DTYPES macro set to
${
DTYPES
}
"
)
else
()
else
()
add_definitions
(
-DCK_ENABLE_INT8 -DCK_ENABLE_FP8 -DCK_ENABLE_BF8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16
)
add_definitions
(
-DCK_ENABLE_INT8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16 -DCK_ENABLE_FP8 -DCK_ENABLE_BF8
)
set
(
CK_ENABLE_ALL_DTYPES
"ON"
)
set
(
CK_ENABLE_INT8
"ON"
)
set
(
CK_ENABLE_FP16
"ON"
)
set
(
CK_ENABLE_FP32
"ON"
)
set
(
CK_ENABLE_FP64
"ON"
)
set
(
CK_ENABLE_BF16
"ON"
)
set
(
CK_ENABLE_FP8
"ON"
)
set
(
CK_ENABLE_BF8
"ON"
)
endif
()
endif
()
#for f8/bf8_t type
#for f8/bf8_t type
...
@@ -182,12 +188,18 @@ endif()
...
@@ -182,12 +188,18 @@ endif()
configure_file
(
include/ck/config.h.in
${
CMAKE_CURRENT_BINARY_DIR
}
/include/ck/config.h
)
configure_file
(
include/ck/config.h.in
${
CMAKE_CURRENT_BINARY_DIR
}
/include/ck/config.h
)
if
(
NOT WIN32 AND
${
hip_VERSION_FLAT
}
GREATER 500723302
)
if
(
NOT WIN32 AND
${
hip_VERSION_FLAT
}
GREATER 500723302
)
message
(
"Adding the fno-offload-uniform-block compiler flag"
)
check_cxx_compiler_flag
(
"-fno-offload-uniform-block"
HAS_NO_OFFLOAD_UNIFORM_BLOCK
)
add_compile_options
(
-fno-offload-uniform-block
)
if
(
HAS_NO_OFFLOAD_UNIFORM_BLOCK
)
message
(
"Adding the fno-offload-uniform-block compiler flag"
)
add_compile_options
(
-fno-offload-uniform-block
)
endif
()
endif
()
endif
()
if
(
NOT WIN32 AND
${
hip_VERSION_FLAT
}
GREATER 600140090
)
if
(
NOT WIN32 AND
${
hip_VERSION_FLAT
}
GREATER 600140090
)
message
(
"Adding the enable-post-misched=0 compiler flag"
)
check_cxx_compiler_flag
(
"-mllvm -enable-post-misched=0"
HAS_ENABLE_POST_MISCHED
)
add_compile_options
(
"SHELL: -mllvm -enable-post-misched=0"
)
if
(
HAS_ENABLE_POST_MISCHED
)
message
(
"Adding the enable-post-misched=0 compiler flag"
)
add_compile_options
(
"SHELL: -mllvm -enable-post-misched=0"
)
endif
()
endif
()
endif
()
set
(
check-coerce
)
set
(
check-coerce
)
check_cxx_compiler_flag
(
" -mllvm -amdgpu-coerce-illegal-types=1"
check-coerce
)
check_cxx_compiler_flag
(
" -mllvm -amdgpu-coerce-illegal-types=1"
check-coerce
)
...
@@ -541,12 +553,7 @@ if(NOT DEFINED INSTANCES_ONLY)
...
@@ -541,12 +553,7 @@ if(NOT DEFINED INSTANCES_ONLY)
PACKAGE_NAME examples
PACKAGE_NAME examples
)
)
add_subdirectory
(
example
)
add_subdirectory
(
example
)
if
(
GPU_TARGETS MATCHES
"gfx9"
AND NOT INSTANCES_ONLY
)
add_subdirectory
(
test
)
add_subdirectory
(
codegen
)
endif
()
if
(
BUILD_TESTING
)
add_subdirectory
(
test
)
endif
()
rocm_package_setup_component
(
profiler
rocm_package_setup_component
(
profiler
LIBRARY_NAME composablekernel
LIBRARY_NAME composablekernel
...
@@ -563,6 +570,10 @@ if(NOT DEFINED INSTANCES_ONLY)
...
@@ -563,6 +570,10 @@ if(NOT DEFINED INSTANCES_ONLY)
endif
()
endif
()
endif
()
endif
()
if
(
NOT DEFINED PROFILER_ONLY
AND
(
GPU_TARGETS MATCHES
"gfx9"
OR DEFINED INSTANCES_ONLY
))
add_subdirectory
(
codegen
)
endif
()
#Create an interface target for the include only files and call it "composablekernels"
#Create an interface target for the include only files and call it "composablekernels"
include
(
CMakePackageConfigHelpers
)
include
(
CMakePackageConfigHelpers
)
...
...
Jenkinsfile
View file @
ea5be216
...
@@ -426,8 +426,9 @@ def runCKProfiler(Map conf=[:]){
...
@@ -426,8 +426,9 @@ def runCKProfiler(Map conf=[:]){
archiveArtifacts
"perf_resnet50_N4.log"
archiveArtifacts
"perf_resnet50_N4.log"
archiveArtifacts
"perf_batched_gemm.log"
archiveArtifacts
"perf_batched_gemm.log"
archiveArtifacts
"perf_grouped_gemm.log"
archiveArtifacts
"perf_grouped_gemm.log"
archiveArtifacts
"perf_conv_fwd.log"
archiveArtifacts
"perf_grouped_conv_fwd.log"
archiveArtifacts
"perf_conv_bwd_data.log"
archiveArtifacts
"perf_grouped_conv_bwd_data.log"
archiveArtifacts
"perf_grouped_conv_bwd_weight.log"
archiveArtifacts
"perf_gemm_bilinear.log"
archiveArtifacts
"perf_gemm_bilinear.log"
archiveArtifacts
"perf_reduction.log"
archiveArtifacts
"perf_reduction.log"
archiveArtifacts
"perf_splitK_gemm.log"
archiveArtifacts
"perf_splitK_gemm.log"
...
@@ -439,8 +440,9 @@ def runCKProfiler(Map conf=[:]){
...
@@ -439,8 +440,9 @@ def runCKProfiler(Map conf=[:]){
stash
name:
"perf_resnet50_N4.log"
stash
name:
"perf_resnet50_N4.log"
stash
name:
"perf_batched_gemm.log"
stash
name:
"perf_batched_gemm.log"
stash
name:
"perf_grouped_gemm.log"
stash
name:
"perf_grouped_gemm.log"
stash
name:
"perf_conv_fwd.log"
stash
name:
"perf_grouped_conv_fwd.log"
stash
name:
"perf_conv_bwd_data.log"
stash
name:
"perf_grouped_conv_bwd_data.log"
stash
name:
"perf_grouped_conv_bwd_weight.log"
stash
name:
"perf_gemm_bilinear.log"
stash
name:
"perf_gemm_bilinear.log"
stash
name:
"perf_reduction.log"
stash
name:
"perf_reduction.log"
stash
name:
"perf_splitK_gemm.log"
stash
name:
"perf_splitK_gemm.log"
...
@@ -648,8 +650,9 @@ def process_results(Map conf=[:]){
...
@@ -648,8 +650,9 @@ def process_results(Map conf=[:]){
unstash
"perf_resnet50_N4.log"
unstash
"perf_resnet50_N4.log"
unstash
"perf_batched_gemm.log"
unstash
"perf_batched_gemm.log"
unstash
"perf_grouped_gemm.log"
unstash
"perf_grouped_gemm.log"
unstash
"perf_conv_fwd.log"
unstash
"perf_grouped_conv_fwd.log"
unstash
"perf_conv_bwd_data.log"
unstash
"perf_grouped_conv_bwd_data.log"
unstash
"perf_grouped_conv_bwd_weight.log"
unstash
"perf_gemm_bilinear.log"
unstash
"perf_gemm_bilinear.log"
unstash
"perf_reduction.log"
unstash
"perf_reduction.log"
unstash
"perf_splitK_gemm.log"
unstash
"perf_splitK_gemm.log"
...
@@ -746,6 +749,10 @@ pipeline {
...
@@ -746,6 +749,10 @@ pipeline {
name:
"RUN_PERFORMANCE_TESTS"
,
name:
"RUN_PERFORMANCE_TESTS"
,
defaultValue:
true
,
defaultValue:
true
,
description:
"Run the performance tests (default: ON)"
)
description:
"Run the performance tests (default: ON)"
)
booleanParam
(
name:
"RUN_GROUPED_CONV_LARGE_CASES_TESTS"
,
defaultValue:
false
,
description:
"Run the grouped conv large cases tests (default: OFF)"
)
booleanParam
(
booleanParam
(
name:
"RUN_CK_TILE_TESTS"
,
name:
"RUN_CK_TILE_TESTS"
,
defaultValue:
false
,
defaultValue:
false
,
...
@@ -837,6 +844,30 @@ pipeline {
...
@@ -837,6 +844,30 @@ pipeline {
}
}
}
}
}
}
stage
(
"Run Grouped Conv Large Case Tests"
)
{
parallel
{
stage
(
"Run Grouped Conv Large Case Tests on gfx90a"
)
{
when
{
beforeAgent
true
expression
{
params
.
RUN_GROUPED_CONV_LARGE_CASES_TESTS
.
toBoolean
()
}
}
agent
{
label
rocmnode
(
"gfx90a"
)}
environment
{
setup_args
=
"NO_CK_BUILD"
execute_args
=
""" ../script/cmake-ck-dev.sh ../ gfx90a && \
make -j64 test_grouped_convnd_fwd_large_cases_xdl && \
./bin/test_grouped_convnd_fwd_large_cases_xdl"""
}
steps
{
buildHipClangJobAndReboot
(
setup_args:
setup_args
,
no_reboot:
true
,
build_type:
'Release'
,
execute_cmd:
execute_args
)
cleanWs
()
}
}
}
}
stage
(
"Run CK_TILE Tests"
)
stage
(
"Run CK_TILE Tests"
)
{
{
parallel
parallel
...
...
client_example/07_grouped_convnd_fwd/CMakeLists.txt
View file @
ea5be216
...
@@ -5,17 +5,17 @@ if(GPU_TARGETS MATCHES "gfx9")
...
@@ -5,17 +5,17 @@ if(GPU_TARGETS MATCHES "gfx9")
add_executable
(
client_grouped_conv1d_fwd grouped_conv1d_fwd.cpp
)
add_executable
(
client_grouped_conv1d_fwd grouped_conv1d_fwd.cpp
)
target_link_libraries
(
client_grouped_conv1d_fwd PRIVATE composable_kernel::device_conv_operations
)
target_link_libraries
(
client_grouped_conv1d_fwd PRIVATE composable_kernel::device_conv_operations
)
if
((
DTYPES MATCHES
"fp8"
)
OR NOT DEFINED DTYPES
)
if
((
DTYPES MATCHES
"fp8"
)
OR
(
NOT DEFINED DTYPES
AND GPU_TARGETS MATCHES
"gfx94"
)
)
add_executable
(
client_grouped_conv3d_fwd_fp8 grouped_conv3d_fwd_fp8.cpp
)
add_executable
(
client_grouped_conv3d_fwd_fp8 grouped_conv3d_fwd_fp8.cpp
)
target_link_libraries
(
client_grouped_conv3d_fwd_fp8 PRIVATE composable_kernel::device_conv_operations
)
target_link_libraries
(
client_grouped_conv3d_fwd_fp8 PRIVATE composable_kernel::device_conv_operations
)
endif
()
endif
()
if
((
DTYPES MATCHES
"bf8"
)
OR NOT DEFINED DTYPES
)
if
((
DTYPES MATCHES
"bf8"
)
OR
(
NOT DEFINED DTYPES
AND GPU_TARGETS MATCHES
"gfx94"
)
)
add_executable
(
client_grouped_conv3d_fwd_bf8 grouped_conv3d_fwd_bf8.cpp
)
add_executable
(
client_grouped_conv3d_fwd_bf8 grouped_conv3d_fwd_bf8.cpp
)
target_link_libraries
(
client_grouped_conv3d_fwd_bf8 PRIVATE composable_kernel::device_conv_operations
)
target_link_libraries
(
client_grouped_conv3d_fwd_bf8 PRIVATE composable_kernel::device_conv_operations
)
endif
()
endif
()
if
((
DTYPES MATCHES
"fp8"
AND DTYPES MATCHES
"bf8"
)
OR NOT DEFINED DTYPES
)
if
((
DTYPES MATCHES
"fp8"
AND DTYPES MATCHES
"bf8"
)
OR
(
NOT DEFINED DTYPES
AND GPU_TARGETS MATCHES
"gfx94"
)
)
add_executable
(
client_grouped_conv3d_fwd_fp8_bf8 grouped_conv3d_fwd_fp8_bf8.cpp
)
add_executable
(
client_grouped_conv3d_fwd_fp8_bf8 grouped_conv3d_fwd_fp8_bf8.cpp
)
target_link_libraries
(
client_grouped_conv3d_fwd_fp8_bf8 PRIVATE composable_kernel::device_conv_operations
)
target_link_libraries
(
client_grouped_conv3d_fwd_fp8_bf8 PRIVATE composable_kernel::device_conv_operations
)
...
...
client_example/10_grouped_convnd_bwd_data/CMakeLists.txt
View file @
ea5be216
...
@@ -4,5 +4,7 @@ target_link_libraries(client_grouped_conv2d_bwd_data PRIVATE composable_kernel::
...
@@ -4,5 +4,7 @@ target_link_libraries(client_grouped_conv2d_bwd_data PRIVATE composable_kernel::
add_executable
(
client_grouped_conv3d_bwd_data grouped_conv3d_bwd_data.cpp
)
add_executable
(
client_grouped_conv3d_bwd_data grouped_conv3d_bwd_data.cpp
)
target_link_libraries
(
client_grouped_conv3d_bwd_data PRIVATE composable_kernel::device_conv_operations
)
target_link_libraries
(
client_grouped_conv3d_bwd_data PRIVATE composable_kernel::device_conv_operations
)
add_executable
(
client_grouped_conv3d_bwd_data_input_fp16_comp_bf8f8 grouped_conv3d_bwd_data_input_fp16_comp_bf8f8.cpp
)
if
((
DTYPES MATCHES
"fp8"
AND DTYPES MATCHES
"bf8"
)
OR
(
NOT DEFINED DTYPES AND GPU_TARGETS MATCHES
"gfx94"
))
target_link_libraries
(
client_grouped_conv3d_bwd_data_input_fp16_comp_bf8f8 PRIVATE composable_kernel::device_conv_operations
)
add_executable
(
client_grouped_conv3d_bwd_data_input_fp16_comp_bf8f8 grouped_conv3d_bwd_data_input_fp16_comp_bf8f8.cpp
)
target_link_libraries
(
client_grouped_conv3d_bwd_data_input_fp16_comp_bf8f8 PRIVATE composable_kernel::device_conv_operations
)
endif
()
\ No newline at end of file
client_example/11_grouped_conv_bwd_weight/CMakeLists.txt
View file @
ea5be216
...
@@ -2,10 +2,13 @@ add_executable(client_grouped_conv1d_bwd_weight_fp16 grouped_conv1d_bwd_weight_f
...
@@ -2,10 +2,13 @@ add_executable(client_grouped_conv1d_bwd_weight_fp16 grouped_conv1d_bwd_weight_f
add_executable
(
client_grouped_conv2d_bwd_weight_fp16 grouped_conv2d_bwd_weight_fp16.cpp
)
add_executable
(
client_grouped_conv2d_bwd_weight_fp16 grouped_conv2d_bwd_weight_fp16.cpp
)
add_executable
(
client_grouped_conv3d_bwd_weight_fp16 grouped_conv3d_bwd_weight_fp16.cpp
)
add_executable
(
client_grouped_conv3d_bwd_weight_fp16 grouped_conv3d_bwd_weight_fp16.cpp
)
add_executable
(
client_grouped_conv3d_bwd_weight_fp32 grouped_conv3d_bwd_weight_fp32.cpp
)
add_executable
(
client_grouped_conv3d_bwd_weight_fp32 grouped_conv3d_bwd_weight_fp32.cpp
)
add_executable
(
client_grouped_conv3d_bwd_weight_fp16_comp_bf8_fp8 grouped_conv3d_bwd_weight_fp16_comp_bf8_fp8.cpp
)
target_link_libraries
(
client_grouped_conv1d_bwd_weight_fp16 PRIVATE composable_kernel::device_conv_operations
)
target_link_libraries
(
client_grouped_conv1d_bwd_weight_fp16 PRIVATE composable_kernel::device_conv_operations
)
target_link_libraries
(
client_grouped_conv2d_bwd_weight_fp16 PRIVATE composable_kernel::device_conv_operations
)
target_link_libraries
(
client_grouped_conv2d_bwd_weight_fp16 PRIVATE composable_kernel::device_conv_operations
)
target_link_libraries
(
client_grouped_conv3d_bwd_weight_fp16 PRIVATE composable_kernel::device_conv_operations
)
target_link_libraries
(
client_grouped_conv3d_bwd_weight_fp16 PRIVATE composable_kernel::device_conv_operations
)
target_link_libraries
(
client_grouped_conv3d_bwd_weight_fp32 PRIVATE composable_kernel::device_conv_operations
)
target_link_libraries
(
client_grouped_conv3d_bwd_weight_fp32 PRIVATE composable_kernel::device_conv_operations
)
target_link_libraries
(
client_grouped_conv3d_bwd_weight_fp16_comp_bf8_fp8 PRIVATE composable_kernel::device_conv_operations
)
if
((
DTYPES MATCHES
"fp8"
AND DTYPES MATCHES
"bf8"
)
OR
(
NOT DEFINED DTYPES AND GPU_TARGETS MATCHES
"gfx94"
))
add_executable
(
client_grouped_conv3d_bwd_weight_fp16_comp_bf8_fp8 grouped_conv3d_bwd_weight_fp16_comp_bf8_fp8.cpp
)
target_link_libraries
(
client_grouped_conv3d_bwd_weight_fp16_comp_bf8_fp8 PRIVATE composable_kernel::device_conv_operations
)
endif
()
\ No newline at end of file
client_example/16_convnd_fwd/CMakeLists.txt
View file @
ea5be216
...
@@ -4,7 +4,7 @@ if((DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES)
...
@@ -4,7 +4,7 @@ if((DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES)
endif
()
endif
()
if
((
DTYPES MATCHES
"fp8"
)
OR NOT DEFINED DTYPES
)
if
((
DTYPES MATCHES
"fp8"
)
OR
(
NOT DEFINED DTYPES
AND GPU_TARGETS MATCHES
"gfx94"
)
)
add_executable
(
client_conv3d_fwd_fp16_comp_fp8 conv3d_fwd_fp16_comp_fp8.cpp
)
add_executable
(
client_conv3d_fwd_fp16_comp_fp8 conv3d_fwd_fp16_comp_fp8.cpp
)
target_link_libraries
(
client_conv3d_fwd_fp16_comp_fp8 PRIVATE composable_kernel::device_conv_operations
)
target_link_libraries
(
client_conv3d_fwd_fp16_comp_fp8 PRIVATE composable_kernel::device_conv_operations
)
endif
()
endif
()
...
...
client_example/20_splitk_gemm/CMakeLists.txt
View file @
ea5be216
if
(
GPU_TARGETS MATCHES
"gfx9"
AND
((
DTYPES MATCHES
"fp8"
AND DTYPES MATCHES
"fp16"
)
OR NOT DEFINED DTYPES
))
if
((
DTYPES MATCHES
"fp8"
AND DTYPES MATCHES
"fp16"
)
OR
(
NOT DEFINED DTYPES
AND GPU_TARGETS MATCHES
"gfx94"
))
add_executable
(
client_splitK_gemm splitK_gemm_fp16_f8.cpp
)
add_executable
(
client_splitK_gemm splitK_gemm_fp16_f8.cpp
)
target_link_libraries
(
client_splitK_gemm PRIVATE composable_kernel::device_gemm_operations
)
target_link_libraries
(
client_splitK_gemm PRIVATE composable_kernel::device_gemm_operations
)
endif
()
endif
()
client_example/24_grouped_conv_activation/CMakeLists.txt
View file @
ea5be216
if
(
GPU_TARGETS MATCHES
"gfx9"
)
if
(
GPU_TARGETS MATCHES
"gfx9"
)
# Fwd scaleadd scaleadd relu
# Fwd scaleadd scaleadd relu
add_executable
(
client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32
add_executable
(
client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32
grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp32.cpp
)
grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp32.cpp
)
target_link_libraries
(
client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32 PRIVATE composable_kernel::device_conv_operations
)
target_link_libraries
(
client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32 PRIVATE composable_kernel::device_conv_operations
)
...
@@ -36,7 +36,7 @@ add_executable(client_grouped_convnd_fwd_bilinear_residual_fp16
...
@@ -36,7 +36,7 @@ add_executable(client_grouped_convnd_fwd_bilinear_residual_fp16
grouped_convnd_fwd_bilinear/grouped_conv_fwd_bilinear_residual_fp16.cpp
)
grouped_convnd_fwd_bilinear/grouped_conv_fwd_bilinear_residual_fp16.cpp
)
target_link_libraries
(
client_grouped_convnd_fwd_bilinear_residual_fp16 PRIVATE composable_kernel::device_conv_operations
)
target_link_libraries
(
client_grouped_convnd_fwd_bilinear_residual_fp16 PRIVATE composable_kernel::device_conv_operations
)
# Fwd convinvscale
# Fwd convinvscale
add_executable
(
client_conv3d_fwd_convinvscale_fp8
add_executable
(
client_conv3d_fwd_convinvscale_fp8
grouped_convnd_fwd_convinvscale/conv3d_fwd_convinvscale_fp8.cpp
)
grouped_convnd_fwd_convinvscale/conv3d_fwd_convinvscale_fp8.cpp
)
target_link_libraries
(
client_conv3d_fwd_convinvscale_fp8 PRIVATE composable_kernel::device_conv_operations
)
target_link_libraries
(
client_conv3d_fwd_convinvscale_fp8 PRIVATE composable_kernel::device_conv_operations
)
# Fwd convscale + Bias
# Fwd convscale + Bias
...
@@ -47,6 +47,22 @@ target_link_libraries(client_conv3d_fwd_convscale_add_fp8 PRIVATE composable_ker
...
@@ -47,6 +47,22 @@ target_link_libraries(client_conv3d_fwd_convscale_add_fp8 PRIVATE composable_ker
add_executable
(
client_conv3d_fwd_convscale_relu_fp8
add_executable
(
client_conv3d_fwd_convscale_relu_fp8
grouped_convnd_fwd_convscale_relu/conv3d_fwd_convscale_relu_fp8.cpp
)
grouped_convnd_fwd_convscale_relu/conv3d_fwd_convscale_relu_fp8.cpp
)
target_link_libraries
(
client_conv3d_fwd_convscale_relu_fp8 PRIVATE composable_kernel::device_conv_operations
)
target_link_libraries
(
client_conv3d_fwd_convscale_relu_fp8 PRIVATE composable_kernel::device_conv_operations
)
# Fwd convscale + ReLU + AMAX
add_executable
(
client_conv3d_fwd_convscale_relu_amax_fp8
grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_relu_amax_fp8.cpp
)
target_link_libraries
(
client_conv3d_fwd_convscale_relu_amax_fp8
PRIVATE composable_kernel::device_conv_operations
composable_kernel::device_other_operations
composable_kernel::device_reduction_operations
utility
)
# Fwd convscale + AMAX
add_executable
(
client_conv3d_fwd_convscale_amax_fp8
grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_amax_fp8.cpp
)
target_link_libraries
(
client_conv3d_fwd_convscale_amax_fp8
PRIVATE composable_kernel::device_conv_operations
composable_kernel::device_other_operations
composable_kernel::device_reduction_operations
utility
)
# Fwd convscale
# Fwd convscale
add_executable
(
client_conv3d_fwd_convscale_fp8
add_executable
(
client_conv3d_fwd_convscale_fp8
grouped_convnd_fwd_convscale/conv3d_fwd_convscale_fp8.cpp
)
grouped_convnd_fwd_convscale/conv3d_fwd_convscale_fp8.cpp
)
...
@@ -56,11 +72,11 @@ add_executable(client_conv3d_fwd_convscale_bf8
...
@@ -56,11 +72,11 @@ add_executable(client_conv3d_fwd_convscale_bf8
grouped_convnd_fwd_convscale/conv3d_fwd_convscale_bf8.cpp
)
grouped_convnd_fwd_convscale/conv3d_fwd_convscale_bf8.cpp
)
target_link_libraries
(
client_conv3d_fwd_convscale_bf8 PRIVATE composable_kernel::device_conv_operations
)
target_link_libraries
(
client_conv3d_fwd_convscale_bf8 PRIVATE composable_kernel::device_conv_operations
)
add_executable
(
client_conv3d_fwd_convscale_fp8_bf8
add_executable
(
client_conv3d_fwd_convscale_fp8_bf8
grouped_convnd_fwd_convscale/conv3d_fwd_convscale_fp8_bf8.cpp
)
grouped_convnd_fwd_convscale/conv3d_fwd_convscale_fp8_bf8.cpp
)
target_link_libraries
(
client_conv3d_fwd_convscale_fp8_bf8 PRIVATE composable_kernel::device_conv_operations
)
target_link_libraries
(
client_conv3d_fwd_convscale_fp8_bf8 PRIVATE composable_kernel::device_conv_operations
)
add_executable
(
client_conv3d_fwd_convscale_bf8_fp8
add_executable
(
client_conv3d_fwd_convscale_bf8_fp8
grouped_convnd_fwd_convscale/conv3d_fwd_convscale_bf8_fp8.cpp
)
grouped_convnd_fwd_convscale/conv3d_fwd_convscale_bf8_fp8.cpp
)
target_link_libraries
(
client_conv3d_fwd_convscale_bf8_fp8 PRIVATE composable_kernel::device_conv_operations
)
target_link_libraries
(
client_conv3d_fwd_convscale_bf8_fp8 PRIVATE composable_kernel::device_conv_operations
)
# Bwd data bilinear
# Bwd data bilinear
...
...
client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/common.hpp
0 → 100644
View file @
ea5be216
This diff is collapsed.
Click to expand it.
client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_amax_fp8.cpp
0 → 100644
View file @
ea5be216
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
// Compile-time configuration for the 3D forward convolution + scale + AMAX example.

// Element types of the input and weight tensors.
using InDataType  = ck::f8_t;
using WeiDataType = ck::f8_t;

// NOTE(review): CShuffleDataType is not referenced in the visible part of this
// translation unit - confirm whether it is still needed.
using CShuffleDataType = float;

using ConvOutDataType = float;    // data type of convolution result
using OutDataType     = ck::f8_t; // data type of final result

// Compute types forwarded to the example runner as its last two template arguments.
using AComputeDataType = ck::f8_t;
using BComputeDataType = ck::f8_t;

// Element-wise operation forwarded to the example runner.
using ConvElementOp = ConvScale;

// Tensor layouts for input, weight and output.
using InLayout  = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWGK;

// Reduction selected for this example.
constexpr auto ReduceOpId = ck::ReduceTensorOp::AMAX;

static constexpr ck::index_t NumDimSpatial = 3;

// Problem sizes. main() passes them as three lists whose order matches the letters
// of the layout names above: {N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C} and
// {N, Do, Ho, Wo, G, K}.
static constexpr ck::index_t G = 1;
static constexpr ck::index_t N = 64;
static constexpr ck::index_t K = 128;
static constexpr ck::index_t C = 64;

// Z/Y/X entries of the weight list.
static constexpr ck::index_t Z = 3;
static constexpr ck::index_t Y = 3;
static constexpr ck::index_t X = 3;

// D/H/W entries of the input list.
static constexpr ck::index_t Di = 28;
static constexpr ck::index_t Hi = 28;
static constexpr ck::index_t Wi = 3;

// D/H/W entries of the output list.
static constexpr ck::index_t Do = 28;
static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 3;
int
main
()
{
return
run_grouped_conv_fwd_convscale_reduce
<
NumDimSpatial
,
InDataType
,
WeiDataType
,
ConvOutDataType
,
OutDataType
,
ConvElementOp
,
ReduceOpId
,
InLayout
,
WeiLayout
,
OutLayout
,
3
,
AComputeDataType
,
BComputeDataType
>
(
{
N
,
Di
,
Hi
,
Wi
,
G
,
C
},
{
G
,
K
,
Z
,
Y
,
X
,
C
},
{
N
,
Do
,
Ho
,
Wo
,
G
,
K
})
?
EXIT_SUCCESS
:
EXIT_FAILURE
;
}
client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_relu_amax_fp8.cpp
0 → 100644
View file @
ea5be216
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
// Compile-time configuration for the 3D forward convolution + scale + ReLU + AMAX example.

// Element types of the input and weight tensors.
using InDataType  = ck::f8_t;
using WeiDataType = ck::f8_t;

// NOTE(review): CShuffleDataType is not referenced in the visible part of this
// translation unit - confirm whether it is still needed.
using CShuffleDataType = float;

using ConvOutDataType = float;    // data type of convolution result
using OutDataType     = ck::f8_t; // data type of final result

// Compute types forwarded to the example runner as its last two template arguments.
using AComputeDataType = ck::f8_t;
using BComputeDataType = ck::f8_t;

// Element-wise operation forwarded to the example runner.
using ConvElementOp = ConvScaleRelu;

// Tensor layouts for input, weight and output.
using InLayout  = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWGK;

// Reduction selected for this example.
constexpr auto ReduceOpId = ck::ReduceTensorOp::AMAX;

static constexpr ck::index_t NumDimSpatial = 3;

// Problem sizes. main() passes them as three lists whose order matches the letters
// of the layout names above: {N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C} and
// {N, Do, Ho, Wo, G, K}.
static constexpr ck::index_t G = 1;
static constexpr ck::index_t N = 64;
static constexpr ck::index_t K = 128;
static constexpr ck::index_t C = 64;

// Z/Y/X entries of the weight list.
static constexpr ck::index_t Z = 3;
static constexpr ck::index_t Y = 3;
static constexpr ck::index_t X = 3;

// D/H/W entries of the input list.
static constexpr ck::index_t Di = 28;
static constexpr ck::index_t Hi = 28;
static constexpr ck::index_t Wi = 3;

// D/H/W entries of the output list.
static constexpr ck::index_t Do = 28;
static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 3;
int
main
()
{
return
run_grouped_conv_fwd_convscale_reduce
<
NumDimSpatial
,
InDataType
,
WeiDataType
,
ConvOutDataType
,
OutDataType
,
ConvElementOp
,
ReduceOpId
,
InLayout
,
WeiLayout
,
OutLayout
,
3
,
AComputeDataType
,
BComputeDataType
>
(
{
N
,
Di
,
Hi
,
Wi
,
G
,
C
},
{
G
,
K
,
Z
,
Y
,
X
,
C
},
{
N
,
Do
,
Ho
,
Wo
,
G
,
K
})
?
EXIT_SUCCESS
:
EXIT_FAILURE
;
}
client_example/CMakeLists.txt
View file @
ea5be216
...
@@ -34,8 +34,17 @@ if (DTYPES)
...
@@ -34,8 +34,17 @@ if (DTYPES)
endif
()
endif
()
message
(
"DTYPES macro set to
${
DTYPES
}
"
)
message
(
"DTYPES macro set to
${
DTYPES
}
"
)
else
()
else
()
add_definitions
(
-DCK_ENABLE_INT8 -DCK_ENABLE_FP8 -DCK_ENABLE_BF8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16
)
add_definitions
(
-DCK_ENABLE_INT8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16
)
set
(
CK_ENABLE_ALL_DTYPES
"ON"
)
set
(
CK_ENABLE_INT8
"ON"
)
set
(
CK_ENABLE_FP16
"ON"
)
set
(
CK_ENABLE_FP32
"ON"
)
set
(
CK_ENABLE_FP64
"ON"
)
set
(
CK_ENABLE_BF16
"ON"
)
if
(
GPU_TARGETS MATCHES
"gfx94"
)
add_definitions
(
-DCK_ENABLE_FP8 -DCK_ENABLE_BF8
)
set
(
CK_ENABLE_FP8
"ON"
)
set
(
CK_ENABLE_BF8
"ON"
)
endif
()
endif
()
endif
()
if
(
GPU_TARGETS
)
if
(
GPU_TARGETS
)
...
...
codegen/CMakeLists.txt
View file @
ea5be216
...
@@ -27,6 +27,8 @@ file(GLOB_RECURSE KERNEL_FILES CONFIGURE_DEPENDS
...
@@ -27,6 +27,8 @@ file(GLOB_RECURSE KERNEL_FILES CONFIGURE_DEPENDS
add_embed_library
(
ck_headers
${
KERNEL_FILES
}
RELATIVE
${
CK_ROOT
}
/include
)
add_embed_library
(
ck_headers
${
KERNEL_FILES
}
RELATIVE
${
CK_ROOT
}
/include
)
file
(
GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp
)
file
(
GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp
)
##message(STATUS "SOURCE_FILES: ${SOURCES}")
# TODO: Use object library
# TODO: Use object library
add_library
(
ck_host STATIC
${
SOURCES
}
)
add_library
(
ck_host STATIC
${
SOURCES
}
)
target_link_libraries
(
ck_host PRIVATE ck_headers
)
target_link_libraries
(
ck_host PRIVATE ck_headers
)
...
@@ -48,6 +50,4 @@ rocm_install(
...
@@ -48,6 +50,4 @@ rocm_install(
)
)
rocm_install
(
DIRECTORY include/ck DESTINATION
${
CMAKE_INSTALL_INCLUDEDIR
}
)
rocm_install
(
DIRECTORY include/ck DESTINATION
${
CMAKE_INSTALL_INCLUDEDIR
}
)
if
(
BUILD_TESTING
)
add_subdirectory
(
test
)
add_subdirectory
(
test
)
endif
()
codegen/test/CMakeLists.txt
View file @
ea5be216
list
(
APPEND CMAKE_PREFIX_PATH /opt/rocm
)
list
(
APPEND CMAKE_PREFIX_PATH /opt/rocm
)
add_subdirectory
(
rtc
)
add_subdirectory
(
rtc
)
file
(
GLOB TEST_SRCS CONFIGURE_DEPENDS *.cpp
)
file
(
GLOB TEST_SRCS CONFIGURE_DEPENDS *.cpp
)
foreach
(
TEST_SRC
${
TEST_SRCS
}
)
if
(
NOT INSTANCES_ONLY
)
set_source_files_properties
(
${
TEST_SRC
}
PROPERTIES LANGUAGE HIP
)
foreach
(
TEST_SRC
${
TEST_SRCS
}
)
get_filename_component
(
BASE_NAME
${
TEST_SRC
}
NAME_WE
)
set_source_files_properties
(
${
TEST_SRC
}
PROPERTIES LANGUAGE HIP
)
add_executable
(
test_host_
${
BASE_NAME
}
${
TEST_SRC
}
)
get_filename_component
(
BASE_NAME
${
TEST_SRC
}
NAME_WE
)
add_dependencies
(
codegen test_host_
${
BASE_NAME
}
)
add_executable
(
codegen_test_
${
BASE_NAME
}
${
TEST_SRC
}
)
add_test
(
NAME codegen_test_
${
BASE_NAME
}
COMMAND test_host_
${
BASE_NAME
}
)
add_dependencies
(
codegen codegen_test_
${
BASE_NAME
}
)
target_link_libraries
(
test_host_
${
BASE_NAME
}
ck_rtc ck_host
)
add_dependencies
(
tests codegen_test_
${
BASE_NAME
}
)
# target_link_libraries(test_host_${BASE_NAME} ${CK_ROOT}/build/lib/libutility.a)
add_dependencies
(
check codegen_test_
${
BASE_NAME
}
)
target_include_directories
(
test_host_
${
BASE_NAME
}
PUBLIC
include
())
add_test
(
NAME codegen_test_
${
BASE_NAME
}
COMMAND codegen_test_
${
BASE_NAME
}
)
target_include_directories
(
test_host_
${
BASE_NAME
}
PUBLIC
${
CK_ROOT
}
/include
)
message
(
"adding test codegen_test_
${
BASE_NAME
}
"
)
target_include_directories
(
test_host_
${
BASE_NAME
}
PUBLIC
${
CK_ROOT
}
/library/include
)
target_link_libraries
(
codegen_test_
${
BASE_NAME
}
ck_rtc ck_host
)
endforeach
()
target_include_directories
(
codegen_test_
${
BASE_NAME
}
PUBLIC
${
CK_ROOT
}
/codegen/test/include
)
target_include_directories
(
codegen_test_
${
BASE_NAME
}
PUBLIC
${
CK_ROOT
}
/include
)
target_include_directories
(
codegen_test_
${
BASE_NAME
}
PUBLIC
${
CK_ROOT
}
/library/include
)
endforeach
()
endif
()
codegen/test/rtc/CMakeLists.txt
View file @
ea5be216
find_package
(
hip
)
file
(
GLOB RTC_SOURCES CONFIGURE_DEPENDS src/*.cpp
)
file
(
GLOB RTC_SOURCES CONFIGURE_DEPENDS src/*.cpp
)
add_library
(
ck_rtc
${
RTC_SOURCES
}
)
add_library
(
ck_rtc
${
RTC_SOURCES
}
)
target_include_directories
(
ck_rtc PUBLIC include
)
target_include_directories
(
ck_rtc PUBLIC include
)
...
...
docs/sphinx/requirements.in
View file @
ea5be216
rocm-docs-core==1.
6
.2
rocm-docs-core==1.
7
.2
sphinxcontrib-bibtex==2.6.2
sphinxcontrib-bibtex==2.6.2
docs/sphinx/requirements.txt
View file @
ea5be216
...
@@ -103,7 +103,7 @@ requests==2.32.3
...
@@ -103,7 +103,7 @@ requests==2.32.3
# via
# via
# pygithub
# pygithub
# sphinx
# sphinx
rocm-docs-core==1.
6
.2
rocm-docs-core==1.
7
.2
# via -r requirements.in
# via -r requirements.in
six==1.16.0
six==1.16.0
# via pybtex
# via pybtex
...
...
example/01_gemm/gemm_xdl_fp8.cpp
View file @
ea5be216
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "common.hpp"
...
@@ -7,7 +7,7 @@
...
@@ -7,7 +7,7 @@
using
ADataType
=
ck
::
f8_t
;
using
ADataType
=
ck
::
f8_t
;
using
BDataType
=
ck
::
f8_t
;
using
BDataType
=
ck
::
f8_t
;
using
CDataType
=
ck
::
hal
f_t
;
using
CDataType
=
ck
::
f
8
_t
;
using
AccDataType
=
float
;
using
AccDataType
=
float
;
using
CShuffleDataType
=
float
;
using
CShuffleDataType
=
float
;
...
...
example/01_gemm/run_gemm_example.inc
View file @
ea5be216
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
...
@@ -34,11 +34,11 @@ inline __host__ __device__ constexpr double get_rtol()
...
@@ -34,11 +34,11 @@ inline __host__ __device__ constexpr double get_rtol()
}
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
f8_t
>
)
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
f8_t
>
)
{
{
return
1
e
-
1
;
// 240 and 224 are acceptable
return
2
e
-
1
;
}
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bf8_t
>
)
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bf8_t
>
)
{
{
return
1.5
e-1
;
// 57344 and 49152 are acceptable
return
2
e
-
1
;
}
}
else
else
{
{
...
@@ -75,11 +75,11 @@ inline __host__ __device__ constexpr double get_atol()
...
@@ -75,11 +75,11 @@ inline __host__ __device__ constexpr double get_atol()
}
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
f8_t
>
)
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
f8_t
>
)
{
{
return
16.1
;
// 240 and 224 are acceptable
return
2
e
-
1
;
}
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bf8_t
>
)
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bf8_t
>
)
{
{
return
8192.1
;
// 57344 and 49152 are acceptable
return
2
e
-
1
;
}
}
else
else
{
{
...
...
example/12_reduce/reduce_blockwise.cpp
View file @
ea5be216
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <iostream>
#include <initializer_list>
#include <initializer_list>
...
@@ -255,34 +255,61 @@ int main(int argc, char* argv[])
...
@@ -255,34 +255,61 @@ int main(int argc, char* argv[])
else
else
{
{
// for testing half_t
// for testing half_t
pass
=
pass
&&
reduce_blockwise_test
<
ck
::
half_t
,
float
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
true
,
2
,
true
,
{
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
},
{
0
,
1
,
2
},
1.0
f
,
0.0
f
);
pass
=
pass
=
pass
&&
reduce_blockwise_test
<
ck
::
half_t
,
float
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
pass
&&
reduce_blockwise_test
<
ck
::
half_t
,
float
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
true
,
2
,
true
,
{
16
,
64
,
32
,
960
},
{
0
,
1
,
2
},
1.0
f
,
0.0
f
);
true
,
2
,
true
,
{
16
,
64
,
32
,
960
},
{
0
,
1
,
2
},
1.0
f
,
0.0
f
);
// for testing float
// for testing float
pass
=
pass
&&
reduce_blockwise_test
<
float
,
float
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
true
,
2
,
true
,
{
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
},
{
0
,
1
,
2
},
1.0
f
,
0.0
f
);
pass
=
pass
&&
reduce_blockwise_test
<
float
,
float
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
pass
=
pass
&&
reduce_blockwise_test
<
float
,
float
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
true
,
2
,
true
,
{
16
,
64
,
32
,
960
},
{
0
,
1
,
2
},
1.0
f
,
0.0
f
);
true
,
2
,
true
,
{
16
,
64
,
32
,
960
},
{
0
,
1
,
2
},
1.0
f
,
0.0
f
);
// for testing double
// for testing double
pass
=
pass
&&
reduce_blockwise_test
<
float
,
float
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
true
,
2
,
true
,
{
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
},
{
0
,
1
,
2
},
1.0
f
,
0.0
f
);
pass
=
pass
&&
reduce_blockwise_test
<
float
,
float
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
pass
=
pass
&&
reduce_blockwise_test
<
float
,
float
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
true
,
2
,
true
,
{
16
,
64
,
32
,
960
},
{
0
,
1
,
2
},
1.0
f
,
0.0
f
);
true
,
2
,
true
,
{
16
,
64
,
32
,
960
},
{
0
,
1
,
2
},
1.0
f
,
0.0
f
);
// for testing bhalf_t
// for testing bhalf_t
pass
=
pass
&&
reduce_blockwise_test
<
ck
::
bhalf_t
,
float
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
true
,
2
,
true
,
{
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
},
{
0
,
1
,
2
},
1.0
f
,
0.0
f
);
pass
=
pass
&&
pass
=
pass
&&
reduce_blockwise_test
<
ck
::
bhalf_t
,
float
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
reduce_blockwise_test
<
ck
::
bhalf_t
,
float
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
true
,
2
,
true
,
{
16
,
64
,
32
,
960
},
{
0
,
1
,
2
},
1.0
f
,
0.0
f
);
true
,
2
,
true
,
{
16
,
64
,
32
,
960
},
{
0
,
1
,
2
},
1.0
f
,
0.0
f
);
// for testing int8_t
// for testing int8_t
pass
=
pass
&&
reduce_blockwise_test
<
int8_t
,
int32_t
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
true
,
2
,
true
,
{
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
},
{
0
,
1
,
2
},
1.0
f
,
0.0
f
);
pass
=
pass
=
pass
&&
reduce_blockwise_test
<
int8_t
,
int32_t
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
pass
&&
reduce_blockwise_test
<
int8_t
,
int32_t
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
true
,
2
,
true
,
{
16
,
64
,
32
,
960
},
{
0
,
1
,
2
},
1.0
f
,
0.0
f
);
true
,
2
,
true
,
{
16
,
64
,
32
,
960
},
{
0
,
1
,
2
},
1.0
f
,
0.0
f
);
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
// for testing int4_t using AVG operation
// for testing int4_t using AVG operation
pass
=
pass
&&
reduce_blockwise_test
<
int4_t
,
int32_t
,
ReduceTensorOp
::
AVG
,
false
,
false
>
(
true
,
2
,
true
,
{
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
},
{
0
,
1
,
2
},
1.0
f
,
0.0
f
);
pass
=
pass
&&
reduce_blockwise_test
<
int4_t
,
int32_t
,
ReduceTensorOp
::
AVG
,
false
,
false
>
(
pass
=
pass
&&
reduce_blockwise_test
<
int4_t
,
int32_t
,
ReduceTensorOp
::
AVG
,
false
,
false
>
(
true
,
2
,
true
,
{
16
,
64
,
32
,
960
},
{
0
,
1
,
2
},
1.0
f
,
0.0
f
);
true
,
2
,
true
,
{
16
,
64
,
32
,
960
},
{
0
,
1
,
2
},
1.0
f
,
0.0
f
);
// for testing int4_t using MAX operation
// for testing int4_t using MAX operation
pass
=
pass
&&
reduce_blockwise_test
<
int4_t
,
int8_t
,
ReduceTensorOp
::
MAX
,
false
,
false
>
(
true
,
2
,
true
,
{
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
},
{
0
,
1
,
2
},
1.0
f
,
0.0
f
);
pass
=
pass
&&
reduce_blockwise_test
<
int4_t
,
int8_t
,
ReduceTensorOp
::
MAX
,
false
,
false
>
(
pass
=
pass
&&
reduce_blockwise_test
<
int4_t
,
int8_t
,
ReduceTensorOp
::
MAX
,
false
,
false
>
(
true
,
2
,
true
,
{
16
,
64
,
32
,
960
},
{
0
,
1
,
2
},
1.0
f
,
0.0
f
);
true
,
2
,
true
,
{
16
,
64
,
32
,
960
},
{
0
,
1
,
2
},
1.0
f
,
0.0
f
);
#endif
#endif
...
...
Prev
1
2
3
4
5
…
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment