gaoqiong / composable_kernel_ROCM / Commits

Commit 7f65ac05, authored Apr 04, 2024 by Jun Liu

    Merge branch 'develop' into amd-develop

Parents: 687d2b7e, 7e5c81fe
Changes: 234

Showing 20 changed files with 718 additions and 555 deletions (+718, -555)
example/26_contraction/CMakeLists.txt                                 +16  -16
example/29_batched_gemm_bias_e_permute/CMakeLists.txt                  +1   -4
example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt                 +17  -34
example/31_batched_gemm_gemm/CMakeLists.txt                            +6  -14
example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt              +6   -8
example/35_splitK_gemm/CMakeLists.txt                                 +17  -26
example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt             +7  -24
example/40_conv2d_fwd_quantization/CMakeLists.txt                     +16  -23
example/41_grouped_conv_conv_fwd/CMakeLists.txt                        +6  -14
example/44_elementwise_permute/CMakeLists.txt                          +2   -0
example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp        +140   -0
example/44_elementwise_permute/elementwise_permute.cpp                +25  -42
example/44_elementwise_permute/elementwise_permute_3d.cpp             +23  -28
example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp        +40  -39
example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp     +23  -33
example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp    +56  -69
example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp    +53  -57
example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp    +56  -67
example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp    +52  -57
example/44_elementwise_permute/elementwise_trinary_4D_fp16.cpp       +156   -0
example/26_contraction/CMakeLists.txt

@@ -4,49 +4,49 @@ add_custom_target(example_contraction_bilinear)
 # FP32
 add_example_executable(example_contraction_bilinear_xdl_fp32 contraction_bilinear_xdl_fp32.cpp)
-add_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp32)
+add_example_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp32)
 add_example_executable(example_contraction_scale_xdl_fp32 contraction_scale_xdl_fp32.cpp)
-add_dependencies(example_contraction_scale example_contraction_scale_xdl_fp32)
+add_example_dependencies(example_contraction_scale example_contraction_scale_xdl_fp32)
 add_example_executable(example_contraction_bilinear_xdl_fp32_compute_bf16 contraction_bilinear_xdl_fp32_compute_bf16.cpp)
-add_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp32_compute_bf16)
+add_example_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp32_compute_bf16)
 add_example_executable(example_contraction_scale_xdl_fp32_compute_bf16 contraction_scale_xdl_fp32_compute_bf16.cpp)
-add_dependencies(example_contraction_scale example_contraction_scale_xdl_fp32_compute_bf16)
+add_example_dependencies(example_contraction_scale example_contraction_scale_xdl_fp32_compute_bf16)
 add_example_executable(example_contraction_bilinear_xdl_fp32_compute_fp16 contraction_bilinear_xdl_fp32_compute_fp16.cpp)
-add_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp32_compute_fp16)
+add_example_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp32_compute_fp16)
 add_example_executable(example_contraction_scale_xdl_fp32_compute_fp16 contraction_scale_xdl_fp32_compute_fp16.cpp)
-add_dependencies(example_contraction_scale example_contraction_scale_xdl_fp32_compute_fp16)
+add_example_dependencies(example_contraction_scale example_contraction_scale_xdl_fp32_compute_fp16)
 # FP64
 add_example_executable(example_contraction_bilinear_xdl_fp64 contraction_bilinear_xdl_fp64.cpp)
-add_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp64)
+add_example_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp64)
 add_example_executable(example_contraction_scale_xdl_fp64 contraction_scale_xdl_fp64.cpp)
-add_dependencies(example_contraction_scale example_contraction_scale_xdl_fp64)
+add_example_dependencies(example_contraction_scale example_contraction_scale_xdl_fp64)
 add_example_executable(example_contraction_bilinear_xdl_fp64_compute_fp32 contraction_bilinear_xdl_fp64_compute_fp32.cpp)
-add_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp64_compute_fp32)
+add_example_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp64_compute_fp32)
 add_example_executable(example_contraction_scale_xdl_fp64_compute_fp32 contraction_scale_xdl_fp64_compute_fp32.cpp)
-add_dependencies(example_contraction_scale example_contraction_scale_xdl_fp64_compute_fp32)
+add_example_dependencies(example_contraction_scale example_contraction_scale_xdl_fp64_compute_fp32)
 # FP16
 add_example_executable(example_contraction_bilinear_xdl_fp16_compute_fp32 contraction_bilinear_xdl_fp16_compute_fp32.cpp)
-add_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp16_compute_fp32)
+add_example_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp16_compute_fp32)
 add_example_executable(example_contraction_scale_xdl_fp16_compute_fp32 contraction_scale_xdl_fp16_compute_fp32.cpp)
-add_dependencies(example_contraction_scale example_contraction_scale_xdl_fp16_compute_fp32)
+add_example_dependencies(example_contraction_scale example_contraction_scale_xdl_fp16_compute_fp32)
 # BF16
 add_example_executable(example_contraction_bilinear_xdl_bf16_compute_fp32 contraction_bilinear_xdl_bf16_compute_fp32.cpp)
-add_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_bf16_compute_fp32)
+add_example_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_bf16_compute_fp32)
 add_example_executable(example_contraction_scale_xdl_bf16_compute_fp32 contraction_scale_xdl_bf16_compute_fp32.cpp)
-add_dependencies(example_contraction_scale example_contraction_scale_xdl_bf16_compute_fp32)
+add_example_dependencies(example_contraction_scale example_contraction_scale_xdl_bf16_compute_fp32)
-add_dependencies(example_contraction example_contraction_scale)
-add_dependencies(example_contraction example_contraction_bilinear)
+add_example_dependencies(example_contraction example_contraction_scale)
+add_example_dependencies(example_contraction example_contraction_bilinear)
example/29_batched_gemm_bias_e_permute/CMakeLists.txt

 add_example_executable(example_batched_gemm_bias_e_permute_xdl_fp16 batched_gemm_bias_e_permute_xdl_fp16.cpp)
-if(GPU_TARGETS MATCHES "gfx11")
-    add_example_executable(example_batched_gemm_bias_e_permute_wmma_fp16 batched_gemm_bias_e_permute_wmma_fp16.cpp)
-endif()
+add_example_executable(example_batched_gemm_bias_e_permute_wmma_fp16 batched_gemm_bias_e_permute_wmma_fp16.cpp)
example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt

-list(APPEND gpu_list1 gfx908 gfx90a gfx940 gfx941 gfx942)
-list(APPEND gpu_list2 gfx1100 gfx1101 gfx1102)
+add_custom_target(example_grouped_conv_fwd_multiple_d)
+add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp)
+add_example_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp16)
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
-    if(gpu IN_LIST gpu_list1 AND target EQUAL 0)
-        add_custom_target(example_grouped_conv_fwd_multiple_d)
+add_example_executable(example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp)
+add_example_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_xdl_fp16)
-        add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp)
-        add_example_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp16)
+add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp32 grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp)
+add_example_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp32)
-        add_example_executable(example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp)
-        add_example_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_xdl_fp16)
+add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_bf16 grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp)
+add_example_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_bf16)
-        add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp32 grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp)
-        add_example_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp32)
+add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int8 grouped_conv_fwd_bias_relu_add_xdl_int8.cpp)
+add_example_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int8)
-        add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_bf16 grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp)
-        add_example_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_bf16)
-        if(USE_BITINT_EXTENSION_INT4)
-            add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int4 grouped_conv_fwd_bias_relu_add_xdl_int4.cpp)
-            add_example_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4)
-        endif() # USE_BITINT_EXTENSION_INT4
-        add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int8 grouped_conv_fwd_bias_relu_add_xdl_int8.cpp)
-        add_example_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int8)
+if(USE_BITINT_EXTENSION_INT4)
+    add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int4 grouped_conv_fwd_bias_relu_add_xdl_int4.cpp)
+    add_example_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4)
+endif() # USE_BITINT_EXTENSION_INT4
-        set(target 1)
-    endif()
-endforeach()
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
-    if(gpu IN_LIST gpu_list2 AND target EQUAL 0)
-        add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp)
-        add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_int8 grouped_conv_fwd_bias_relu_add_wmma_int8.cpp)
-        set(target 1)
-    endif()
-endforeach()
+add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp)
+add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_int8 grouped_conv_fwd_bias_relu_add_wmma_int8.cpp)
example/31_batched_gemm_gemm/CMakeLists.txt

-list(APPEND gpu_list1 gfx908 gfx90a gfx940 gfx941 gfx942)
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
-    if(gpu IN_LIST gpu_list1 AND target EQUAL 0)
-        add_example_executable(example_batched_gemm_gemm_xdl_fp32 batched_gemm_gemm_xdl_fp32.cpp)
-        add_example_executable(example_batched_gemm_gemm_xdl_fp16 batched_gemm_gemm_xdl_fp16.cpp)
-        add_example_executable(example_batched_gemm_gemm_xdl_bf16 batched_gemm_gemm_xdl_bf16.cpp)
-        if(USE_BITINT_EXTENSION_INT4)
-            add_example_executable(example_batched_gemm_gemm_xdl_int4 batched_gemm_gemm_xdl_int4.cpp)
-        endif(USE_BITINT_EXTENSION_INT4)
-        set(target 1)
-    endif()
-endforeach()
+add_example_executable(example_batched_gemm_gemm_xdl_fp32 batched_gemm_gemm_xdl_fp32.cpp)
+add_example_executable(example_batched_gemm_gemm_xdl_fp16 batched_gemm_gemm_xdl_fp16.cpp)
+add_example_executable(example_batched_gemm_gemm_xdl_bf16 batched_gemm_gemm_xdl_bf16.cpp)
+if(USE_BITINT_EXTENSION_INT4)
+    add_example_executable(example_batched_gemm_gemm_xdl_int4 batched_gemm_gemm_xdl_int4.cpp)
+endif(USE_BITINT_EXTENSION_INT4)
 if(NOT GPU_TARGETS MATCHES "gfx94" AND NOT GPU_TARGETS MATCHES "gfx1")
     add_example_executable(example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp)
...
example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt

-if(GPU_TARGETS MATCHES "gfx11")
-    add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp)
-    add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp)
-    add_example_executable(example_self_attention_forward_wmma_fp16 self_attention_forward_wmma_fp16.cpp)
-    add_example_executable(example_cross_attention_forward_wmma_fp16 cross_attention_forward_wmma_fp16.cpp)
-    add_example_executable(example_multi_query_attention_forward_wmma_fp16 multi_query_attention_forward_wmma_fp16.cpp)
-    add_example_executable(example_grouped_query_attention_forward_wmma_fp16 grouped_query_attention_forward_wmma_fp16.cpp)
-endif()
+add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp)
+add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp)
+add_example_executable(example_self_attention_forward_wmma_fp16 self_attention_forward_wmma_fp16.cpp)
+add_example_executable(example_cross_attention_forward_wmma_fp16 cross_attention_forward_wmma_fp16.cpp)
+add_example_executable(example_multi_query_attention_forward_wmma_fp16 multi_query_attention_forward_wmma_fp16.cpp)
+add_example_executable(example_grouped_query_attention_forward_wmma_fp16 grouped_query_attention_forward_wmma_fp16.cpp)
 add_custom_target(example_gemm_scale_softmax_gemm)
...
example/35_splitK_gemm/CMakeLists.txt

-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
-    if(gpu IN_LIST gpu_list AND target EQUAL 0)
-        add_custom_target(example_splitK_gemm_xdl)
+add_custom_target(example_splitK_gemm_xdl)
-        add_example_executable(example_splitK_gemm_xdl_fp32 splitK_gemm_xdl_fp32.cpp)
-        add_example_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_fp32)
+add_example_executable(example_splitK_gemm_xdl_fp32 splitK_gemm_xdl_fp32.cpp)
+add_example_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_fp32)
-        add_example_executable(example_splitK_gemm_xdl_fp16 splitK_gemm_xdl_fp16.cpp)
-        add_example_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_fp16)
+add_example_executable(example_splitK_gemm_xdl_fp16 splitK_gemm_xdl_fp16.cpp)
+add_example_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_fp16)
-        add_example_executable(example_splitK_gemm_xdl_fp16_fp8 splitK_gemm_xdl_fp16_fp8.cpp)
-        add_example_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_fp16_fp8)
+add_example_executable(example_splitK_gemm_xdl_fp16_fp8 splitK_gemm_xdl_fp16_fp8.cpp)
+add_example_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_fp16_fp8)
-        add_example_executable(example_splitK_gemm_xdl_lds_direct_load_fp16 splitK_gemm_xdl_lds_direct_load_fp16.cpp)
-        add_example_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_lds_direct_load_fp16)
+add_example_executable(example_splitK_gemm_xdl_lds_direct_load_fp16 splitK_gemm_xdl_lds_direct_load_fp16.cpp)
+add_example_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_lds_direct_load_fp16)
-        add_example_executable(example_splitK_gemm_xdl_bf16 splitK_gemm_xdl_bf16.cpp)
-        add_example_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_bf16)
+add_example_executable(example_splitK_gemm_xdl_bf16 splitK_gemm_xdl_bf16.cpp)
+add_example_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_bf16)
-        add_example_executable(example_splitK_gemm_xdl_int8 splitK_gemm_xdl_int8.cpp)
-        add_example_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_int8)
+add_example_executable(example_splitK_gemm_xdl_int8 splitK_gemm_xdl_int8.cpp)
+add_example_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_int8)
-        if(USE_BITINT_EXTENSION_INT4)
-            add_example_executable(example_splitK_gemm_xdl_int4 splitK_gemm_xdl_int4.cpp)
-            add_example_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_int4)
-        endif()
-        set(target 1)
-    endif()
-endforeach()
+if(USE_BITINT_EXTENSION_INT4)
+    add_example_executable(example_splitK_gemm_xdl_int4 splitK_gemm_xdl_int4.cpp)
+    add_example_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_int4)
+endif()
example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt

-list(APPEND gpu_list_xdl gfx908 gfx90a gfx940 gfx941 gfx942)
-list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102)
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
-    if(gpu IN_LIST gpu_list_xdl AND target EQUAL 0)
-        add_custom_target(example_grouped_conv_bwd_data)
+add_custom_target(example_grouped_conv_bwd_data)
-        add_example_executable(example_grouped_conv_bwd_data_xdl_fp16 grouped_conv_bwd_data_xdl_fp16.cpp)
-        add_example_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_xdl_fp16)
+add_example_executable(example_grouped_conv_bwd_data_xdl_fp16 grouped_conv_bwd_data_xdl_fp16.cpp)
+add_example_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_xdl_fp16)
-        add_example_executable(example_grouped_conv_bwd_data_bias_relu_xdl_fp16 grouped_conv_bwd_data_bias_relu_xdl_fp16.cpp)
-        add_example_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_bias_relu_xdl_fp16)
+add_example_executable(example_grouped_conv_bwd_data_bias_relu_xdl_fp16 grouped_conv_bwd_data_bias_relu_xdl_fp16.cpp)
+add_example_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_bias_relu_xdl_fp16)
-        set(target 1)
-    endif()
-endforeach()
-foreach(gpu IN LISTS GPU_TARGETS)
-    if(gpu IN_LIST gpu_list_wmma AND target EQUAL 0)
-        add_custom_target(example_grouped_conv_bwd_data)
-        add_example_executable(example_grouped_conv_bwd_data_wmma_fp16 grouped_conv_bwd_data_wmma_fp16.cpp)
-        add_example_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_wmma_fp16)
-        set(target 1)
-    endif()
-endforeach()
+add_example_executable(example_grouped_conv_bwd_data_wmma_fp16 grouped_conv_bwd_data_wmma_fp16.cpp)
+add_example_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_wmma_fp16)
example/40_conv2d_fwd_quantization/CMakeLists.txt

-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
-    if(gpu IN_LIST gpu_list AND target EQUAL 0)
-        add_example_executable(example_conv2d_fwd_xdl_perlayer_quantization_int8 conv2d_fwd_xdl_perlayer_quantization_int8.cpp)
-        add_example_executable(example_conv2d_fwd_xdl_perchannel_quantization_int8 conv2d_fwd_xdl_perchannel_quantization_int8.cpp)
-        add_example_executable(example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8 conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp)
-        add_example_executable(example_conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8 conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp)
-        set(target 1)
-    endif()
-endforeach()
+add_example_executable(example_conv2d_fwd_xdl_perlayer_quantization_int8 conv2d_fwd_xdl_perlayer_quantization_int8.cpp)
+add_example_executable(example_conv2d_fwd_xdl_perchannel_quantization_int8 conv2d_fwd_xdl_perchannel_quantization_int8.cpp)
+add_example_executable(example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8 conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp)
+add_example_executable(example_conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8 conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp)
-# Conv perlayer quantization
-add_example_executable(example_conv2d_fwd_dl_perlayer_quantization_int8 conv2d_fwd_dl_perlayer_quantization_int8.cpp)
-# Conv perchannel quantization
-add_example_executable(example_conv2d_fwd_dl_perchannel_quantization_int8 conv2d_fwd_dl_perchannel_quantization_int8.cpp)
-# Conv + bias + relu perlayer quantization
-add_example_executable(example_conv2d_fwd_dl_bias_relu_perlayer_quantization_int8 conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp)
-# Conv + bias + relu perchannel quantization
-add_example_executable(example_conv2d_fwd_dl_bias_relu_perchannel_quantization_int8 conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp)
-# Conv + bias + tanh perlayer quantization
-add_example_executable(example_conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8 conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp)
-# Conv + bias + tanh perchannel quantization
-add_example_executable(example_conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8 conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp)
+# Conv perlayer quantization
+add_example_executable(example_conv2d_fwd_dl_perlayer_quantization_int8 conv2d_fwd_dl_perlayer_quantization_int8.cpp)
+# Conv perchannel quantization
+add_example_executable(example_conv2d_fwd_dl_perchannel_quantization_int8 conv2d_fwd_dl_perchannel_quantization_int8.cpp)
+# Conv + bias + relu perlayer quantization
+add_example_executable(example_conv2d_fwd_dl_bias_relu_perlayer_quantization_int8 conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp)
+# Conv + bias + relu perchannel quantization
+add_example_executable(example_conv2d_fwd_dl_bias_relu_perchannel_quantization_int8 conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp)
+# Conv + bias + tanh perlayer quantization
+add_example_executable(example_conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8 conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp)
+# Conv + bias + tanh perchannel quantization
+add_example_executable(example_conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8 conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp)
example/41_grouped_conv_conv_fwd/CMakeLists.txt

-list(APPEND gpu_list1 gfx908 gfx90a gfx940 gfx941 gfx942)
-list(APPEND gpu_list2 gfx908 gfx90a)
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
-    if(gpu IN_LIST gpu_list1 AND target EQUAL 0)
-        add_example_executable(example_grouped_conv_conv_fwd_xdl_fp32 grouped_conv_conv_fwd_xdl_fp32.cpp)
-        add_example_executable(example_grouped_conv_conv_fwd_xdl_fp16 grouped_conv_conv_fwd_xdl_fp16.cpp)
-        add_example_executable(example_grouped_conv_conv_fwd_xdl_bf16 grouped_conv_conv_fwd_xdl_bf16.cpp)
-        if(USE_BITINT_EXTENSION_INT4)
-            add_example_executable(example_grouped_conv_conv_fwd_xdl_int4 grouped_conv_conv_fwd_xdl_int4.cpp)
-        endif(USE_BITINT_EXTENSION_INT4)
-        set(target 1)
-    endif()
-endforeach()
+add_example_executable(example_grouped_conv_conv_fwd_xdl_fp32 grouped_conv_conv_fwd_xdl_fp32.cpp)
+add_example_executable(example_grouped_conv_conv_fwd_xdl_fp16 grouped_conv_conv_fwd_xdl_fp16.cpp)
+add_example_executable(example_grouped_conv_conv_fwd_xdl_bf16 grouped_conv_conv_fwd_xdl_bf16.cpp)
+if(USE_BITINT_EXTENSION_INT4)
+    add_example_executable(example_grouped_conv_conv_fwd_xdl_int4 grouped_conv_conv_fwd_xdl_int4.cpp)
+endif(USE_BITINT_EXTENSION_INT4)
 if(NOT GPU_TARGETS MATCHES "gfx94" AND NOT GPU_TARGETS MATCHES "gfx1")
     add_example_executable(example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp)
...
example/44_elementwise_permute/CMakeLists.txt

...
@@ -4,6 +4,8 @@ add_example_executable(example_elementwise_permute_4D_fp32_row elementwise_permu
 add_example_executable(example_elementwise_permute_4D_fp16_row elementwise_permute_4D_fp16_row.cpp)
 add_example_executable(example_elementwise_permute_4D_fp32_col elementwise_permute_4D_fp32_col.cpp)
 add_example_executable(example_elementwise_permute_4D_fp16_col elementwise_permute_4D_fp16_col.cpp)
+add_example_executable(example_elementwise_binary_4D_fp16 elementwise_binary_4D_fp16.cpp)
+add_example_executable(example_elementwise_trinary_4D_fp16 elementwise_trinary_4D_fp16.cpp)
 add_example_executable(example_elementwise_permute elementwise_permute.cpp)
 if((NOT GPU_TARGETS MATCHES "gfx940") AND (NOT GPU_TARGETS MATCHES "gfx941") AND (NOT GPU_TARGETS MATCHES "gfx942"))
     add_example_executable(example_elementwise_permute_3d elementwise_permute_3d.cpp)
...
example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp (new file, 0 → 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>
#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"

#include "ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"

using F16 = ck::half_t;
using F32 = float;

using ADataType = F16;
using BDataType = F16;

using UnaryScale  = ck::tensor_operation::element_wise::Scale;
using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
using UnaryScaleSquare =
    ck::tensor_operation::element_wise::UnaryCombinedOp<UnarySquare, UnaryScale>;
using BinaryAdd = ck::tensor_operation::element_wise::Add;
// B = alpha * A0 * A0 + beta * A1 * A1
using BinaryAddUnaryScaleSquare =
    ck::tensor_operation::element_wise::BinaryWithUnaryCombinedOp<BinaryAdd,
                                                                  UnaryScaleSquare,
                                                                  UnaryScaleSquare>;

using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
    ck::Tuple<ADataType, ADataType>, // InDataTypeTuple
    ck::Tuple<BDataType>,            // OutDataTypeTuple
    BinaryAddUnaryScaleSquare,       // ElementwiseOp
    4,                               // NumDim
    256,                             // BlockSize
    128,                             // M0PerBlock
    128,                             // M1PerBlock
    8,                               // M0PerThread
    8,                               // M1PerThread
    ck::Sequence<1, 0>,              // ThreadClusterArrangeOrder
    ck::Sequence<8, 8>,              // InScalarPerVectorSeq
    ck::Sequence<8>>;                // OutScalarPerVectorSeq

int main()
{
    bool do_verification = true;
    bool time_kernel     = true;

    std::vector<std::size_t> nchw = {16, 128, 32, 64};

    std::array<ck::index_t, 4> ab_lengths;
    std::array<ck::index_t, 4> ab_strides = {static_cast<int>(nchw[1] * nchw[2] * nchw[3]),
                                             static_cast<int>(nchw[2] * nchw[3]),
                                             static_cast<int>(nchw[3]),
                                             1};
    ck::ranges::copy(nchw, ab_lengths.begin());

    std::array<Tensor<ADataType>, 2> as = {Tensor<ADataType>(ab_lengths, ab_strides),
                                           Tensor<ADataType>(ab_lengths, ab_strides)};
    Tensor<ADataType>& a0 = as[0];
    Tensor<ADataType>& a1 = as[1];
    Tensor<BDataType> b(ab_lengths, ab_strides);

    float alpha = 3.f;
    float beta  = 2.f;

    a0.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
    a1.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});

    DeviceMem a0_device_buf(sizeof(ADataType) * a0.mDesc.GetElementSpaceSize());
    DeviceMem a1_device_buf(sizeof(ADataType) * a1.mDesc.GetElementSpaceSize());
    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());

    a0_device_buf.ToDevice(a0.mData.data());
    a1_device_buf.ToDevice(a1.mData.data());

    std::array<const void*, 2> inputs = {a0_device_buf.GetDeviceBuffer(),
                                         a1_device_buf.GetDeviceBuffer()};
    std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};

    auto broadcastPermute  = DeviceElementwisePermuteInstance{};
    auto unary_scale_op_a0 = UnaryScaleSquare{UnarySquare{}, UnaryScale{alpha}};
    auto unary_scale_op_a1 = UnaryScaleSquare{UnarySquare{}, UnaryScale{beta}};
    auto argument          = broadcastPermute.MakeArgumentPointer(
        ab_lengths,
        {ab_strides, ab_strides},
        {ab_strides},
        inputs,
        output,
        BinaryAddUnaryScaleSquare{BinaryAdd{}, unary_scale_op_a0, unary_scale_op_a1});

    if(!broadcastPermute.IsSupportedArgument(argument.get()))
    {
        throw std::runtime_error(
            "The runtime parameters seems not supported by the device instance, exiting!");
    };

    std::cout << "A0 (nchw): " << a0.mDesc << std::endl;
    std::cout << "A1 (nchw): " << a1.mDesc << std::endl;
    std::cout << "B (nchw): " << b.mDesc << std::endl;

    auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
    float ave_time =
        broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});

    std::size_t flop      = std::size_t(5) * nchw[0] * nchw[1] * nchw[2] * nchw[3];
    std::size_t num_btype = sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) +
                            sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]);

    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_btype / 1.E6 / ave_time;

    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;

    bool pass = true;
    if(do_verification)
    {
        Tensor<BDataType> host_b(ab_lengths, ab_strides);
        using ReferenceElementwiseInstance = ck::tensor_operation::host::
            ReferenceElementwise<2, ADataType, BDataType, BinaryAddUnaryScaleSquare>;
        auto ref_elementwise = ReferenceElementwiseInstance{};
        auto ref_invoker     = ref_elementwise.MakeInvoker();

        auto ref_argument = ref_elementwise.MakeArgument(
            as,
            host_b,
            BinaryAddUnaryScaleSquare{BinaryAdd{}, unary_scale_op_a0, unary_scale_op_a1});

        ref_invoker.Run(ref_argument);

        b_device_buf.FromDevice(b.mData.data());
        pass &= ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
    }

    return pass ? 0 : 1;
}
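
For orientation, the fused elementwise op this new example instantiates, B = alpha * A0^2 + beta * A1^2, reduces to a single host-side loop. Below is a minimal standalone sketch of what BinaryWithUnaryCombinedOp<BinaryAdd, UnaryScaleSquare, UnaryScaleSquare> computes per element; the plain float buffers and the function name binary_add_scale_square are illustrative stand-ins, not part of the CK API:

#include <cstddef>
#include <vector>

// Host-side sketch of B = alpha * A0 * A0 + beta * A1 * A1 — the op the new
// example assembles from BinaryAdd and two UnaryScaleSquare functors.
std::vector<float> binary_add_scale_square(const std::vector<float>& a0,
                                           const std::vector<float>& a1,
                                           float alpha,
                                           float beta)
{
    std::vector<float> b(a0.size());
    for(std::size_t i = 0; i < a0.size(); ++i)
    {
        // Square each input, scale by alpha/beta, then add (BinaryAdd).
        b[i] = alpha * a0[i] * a0[i] + beta * a1[i] * a1[i];
    }
    return b;
}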
example/44_elementwise_permute/elementwise_permute.cpp

...
@@ -8,6 +8,8 @@
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp"
+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
...
@@ -30,20 +32,6 @@ using DeviceElementwisePermuteInstance =
                                              ck::Sequence<1>,  // InScalarPerVectorSeq
                                              ck::Sequence<1>>; // OutScalarPerVectorSeq

-template <typename HostTensorA, typename HostTensorB, typename Functor>
-void host_elementwise4D(HostTensorB& B_ndhwc, const HostTensorA& A_ncdhw, Functor functor)
-{
-    for(std::size_t n = 0; n < A_ncdhw.mDesc.GetLengths()[0]; ++n)
-        for(std::size_t c = 0; c < A_ncdhw.mDesc.GetLengths()[1]; ++c)
-            for(std::size_t d = 0; d < A_ncdhw.mDesc.GetLengths()[2]; ++d)
-                for(std::size_t h = 0; h < A_ncdhw.mDesc.GetLengths()[3]; ++h)
-                    for(std::size_t w = 0; w < A_ncdhw.mDesc.GetLengths()[4]; ++w)
-                    {
-                        auto a_val = A_ncdhw(n, c, d, h, w);
-                        functor(B_ndhwc(n, d, h, w, c), a_val);
-                    }
-}
-
 int main()
 {
     bool do_verification = true;
...
@@ -51,32 +39,7 @@ int main()
     std::vector<std::size_t> ncdhw = {16, 8, 8, 8, 8};
-    std::vector<std::size_t> ndhwc = {16, 8, 8, 8, 8};
-    Tensor<ADataType> a(ncdhw);
-    Tensor<BDataType> b(ndhwc);
-    a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
-    a_device_buf.ToDevice(a.mData.data());
-    std::array<const void*, 1> input  = {a_device_buf.GetDeviceBuffer()};
-    std::array<void*, 1> output       = {b_device_buf.GetDeviceBuffer()};
     std::array<ck::index_t, 5> ab_lengths;
     /**std::array<ck::index_t, 5> a_strides = {
        static_cast<int>(ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]),
        static_cast<int>(ncdhw[2] * ncdhw[3] * ncdhw[4]),
        static_cast<int>(ncdhw[3] * ncdhw[4]),
        static_cast<int>(ncdhw[4]),
        1};
     std::array<ck::index_t, 5> b_strides = {
        static_cast<int>(ndhwc[1] * ndhwc[2] * ndhwc[3] * ndhwc[4]),
        static_cast<int>(ndhwc[2] * ndhwc[3] * ndhwc[4]),
        1,
        static_cast<int>(ndhwc[3] * ndhwc[4]),
        static_cast<int>(ndhwc[4])};**/
     std::array<ck::index_t, 5> a_strides = {
         static_cast<int>(ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]),
...
@@ -93,6 +56,20 @@ int main()
         1};

+    ck::ranges::copy(ncdhw, ab_lengths.begin());
+
+    std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
+    Tensor<ADataType>& a                = as[0];
+    Tensor<BDataType> b(ab_lengths, b_strides);
+
+    a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a.mData.data());
+
+    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
+    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};
+
     auto broadcastPermute = DeviceElementwisePermuteInstance{};
     auto argument         = broadcastPermute.MakeArgumentPointer(
         ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
...
@@ -126,10 +103,16 @@ int main()
     if(do_verification)
     {
-        b_device_buf.FromDevice(b.mData.data());
-        Tensor<BDataType> host_b(ndhwc);
-        host_elementwise4D(host_b, a, PassThrough{});
+        Tensor<BDataType> host_b(ab_lengths, b_strides);
+        using ReferenceElementwiseInstance = ck::tensor_operation::host::
+            ReferenceElementwise<1, ADataType, BDataType, PassThrough>;
+        auto ref_elementwise = ReferenceElementwiseInstance{};
+        auto ref_invoker     = ref_elementwise.MakeInvoker();
+
+        auto ref_argument = ref_elementwise.MakeArgument(as, host_b, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        b_device_buf.FromDevice(b.mData.data());
         pass &= ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
     }
...
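
A pattern repeated across the remaining .cpp diffs: each file's hand-written host_elementwise4D loop is deleted, and verification instead goes through the library's ReferenceElementwise, which walks the shared ab_lengths using the input and output strides, so a single reference covers every permutation. A rough sketch of that stride-walking idea in plain C++ follows (reference_elementwise_4d is a hypothetical name, not the CK implementation):

#include <array>
#include <cstddef>
#include <vector>

// Sketch: apply `op` elementwise over a 4-D index space, reading and writing
// through independent strides — the idea behind a strided reference op.
// `lengths`, `in_strides`, `out_strides` mirror ab_lengths / a_strides / b_strides.
template <typename T, typename Op>
void reference_elementwise_4d(const std::vector<T>& in,
                              std::vector<T>& out,
                              const std::array<std::size_t, 4>& lengths,
                              const std::array<std::size_t, 4>& in_strides,
                              const std::array<std::size_t, 4>& out_strides,
                              Op op)
{
    for(std::size_t i0 = 0; i0 < lengths[0]; ++i0)
        for(std::size_t i1 = 0; i1 < lengths[1]; ++i1)
            for(std::size_t i2 = 0; i2 < lengths[2]; ++i2)
                for(std::size_t i3 = 0; i3 < lengths[3]; ++i3)
                {
                    const std::size_t src = i0 * in_strides[0] + i1 * in_strides[1] +
                                            i2 * in_strides[2] + i3 * in_strides[3];
                    const std::size_t dst = i0 * out_strides[0] + i1 * out_strides[1] +
                                            i2 * out_strides[2] + i3 * out_strides[3];
                    op(out[dst], in[src]); // e.g. PassThrough: out = in
                }
}

With in_strides describing an NCHW layout and out_strides describing the same index space laid out NHWC, the loop performs exactly the permutation each example checks.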
example/44_elementwise_permute/elementwise_permute_3d.cpp

...
@@ -8,6 +8,8 @@
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp"
+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
...
@@ -34,20 +36,6 @@ using DeviceElementwisePermuteInstance =
                                              ck::Sequence<4>,  // InScalarPerVectorSeq
                                              ck::Sequence<4>>; // OutScalarPerVectorSeq

-template <typename HostTensorA, typename HostTensorB, typename Functor>
-void host_elementwise4D(HostTensorB& B_ndhwc, const HostTensorA& A_ncdhw, Functor functor)
-{
-    for(std::size_t n = 0; n < A_ncdhw.mDesc.GetLengths()[0]; ++n)
-        for(std::size_t c = 0; c < A_ncdhw.mDesc.GetLengths()[1]; ++c)
-            for(std::size_t d = 0; d < A_ncdhw.mDesc.GetLengths()[2]; ++d)
-                for(std::size_t h = 0; h < A_ncdhw.mDesc.GetLengths()[3]; ++h)
-                    for(std::size_t w = 0; w < A_ncdhw.mDesc.GetLengths()[4]; ++w)
-                    {
-                        auto a_val = A_ncdhw(n, c, d, h, w);
-                        functor(B_ndhwc(n, d, h, w, c), a_val);
-                    }
-}
-
 int main()
 {
     bool do_verification = true;
...
@@ -59,10 +47,13 @@ int main()
     const int W = 5;
     const int D = 16;

     std::vector<std::size_t> ncdhw = {N, C, D, H, W};
-    std::vector<std::size_t> ndhwc = {N, D, H, W, C};
-    Tensor<ADataType> a(ncdhw);
-    Tensor<BDataType> b(ndhwc);
+    std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D};
+    std::array<ck::index_t, 5> a_strides = {C * D * H * W, H * W, W, 1, D * H * W}; // N, C, D, H, W
+    std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1}; // N, D, H, W, C
+
+    std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
+    Tensor<ADataType>& a                = as[0];
+    Tensor<BDataType> b(ab_lengths, b_strides);
     a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
...
@@ -74,10 +65,6 @@ int main()
     std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
     std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};

-    std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D};
-    std::array<ck::index_t, 5> a_strides = {C * D * H * W, H * W, W, 1, D * H * W}; // N, C, D, H, W
-    std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1}; // N, D, H, W, C
-
     auto broadcastPermute = DeviceElementwisePermuteInstance{};
     auto argument         = broadcastPermute.MakeArgumentPointer(
         ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
...
@@ -94,11 +81,12 @@ int main()
     auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
     float ave_time =
         broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});

-    std::size_t flop = std::size_t(2) * ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4];
+    std::size_t flop = std::size_t(2) * ab_lengths[0] * ab_lengths[1] * ab_lengths[2] *
+                       ab_lengths[3] * ab_lengths[4];
-    std::size_t num_btype =
-        sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) +
-        sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]);
+    std::size_t num_btype =
+        (sizeof(ADataType) + sizeof(BDataType)) *
+        (ab_lengths[0] * ab_lengths[1] * ab_lengths[2] * ab_lengths[3] * ab_lengths[4]);
     float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
...
@@ -111,10 +99,17 @@ int main()
     if(do_verification)
     {
-        b_device_buf.FromDevice(b.mData.data());
-        Tensor<BDataType> host_b(ndhwc);
-        host_elementwise4D(host_b, a, PassThrough{});
+        Tensor<BDataType> host_b(ab_lengths, b_strides);
+        using ReferenceElementwiseInstance = ck::tensor_operation::host::
+            ReferenceElementwise<1, ADataType, BDataType, PassThrough>;
+        auto ref_elementwise = ReferenceElementwiseInstance{};
+        auto ref_invoker     = ref_elementwise.MakeInvoker();
+
+        auto ref_argument = ref_elementwise.MakeArgument(as, host_b, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        b_device_buf.FromDevice(b.mData.data());
         pass &= ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
     }
...
example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp

...
@@ -6,7 +6,9 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp"
+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
...
@@ -20,28 +22,20 @@ using F32 = float;
 using ADataType = F16;
 using BDataType = F16;

-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-using DeviceElementwisePermuteInstance =
-    ck::tensor_operation::device::DeviceElementwiseImpl<
-        ck::Tuple<ADataType>, // InDataTypeTuple
-        ck::Tuple<BDataType>, // OutDataTypeTuple
-        PassThrough,          // Elementwise op
-        4,                    // NumDim
-        8,                    // MPerThread
-        ck::Sequence<8>,      // InScalarPerVectorSeq
-        ck::Sequence<1>>;     // OutScalarPerVectorSeq
-
-template <typename HostTensorA, typename HostTensorB, typename Functor>
-void host_elementwise4D(HostTensorB& B_nhwc, const HostTensorA& A_nchw, Functor functor)
-{
-    for(std::size_t n = 0; n < A_nchw.mDesc.GetLengths()[0]; ++n)
-        for(std::size_t c = 0; c < A_nchw.mDesc.GetLengths()[1]; ++c)
-            for(std::size_t h = 0; h < A_nchw.mDesc.GetLengths()[2]; ++h)
-                for(std::size_t w = 0; w < A_nchw.mDesc.GetLengths()[3]; ++w)
-                {
-                    auto a_val = A_nchw(n, c, h, w);
-                    functor(B_nhwc(n, h, w, c), a_val);
-                }
-}
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using DeviceElementwisePermuteInstance =
+    ck::tensor_operation::device::DeviceElementwiseImpl<
+        ck::Tuple<ADataType>, // InDataTypeTuple
+        ck::Tuple<BDataType>, // OutDataTypeTuple
+        PassThrough,          // Elementwise
+        4,                    // NumDim
+        256,                  // BlockSize
+        128,                  // M0PerBlock
+        128,                  // M1PerBlock
+        8,                    // M0PerThread
+        8,                    // M1PerThread
+        ck::Sequence<1, 0>,   // ThreadClusterArrangeOrder
+        ck::Sequence<8>,      // InScalarPerVectorSeq
+        ck::Sequence<8>>;     // OutScalarPerVectorSeq

 int main()
 {
...
@@ -50,18 +44,6 @@ int main()
     std::vector<std::size_t> nchw = {16, 128, 32, 64};
-    std::vector<std::size_t> nhwc = {16, 32, 64, 128};
-    Tensor<ADataType> a(nchw);
-    Tensor<BDataType> b(nhwc);
-    a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
-    a_device_buf.ToDevice(a.mData.data());
-    std::array<const void*, 1> input  = {a_device_buf.GetDeviceBuffer()};
-    std::array<void*, 1> output       = {b_device_buf.GetDeviceBuffer()};
     std::array<ck::index_t, 4> ab_lengths;
     std::array<ck::index_t, 4> a_strides = {static_cast<int>(nchw[1] * nchw[2] * nchw[3]),
...
@@ -72,9 +54,22 @@ int main()
         1,
         static_cast<int>(nhwc[2] * nhwc[3]),
         static_cast<int>(nhwc[3])};

+    ck::ranges::copy(nchw, ab_lengths.begin());
+
+    std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
+    Tensor<ADataType>& a                = as[0];
+    Tensor<BDataType> b(ab_lengths, b_strides);
+
+    a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a.mData.data());
+
+    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
+    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};
+
     auto broadcastPermute = DeviceElementwisePermuteInstance{};
     auto argument         = broadcastPermute.MakeArgumentPointer(
         ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
...
@@ -106,10 +101,16 @@ int main()
     if(do_verification)
     {
-        b_device_buf.FromDevice(b.mData.data());
-        Tensor<BDataType> host_b(nhwc);
-        host_elementwise4D(host_b, a, PassThrough{});
+        Tensor<BDataType> host_b(ab_lengths, b_strides);
+        using ReferenceElementwiseInstance = ck::tensor_operation::host::
+            ReferenceElementwise<1, ADataType, BDataType, PassThrough>;
+        auto ref_elementwise = ReferenceElementwiseInstance{};
+        auto ref_invoker     = ref_elementwise.MakeInvoker();
+
+        auto ref_argument = ref_elementwise.MakeArgument(as, host_b, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        b_device_buf.FromDevice(b.mData.data());
         pass &= ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
     }
...
example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp

...
@@ -8,6 +8,8 @@
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
...
@@ -30,22 +32,6 @@ using DeviceElementwisePermuteInstance =
                                              ck::Sequence<1>,  // InScalarPerVectorSeq
                                              ck::Sequence<1>>; // OutScalarPerVectorSeq

-template <typename HostTensorA, typename HostTensorB, typename Functor>
-void host_elementwise4D(HostTensorB& B_nhwc,
-                        const HostTensorA& A_nchw,
-                        const std::vector<std::size_t>& shape_nchw,
-                        Functor functor)
-{
-    for(std::size_t n = 0; n < shape_nchw[0]; ++n)
-        for(std::size_t c = 0; c < shape_nchw[1]; ++c)
-            for(std::size_t h = 0; h < shape_nchw[2]; ++h)
-                for(std::size_t w = 0; w < shape_nchw[3]; ++w)
-                {
-                    auto a_val = A_nchw(n, c, h, w);
-                    functor(B_nhwc(n, h, w, c), a_val);
-                }
-}
-
 int main()
 {
     bool do_verification = true;
...
@@ -54,13 +40,16 @@ int main()
     const int N = 120;
     const int C = 128;
     const int H = 32;
-    const int W = 1024;
+    const int W = 32;

     std::vector<std::size_t> nchw = {N, C, H, W};
     std::vector<std::size_t> nhwc = {N, H, W, C};
+    std::array<ck::index_t, 4> ab_lengths{N, H, W, C};
+    std::array<ck::index_t, 4> a_strides = {C * H * W, W, 1, H * W};
+    std::array<ck::index_t, 4> b_strides = {H * W * C, W * C, C, 1};

-    Tensor<ADataType> a(nchw);
-    Tensor<BDataType> b(nhwc);
+    std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
+    Tensor<ADataType>& a                = as[0];
+    Tensor<BDataType> b(ab_lengths, b_strides);
     a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
...
@@ -72,11 +61,6 @@ int main()
     std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
     std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};

-    std::array<ck::index_t, 4> ab_lengths{N, H, W, C};
-    std::array<ck::index_t, 4> a_strides = {C * H * W, W, 1, H * W};
-    std::array<ck::index_t, 4> b_strides = {H * W * C, W * C, C, 1};
-
     auto broadcastPermute = DeviceElementwisePermuteInstance{};
     auto argument         = broadcastPermute.MakeArgumentPointer(
         ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
...
@@ -94,10 +78,11 @@ int main()
     float ave_time =
         broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});

-    std::size_t flop      = std::size_t(2) * nchw[0] * nchw[1] * nchw[2] * nchw[3];
-    std::size_t num_btype = sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) +
-                            sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]);
+    std::size_t flop      = std::size_t(2) * ab_lengths[0] * ab_lengths[1] * ab_lengths[2] * ab_lengths[3];
+    std::size_t num_btype = (sizeof(ADataType) + sizeof(BDataType)) *
+                            (ab_lengths[0] * ab_lengths[1] * ab_lengths[2] * ab_lengths[3]);
     float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
...
@@ -110,11 +95,16 @@ int main()
     if(do_verification)
     {
-        b_device_buf.FromDevice(b.mData.data());
-        Tensor<BDataType> host_b(nhwc);
-        host_elementwise4D<Tensor<ADataType>, Tensor<BDataType>, PassThrough>(
-            host_b, a, nchw, PassThrough{});
+        Tensor<BDataType> host_b(ab_lengths, b_strides);
+        using ReferenceElementwiseInstance = ck::tensor_operation::host::
+            ReferenceElementwise<1, ADataType, BDataType, PassThrough>;
+        auto ref_elementwise = ReferenceElementwiseInstance{};
+        auto ref_invoker     = ref_elementwise.MakeInvoker();
+
+        auto ref_argument = ref_elementwise.MakeArgument(as, host_b, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        b_device_buf.FromDevice(b.mData.data());
         pass &= ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
     }
...
example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp

...
@@ -6,8 +6,10 @@
 #include <random>

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"
+#include "ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp"
 #include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
...
@@ -21,43 +23,23 @@ using F32 = float;
 using ADataType = F16;
 using BDataType = F16;

-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using UnaryOp     = ck::tensor_operation::element_wise::UnarySquare;
-using Scale       = ck::tensor_operation::element_wise::Scale;
-
-using DeviceElementwisePermuteInstance =
-    ck::tensor_operation::device::DeviceElementwiseImpl<
-        ck::Tuple<ADataType>, // InDataTypeTuple
-        ck::Tuple<BDataType>, // OutDataTypeTuple
-        PassThrough,          // ElementwiseOp
-        UnaryOp,              // UnaryOp
-        Scale,                // Scalar
-        4,                    // NumDim
-        8,                    // MPerThread
-        ck::Sequence<1>,      // InScalarPerVectorSeq
-        ck::Sequence<1>>;     // OutScalarPerVectorSeq
-
-template <typename HostTensorA, typename HostTensorB, typename FunctorA, typename FunctorB>
-void host_elementwise4D(HostTensorB& B_nhwc,
-                        const HostTensorA& A_nchw,
-                        FunctorA functor_a,
-                        FunctorB functor_b,
-                        float scale)
-{
-    std::size_t N = A_nchw.mDesc.GetLengths()[0];
-    std::size_t C = A_nchw.mDesc.GetLengths()[1];
-    std::size_t H = A_nchw.mDesc.GetLengths()[2];
-    std::size_t W = A_nchw.mDesc.GetLengths()[3];
-
-    for(std::size_t w = 0; w < W; ++w)
-        for(std::size_t h = 0; h < H; ++h)
-            for(std::size_t c = 0; c < C; ++c)
-                for(std::size_t n = 0; n < N; ++n)
-                {
-                    ADataType tmp_val;
-                    auto a_val = A_nchw.mData[(n) + (c * N) + (h * C * N) + (w * H * C * N)];
-                    functor_b(tmp_val, a_val);
-                    functor_a(B_nhwc.mData[(n) + (c * W * H * N) + (h * N) + (w * H * N)],
-                              scale * tmp_val);
-                }
-}
+using UnaryScale  = ck::tensor_operation::element_wise::Scale;
+using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
+using UnaryScaleSquare =
+    ck::tensor_operation::element_wise::UnaryCombinedOp<UnarySquare, UnaryScale>;
+
+using DeviceElementwisePermuteInstance =
+    ck::tensor_operation::device::DeviceElementwiseImpl<
+        ck::Tuple<ADataType>, // InDataTypeTuple
+        ck::Tuple<BDataType>, // OutDataTypeTuple
+        UnaryScaleSquare,     // UnaryScaleSquare
+        4,                    // NumDim
+        256,                  // BlockSize
+        128,                  // M0PerBlock
+        128,                  // M1PerBlock
+        8,                    // M0PerThread
+        8,                    // M1PerThread
+        ck::Sequence<1, 0>,   // ThreadClusterArrangeOrder
+        ck::Sequence<8>,      // InScalarPerVectorSeq
+        ck::Sequence<8>>;     // OutScalarPerVectorSeq

 int main()
 {
...
@@ -66,8 +48,21 @@ int main()
     std::vector<std::size_t> nchw = {16, 8, 32, 64};
     std::vector<std::size_t> nhwc = {16, 32, 64, 8};
-    Tensor<ADataType> a(nchw);
-    Tensor<BDataType> b(nhwc);
+    std::array<ck::index_t, 4> ab_lengths;
+    std::array<ck::index_t, 4> a_strides = {1,
+                                            static_cast<int>(nchw[0]),
+                                            static_cast<int>(nchw[0] * nchw[1]),
+                                            static_cast<int>(nchw[0] * nchw[1] * nchw[2])};
+    std::array<ck::index_t, 4> b_strides = {1,
+                                            static_cast<int>(nhwc[0] * nhwc[1] * nhwc[2]),
+                                            static_cast<int>(nhwc[0]),
+                                            static_cast<int>(nhwc[0] * nhwc[1])};
+    ck::ranges::copy(nchw, ab_lengths.begin());
+
+    std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
+    Tensor<ADataType>& a                = as[0];
+    Tensor<BDataType> b(ab_lengths, b_strides);

     float scale = 1.f;
     auto i      = 0;
     std::mt19937 gen(11939);
...
@@ -90,28 +85,14 @@ int main()
     std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
     std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};

-    std::array<ck::index_t, 4> ab_lengths;
-    std::array<ck::index_t, 4> a_strides = {1,
-                                            static_cast<int>(nchw[0]),
-                                            static_cast<int>(nchw[0] * nchw[1]),
-                                            static_cast<int>(nchw[0] * nchw[1] * nchw[2])};
-    std::array<ck::index_t, 4> b_strides = {1,
-                                            static_cast<int>(nhwc[0] * nhwc[1] * nhwc[2]),
-                                            static_cast<int>(nhwc[0]),
-                                            static_cast<int>(nhwc[0] * nhwc[1])};
-    ck::ranges::copy(nchw, ab_lengths.begin());
-
     auto broadcastPermute = DeviceElementwisePermuteInstance{};
-    auto argument = broadcastPermute.MakeArgumentPointer(
-        ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{}, UnaryOp{}, Scale{scale});
+    auto argument = broadcastPermute.MakeArgumentPointer(
+        ab_lengths, {a_strides}, {b_strides}, input, output,
+        UnaryScaleSquare{UnarySquare{}, UnaryScale{scale}});

     if(!broadcastPermute.IsSupportedArgument(argument.get()))
     {
...
@@ -125,11 +106,10 @@ int main()
     auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
     float ave_time =
         broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});

-    std::size_t flop      = std::size_t(2) * nchw[0] * nchw[1] * nchw[2] * nchw[3];
-    std::size_t num_btype = sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) +
-                            sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]);
+    std::size_t flop      = std::size_t(5) * nchw[0] * nchw[1] * nchw[2] * nchw[3];
+    std::size_t num_btype = (2 * sizeof(ADataType) + sizeof(BDataType)) *
+                            (nchw[0] * nchw[1] * nchw[2] * nchw[3]);
     float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
     float gb_per_sec = num_btype / 1.E6 / ave_time;
...
@@ -141,10 +121,17 @@ int main()
     if(do_verification)
     {
-        b_device_buf.FromDevice(b.mData.data());
-        Tensor<BDataType> host_b(nhwc);
-        host_elementwise4D(host_b, a, PassThrough{}, UnaryOp{}, scale);
+        Tensor<BDataType> host_b(ab_lengths, b_strides);
+        using ReferenceElementwiseInstance = ck::tensor_operation::host::
+            ReferenceElementwise<1, ADataType, BDataType, UnaryScaleSquare>;
+        auto ref_elementwise = ReferenceElementwiseInstance{};
+        auto ref_invoker     = ref_elementwise.MakeInvoker();
+
+        auto ref_argument = ref_elementwise.MakeArgument(
+            as, host_b, UnaryScaleSquare{UnarySquare{}, UnaryScale{scale}});
+
+        ref_invoker.Run(ref_argument);
+
+        b_device_buf.FromDevice(b.mData.data());
         pass &= ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
     }
...
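
The col/row variants also replace the three separate functor parameters (PassThrough, UnaryOp, Scale) with one composed UnaryCombinedOp<UnarySquare, UnaryScale>, so the device instance takes a single elementwise op. The composition is ordinary functor chaining; a minimal sketch of the pattern in plain C++ follows (the *Sketch names are illustrative, not CK's definitions):

// Sketch of composing two unary elementwise functors into one, in the spirit
// of UnaryCombinedOp<UnarySquare, UnaryScale>: apply Op1 first, then Op2.
struct SquareSketch
{
    template <typename T>
    void operator()(T& y, const T& x) const { y = x * x; }
};

struct ScaleSketch
{
    float scale_;
    template <typename T>
    void operator()(T& y, const T& x) const { y = static_cast<T>(scale_) * x; }
};

template <typename Op1, typename Op2>
struct CombinedOpSketch
{
    Op1 op1_;
    Op2 op2_;
    template <typename T>
    void operator()(T& y, const T& x) const
    {
        T tmp{};
        op1_(tmp, x); // tmp = x * x
        op2_(y, tmp); // y = scale * tmp
    }
};

// Usage: CombinedOpSketch<SquareSketch, ScaleSketch> op{SquareSketch{}, ScaleSketch{2.f}};
// float y; op(y, 3.f); // y == 18.f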
example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp
View file @
7f65ac05
...

@@ -5,8 +5,10 @@

 #include <cstdlib>

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"
+#include "ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp"

 #include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
...

@@ -20,38 +22,23 @@ using F32 = float

 using ADataType = F16;
 using BDataType = F16;

-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using UnaryOp     = ck::tensor_operation::element_wise::UnarySquare;
-using Scale       = ck::tensor_operation::element_wise::Scale;
-
-using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
-    ck::Tuple<ADataType>, // InDataTypeTuple
-    ck::Tuple<BDataType>, // OutDataTypeTuple
-    PassThrough,          // ElementwiseOp
-    UnaryOp,              // UnaryOp
-    Scale,                // Scalar
-    4,                    // NumDim
-    8,                    // MPerThread
-    ck::Sequence<8>,      // InScalarPerVectorSeq
-    ck::Sequence<1>>;     // OutScalarPerVectorSeq
-
-template <typename HostTensorA, typename HostTensorB, typename FunctorA, typename FunctorB>
-void host_elementwise4D(HostTensorB& B_nhwc,
-                        const HostTensorA& A_nchw,
-                        FunctorA functor_a,
-                        FunctorB functor_b,
-                        float scale)
-{
-    for(std::size_t n = 0; n < A_nchw.mDesc.GetLengths()[0]; ++n)
-        for(std::size_t c = 0; c < A_nchw.mDesc.GetLengths()[1]; ++c)
-            for(std::size_t h = 0; h < A_nchw.mDesc.GetLengths()[2]; ++h)
-                for(std::size_t w = 0; w < A_nchw.mDesc.GetLengths()[3]; ++w)
-                {
-                    ADataType tmp_val;
-                    auto a_val = A_nchw(n, c, h, w);
-                    functor_b(tmp_val, a_val);
-                    functor_a(B_nhwc(n, h, w, c), scale * tmp_val);
-                }
-}
+using UnaryScale  = ck::tensor_operation::element_wise::Scale;
+using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
+using UnaryScaleSquare =
+    ck::tensor_operation::element_wise::UnaryCombinedOp<UnarySquare, UnaryScale>;
+
+using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
+    ck::Tuple<ADataType>, // InDataTypeTuple
+    ck::Tuple<BDataType>, // OutDataTypeTuple
+    UnaryScaleSquare,     // UnaryScaleSquare
+    4,                    // NumDim
+    256,                  // BlockSize
+    128,                  // M0PerBlock
+    128,                  // M1PerBlock
+    8,                    // M0PerThread
+    8,                    // M1PerThread
+    ck::Sequence<1, 0>,   // ThreadClusterArrangeOrder
+    ck::Sequence<8>,      // InScalarPerVectorSeq
+    ck::Sequence<8>>;     // OutScalarPerVectorSeq

 int main()
 {
...

@@ -60,18 +47,6 @@ int main()

     std::vector<std::size_t> nchw = {16, 128, 32, 64};
     std::vector<std::size_t> nhwc = {16, 32, 64, 128};

-    Tensor<ADataType> a(nchw);
-    Tensor<BDataType> b(nhwc);
-    float scale = 2.f;
-
-    a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-
-    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
-
-    a_device_buf.ToDevice(a.mData.data());
-
-    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
-    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};
-
     std::array<ck::index_t, 4> ab_lengths;
     std::array<ck::index_t, 4> a_strides = {static_cast<int>(nchw[1] * nchw[2] * nchw[3]),
...
@@ -85,15 +60,29 @@ int main()

     ck::ranges::copy(nchw, ab_lengths.begin());

+    std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
+    Tensor<ADataType>& a = as[0];
+    Tensor<BDataType> b(ab_lengths, b_strides);
+    float scale = 2.f;
+
+    a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a.mData.data());
+
+    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
+    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};
+
     auto broadcastPermute = DeviceElementwisePermuteInstance{};
-    auto argument = broadcastPermute.MakeArgumentPointer(
-        ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{}, UnaryOp{}, Scale{scale});
+    auto argument = broadcastPermute.MakeArgumentPointer(
+        ab_lengths, {a_strides}, {b_strides}, input, output, UnaryScaleSquare{UnarySquare{}, UnaryScale{scale}});

     if(!broadcastPermute.IsSupportedArgument(argument.get()))
     {
...
@@ -123,10 +112,17 @@ int main()

     if(do_verification)
     {
-        b_device_buf.FromDevice(b.mData.data());
-        Tensor<BDataType> host_b(nhwc);
-        host_elementwise4D(host_b, a, PassThrough{}, UnaryOp{}, scale);
+        Tensor<BDataType> host_b(ab_lengths, b_strides);
+
+        using ReferenceElementwiseInstance = ck::tensor_operation::host::
+            ReferenceElementwise<1, ADataType, BDataType, UnaryScaleSquare>;
+        auto ref_elementwise = ReferenceElementwiseInstance{};
+        auto ref_invoker     = ref_elementwise.MakeInvoker();
+
+        auto ref_argument = ref_elementwise.MakeArgument(
+            as, host_b, UnaryScaleSquare{UnarySquare{}, UnaryScale{scale}});
+
+        ref_invoker.Run(ref_argument);
+
+        b_device_buf.FromDevice(b.mData.data());
         pass &= ck::utils::check_err(
             b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
     }
...
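Note how the permute itself is expressed in these examples: A and B share the same logical NCHW lengths, and only the strides differ, with the output strides making c the fastest-varying dimension so that writes land in NHWC order. A standalone sketch of that stride arithmetic; the dimension sizes match the 16x128x32x64 problem above, and the b_strides formula is inferred from the NHWC layout since the hunk above truncates before showing it:

#include <array>
#include <cstdio>

int main()
{
    const long N = 16, C = 128, H = 32, W = 64;
    // Contiguous NCHW input: w is fastest.
    std::array<long, 4> a_strides = {C * H * W, H * W, W, 1};
    // NHWC memory addressed with (n, c, h, w) indices: c is fastest (inferred layout).
    std::array<long, 4> b_strides = {H * W * C, 1, W * C, C};
    // Same (n, c, h, w) coordinate maps to two different linear offsets:
    const long n = 1, c = 2, h = 3, w = 4;
    long a_off = n * a_strides[0] + c * a_strides[1] + h * a_strides[2] + w * a_strides[3];
    long b_off = n * b_strides[0] + c * b_strides[1] + h * b_strides[2] + w * b_strides[3];
    std::printf("a offset: %ld, b offset: %ld\n", a_off, b_off);
    return 0;
}

Because the layout difference lives entirely in the strides, the same element-wise kernel performs the NCHW-to-NHWC permute as a side effect of writing its output.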
example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp
View file @ 7f65ac05
...

@@ -5,8 +5,10 @@

 #include <cstdlib>

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"
+#include "ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp"

 #include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
...

@@ -20,53 +22,47 @@ using F32 = float

 using ADataType = F32;
 using BDataType = F32;

-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using UnaryOp     = ck::tensor_operation::element_wise::UnarySquare;
-using Scale       = ck::tensor_operation::element_wise::Scale;
-
-using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
-    ck::Tuple<ADataType>, // InDataTypeTuple
-    ck::Tuple<BDataType>, // OutDataTypeTuple
-    PassThrough,          // ElementwiseOp
-    UnaryOp,              // UnaryOp
-    Scale,                // Scalar
-    4,                    // NumDim
-    1,                    // MPerThread
-    ck::Sequence<1>,      // InScalarPerVectorSeq
-    ck::Sequence<1>>;     // OutScalarPerVectorSeq
-
-template <typename HostTensorA, typename HostTensorB, typename FunctorA, typename FunctorB>
-void host_elementwise4D(HostTensorB& B_nhwc,
-                        const HostTensorA& A_nchw,
-                        FunctorA functor_a,
-                        FunctorB functor_b,
-                        float scale)
-{
-    std::size_t N = A_nchw.mDesc.GetLengths()[0];
-    std::size_t C = A_nchw.mDesc.GetLengths()[1];
-    std::size_t H = A_nchw.mDesc.GetLengths()[2];
-    std::size_t W = A_nchw.mDesc.GetLengths()[3];
-
-    for(std::size_t w = 0; w < W; ++w)
-        for(std::size_t h = 0; h < H; ++h)
-            for(std::size_t c = 0; c < C; ++c)
-                for(std::size_t n = 0; n < N; ++n)
-                {
-                    ADataType tmp_val;
-                    auto a_val = A_nchw.mData[(n) + (c * N) + (h * C * N) + (w * H * C * N)];
-                    functor_b(tmp_val, a_val);
-                    functor_a(B_nhwc.mData[(n) + (c * W * H * N) + (h * N) + (w * H * N)],
-                              scale * tmp_val);
-                }
-}
+using UnaryScale  = ck::tensor_operation::element_wise::Scale;
+using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
+using UnaryScaleSquare =
+    ck::tensor_operation::element_wise::UnaryCombinedOp<UnarySquare, UnaryScale>;
+
+using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
+    ck::Tuple<ADataType>, // InDataTypeTuple
+    ck::Tuple<BDataType>, // OutDataTypeTuple
+    UnaryScaleSquare,     // UnaryScaleSquare
+    4,                    // NumDim
+    256,                  // BlockSize
+    128,                  // M0PerBlock
+    128,                  // M1PerBlock
+    8,                    // M0PerThread
+    8,                    // M1PerThread
+    ck::Sequence<1, 0>,   // ThreadClusterArrangeOrder
+    ck::Sequence<1>,      // InScalarPerVectorSeq
+    ck::Sequence<1>>;     // OutScalarPerVectorSeq

 int main()
 {
     bool do_verification = true;
     bool time_kernel     = true;

-    std::vector<std::size_t> nchw = {5, 4, 2, 3};
-    std::vector<std::size_t> nhwc = {5, 2, 3, 4};
-    Tensor<ADataType> a(nchw);
-    Tensor<BDataType> b(nhwc);
+    std::vector<std::size_t> nchw = {16, 8, 32, 64};
+    std::vector<std::size_t> nhwc = {16, 32, 64, 8};
+
+    std::array<ck::index_t, 4> ab_lengths;
+    std::array<ck::index_t, 4> a_strides = {1,
+                                            static_cast<int>(nchw[0]),
+                                            static_cast<int>(nchw[0] * nchw[1]),
+                                            static_cast<int>(nchw[0] * nchw[1] * nchw[2])};
+    std::array<ck::index_t, 4> b_strides = {1,
+                                            static_cast<int>(nhwc[0] * nhwc[1] * nhwc[2]),
+                                            static_cast<int>(nhwc[0]),
+                                            static_cast<int>(nhwc[0] * nhwc[1])};
+    ck::ranges::copy(nchw, ab_lengths.begin());
+
+    std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
+    Tensor<ADataType>& a = as[0];
+    Tensor<BDataType> b(ab_lengths, b_strides);

     float scale = 1.f;
     auto i      = 0;
...

@@ -90,28 +86,14 @@ int main()

     std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
     std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};

-    std::array<ck::index_t, 4> ab_lengths;
-    std::array<ck::index_t, 4> a_strides = {1,
-                                            static_cast<int>(nchw[0]),
-                                            static_cast<int>(nchw[0] * nchw[1]),
-                                            static_cast<int>(nchw[0] * nchw[1] * nchw[2])};
-    std::array<ck::index_t, 4> b_strides = {1,
-                                            static_cast<int>(nhwc[0] * nhwc[1] * nhwc[2]),
-                                            static_cast<int>(nhwc[0]),
-                                            static_cast<int>(nhwc[0] * nhwc[1])};
-    ck::ranges::copy(nchw, ab_lengths.begin());
-
     auto broadcastPermute = DeviceElementwisePermuteInstance{};
-    auto argument = broadcastPermute.MakeArgumentPointer(
-        ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{}, UnaryOp{}, Scale{scale});
+    auto argument = broadcastPermute.MakeArgumentPointer(
+        ab_lengths, {a_strides}, {b_strides}, input, output, UnaryScaleSquare{UnarySquare{}, UnaryScale{scale}});

     if(!broadcastPermute.IsSupportedArgument(argument.get()))
     {
...

@@ -141,10 +123,17 @@ int main()

     if(do_verification)
     {
-        b_device_buf.FromDevice(b.mData.data());
-        Tensor<BDataType> host_b(nhwc);
-        host_elementwise4D(host_b, a, PassThrough{}, UnaryOp{}, scale);
+        Tensor<BDataType> host_b(ab_lengths, b_strides);
+
+        using ReferenceElementwiseInstance = ck::tensor_operation::host::
+            ReferenceElementwise<1, ADataType, BDataType, UnaryScaleSquare>;
+        auto ref_elementwise = ReferenceElementwiseInstance{};
+        auto ref_invoker     = ref_elementwise.MakeInvoker();
+
+        auto ref_argument = ref_elementwise.MakeArgument(
+            as, host_b, UnaryScaleSquare{UnarySquare{}, UnaryScale{scale}});
+
+        ref_invoker.Run(ref_argument);
+
+        b_device_buf.FromDevice(b.mData.data());
         pass &= ck::utils::check_err(
             b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
     }
...
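The _col variants describe the same operation over column-major tensors: stride 1 is attached to the n dimension instead of w. The deleted host_elementwise4D spelled this addressing out element by element; the stride arrays now passed to MakeArgumentPointer encode the identical offsets. A small sketch checking the two forms against each other, using the removed reference's offset formulas and the 16x8x32x64 lengths from this file:

#include <cstdio>

int main()
{
    // Column-major NCHW (n fastest), lengths from the example above.
    const long N = 16, C = 8, H = 32, W = 64;
    const long n = 1, c = 2, h = 3, w = 4;
    // Offset formulas used by the removed host_elementwise4D:
    long a_off = n + c * N + h * C * N + w * H * C * N; // A, col-major NCHW
    long b_off = n + c * W * H * N + h * N + w * H * N; // B, col-major NHWC
    // Equivalent stride form, matching the arrays built in the new code:
    long a_strides[4] = {1, N, N * C, N * C * H};
    long b_strides[4] = {1, N * H * W, N, N * H};
    long a_off2 = n * a_strides[0] + c * a_strides[1] + h * a_strides[2] + w * a_strides[3];
    long b_off2 = n * b_strides[0] + c * b_strides[1] + h * b_strides[2] + w * b_strides[3];
    std::printf("a: %ld == %ld, b: %ld == %ld\n", a_off, a_off2, b_off, b_off2);
    return 0;
}

This also explains why the col variants keep InScalarPerVectorSeq and OutScalarPerVectorSeq at 1: with stride-1 on n rather than on the innermost loop dimension, wide vector loads along the fastest axis are not generally possible.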
example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp
View file @ 7f65ac05
...

@@ -5,8 +5,10 @@

 #include <cstdlib>

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"
+#include "ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp"

 #include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
...

@@ -20,38 +22,23 @@ using F32 = float

 using ADataType = F32;
 using BDataType = F32;

-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using UnaryOp     = ck::tensor_operation::element_wise::UnarySquare;
-using Scale       = ck::tensor_operation::element_wise::Scale;
-
-using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
-    ck::Tuple<ADataType>, // InDataTypeTuple
-    ck::Tuple<BDataType>, // OutDataTypeTuple
-    PassThrough,          // ElementwiseOp
-    UnaryOp,              // UnaryOp
-    Scale,                // Scalar
-    4,                    // NumDim
-    8,                    // MPerThread
-    ck::Sequence<8>,      // InScalarPerVectorSeq
-    ck::Sequence<1>>;     // OutScalarPerVectorSeq
-
-template <typename HostTensorA, typename HostTensorB, typename FunctorA, typename FunctorB>
-void host_elementwise4D(HostTensorB& B_nhwc,
-                        const HostTensorA& A_nchw,
-                        FunctorA functor_a,
-                        FunctorB functor_b,
-                        float scale)
-{
-    for(std::size_t n = 0; n < A_nchw.mDesc.GetLengths()[0]; ++n)
-        for(std::size_t c = 0; c < A_nchw.mDesc.GetLengths()[1]; ++c)
-            for(std::size_t h = 0; h < A_nchw.mDesc.GetLengths()[2]; ++h)
-                for(std::size_t w = 0; w < A_nchw.mDesc.GetLengths()[3]; ++w)
-                {
-                    ADataType tmp_val;
-                    auto a_val = A_nchw(n, c, h, w);
-                    functor_b(tmp_val, a_val);
-                    functor_a(B_nhwc(n, h, w, c), scale * tmp_val);
-                }
-}
+using UnaryScale  = ck::tensor_operation::element_wise::Scale;
+using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
+using UnaryScaleSquare =
+    ck::tensor_operation::element_wise::UnaryCombinedOp<UnarySquare, UnaryScale>;
+
+using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
+    ck::Tuple<ADataType>, // InDataTypeTuple
+    ck::Tuple<BDataType>, // OutDataTypeTuple
+    UnaryScaleSquare,     // UnaryScaleSquare
+    4,                    // NumDim
+    256,                  // BlockSize
+    128,                  // M0PerBlock
+    128,                  // M1PerBlock
+    8,                    // M0PerThread
+    8,                    // M1PerThread
+    ck::Sequence<1, 0>,   // ThreadClusterArrangeOrder
+    ck::Sequence<8>,      // InScalarPerVectorSeq
+    ck::Sequence<8>>;     // OutScalarPerVectorSeq

 int main()
 {
...

@@ -60,18 +47,6 @@ int main()

     std::vector<std::size_t> nchw = {16, 128, 32, 64};
     std::vector<std::size_t> nhwc = {16, 32, 64, 128};

-    Tensor<ADataType> a(nchw);
-    Tensor<BDataType> b(nhwc);
-    float scale = 2.f;
-
-    a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-
-    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
-
-    a_device_buf.ToDevice(a.mData.data());
-
-    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
-    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};
-
     std::array<ck::index_t, 4> ab_lengths;
     std::array<ck::index_t, 4> a_strides = {static_cast<int>(nchw[1] * nchw[2] * nchw[3]),
...
@@ -85,15 +60,28 @@ int main()

     ck::ranges::copy(nchw, ab_lengths.begin());

+    std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
+    Tensor<ADataType>& a = as[0];
+    Tensor<BDataType> b(ab_lengths, b_strides);
+    float scale = 2.f;
+
+    a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a.mData.data());
+
+    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
+    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};
+
     auto broadcastPermute = DeviceElementwisePermuteInstance{};
-    auto argument = broadcastPermute.MakeArgumentPointer(
-        ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{}, UnaryOp{}, Scale{scale});
+    auto argument = broadcastPermute.MakeArgumentPointer(
+        ab_lengths, {a_strides}, {b_strides}, input, output, UnaryScaleSquare{UnarySquare{}, UnaryScale{scale}});

     if(!broadcastPermute.IsSupportedArgument(argument.get()))
     {
...
@@ -123,10 +111,17 @@ int main()

     if(do_verification)
     {
-        b_device_buf.FromDevice(b.mData.data());
-        Tensor<BDataType> host_b(nhwc);
-        host_elementwise4D(host_b, a, PassThrough{}, UnaryOp{}, scale);
+        Tensor<BDataType> host_b(ab_lengths, b_strides);
+
+        using ReferenceElementwiseInstance = ck::tensor_operation::host::
+            ReferenceElementwise<1, ADataType, BDataType, UnaryScaleSquare>;
+        auto ref_elementwise = ReferenceElementwiseInstance{};
+        auto ref_invoker     = ref_elementwise.MakeInvoker();
+
+        auto ref_argument = ref_elementwise.MakeArgument(
+            as, host_b, UnaryScaleSquare{UnarySquare{}, UnaryScale{scale}});
+
+        ref_invoker.Run(ref_argument);
+
+        b_device_buf.FromDevice(b.mData.data());
         pass &= ck::utils::check_err(
             b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
     }
...
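The performance counters updated throughout these diffs feed the same two formulas: with ave_time in milliseconds, flop / 1e9 / ave_time yields TFLOPS and num_btype / 1e6 / ave_time yields GB/s. A quick worked example; the 1.0 ms timing is made up for illustration, while the sizes and the factor 5 follow the updated fp32 examples:

#include <cstdio>

int main()
{
    const double N = 16, C = 128, H = 32, W = 64;
    const double elems     = N * C * H * W;          // 4,194,304 elements
    const double flop      = 5 * elems;              // flop count as in the updated examples
    const double num_btype = (2 * sizeof(float) + sizeof(float)) * elems;
    const double ave_time  = 1.0;                    // ms, illustrative only
    // 1 GFLOP per ms == 1 TFLOP/s; 1 MB per ms == 1 GB/s.
    std::printf("%.3f TFLOPS, %.3f GB/s\n", flop / 1e9 / ave_time, num_btype / 1e6 / ave_time);
    return 0;
}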
example/44_elementwise_permute/elementwise_trinary_4D_fp16.cpp
0 → 100644
View file @ 7f65ac05
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>
#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"

#include "ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"

using F16 = ck::half_t;
using F32 = float;

using ADataType = F16;
using BDataType = F16;

using UnaryScale  = ck::tensor_operation::element_wise::Scale;
using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
using UnaryScaleSquare =
    ck::tensor_operation::element_wise::UnaryCombinedOp<UnarySquare, UnaryScale>;
using BinaryAdd = ck::tensor_operation::element_wise::Add;
// B = alpha * A0 * A0 + beta * A1 * A1 + gamma * A2 * A2
using TrinaryAddUnaryScaleSquare =
    ck::tensor_operation::element_wise::TrinaryWithUnaryCombinedOp<BinaryAdd,
                                                                   BinaryAdd,
                                                                   UnaryScaleSquare,
                                                                   UnaryScaleSquare,
                                                                   UnaryScaleSquare>;

using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
    ck::Tuple<ADataType, ADataType, ADataType>, // InDataTypeTuple
    ck::Tuple<BDataType>,                       // OutDataTypeTuple
    TrinaryAddUnaryScaleSquare,                 // ElementwiseOp
    4,                                          // NumDim
    256,                                        // BlockSize
    128,                                        // M0PerBlock
    128,                                        // M1PerBlock
    8,                                          // M0PerThread
    8,                                          // M1PerThread
    ck::Sequence<1, 0>,                         // ThreadClusterArrangeOrder
    ck::Sequence<8, 8, 8>,                      // InScalarPerVectorSeq
    ck::Sequence<8>>;                           // OutScalarPerVectorSeq

int main()
{
    bool do_verification = true;
    bool time_kernel     = true;

    std::vector<std::size_t> nchw = {16, 128, 32, 64};

    std::array<ck::index_t, 4> ab_lengths;
    std::array<ck::index_t, 4> ab_strides = {static_cast<int>(nchw[1] * nchw[2] * nchw[3]),
                                             static_cast<int>(nchw[2] * nchw[3]),
                                             static_cast<int>(nchw[3]),
                                             1};
    ck::ranges::copy(nchw, ab_lengths.begin());

    std::array<Tensor<ADataType>, 3> as = {Tensor<ADataType>(ab_lengths, ab_strides),
                                           Tensor<ADataType>(ab_lengths, ab_strides),
                                           Tensor<ADataType>(ab_lengths, ab_strides)};
    Tensor<ADataType>& a0 = as[0];
    Tensor<ADataType>& a1 = as[1];
    Tensor<ADataType>& a2 = as[2];
    Tensor<BDataType> b(ab_lengths, ab_strides);

    float alpha = 3.f;
    float beta  = 2.f;
    float gamma = 4.f;

    a0.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
    a1.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
    a2.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});

    DeviceMem a0_device_buf(sizeof(ADataType) * a0.mDesc.GetElementSpaceSize());
    DeviceMem a1_device_buf(sizeof(ADataType) * a1.mDesc.GetElementSpaceSize());
    DeviceMem a2_device_buf(sizeof(ADataType) * a2.mDesc.GetElementSpaceSize());
    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());

    a0_device_buf.ToDevice(a0.mData.data());
    a1_device_buf.ToDevice(a1.mData.data());
    a2_device_buf.ToDevice(a2.mData.data());

    std::array<const void*, 3> inputs = {a0_device_buf.GetDeviceBuffer(),
                                         a1_device_buf.GetDeviceBuffer(),
                                         a2_device_buf.GetDeviceBuffer()};
    std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};

    auto broadcastPermute = DeviceElementwisePermuteInstance{};

    auto unary_scale_op_a0 = UnaryScaleSquare{UnarySquare{}, UnaryScale{alpha}};
    auto unary_scale_op_a1 = UnaryScaleSquare{UnarySquare{}, UnaryScale{beta}};
    auto unary_scale_op_a2 = UnaryScaleSquare{UnarySquare{}, UnaryScale{gamma}};
    auto argument          = broadcastPermute.MakeArgumentPointer(
        ab_lengths,
        {ab_strides, ab_strides, ab_strides},
        {ab_strides},
        inputs,
        output,
        TrinaryAddUnaryScaleSquare{
            BinaryAdd{}, BinaryAdd{}, unary_scale_op_a0, unary_scale_op_a1, unary_scale_op_a2});

    if(!broadcastPermute.IsSupportedArgument(argument.get()))
    {
        throw std::runtime_error(
            "The runtime parameters seems not supported by the device instance, exiting!");
    };

    std::cout << "A0 (nchw): " << a0.mDesc << std::endl;
    std::cout << "A1 (nchw): " << a1.mDesc << std::endl;
    std::cout << "A2 (nchw): " << a2.mDesc << std::endl;
    std::cout << "B (nchw): " << b.mDesc << std::endl;

    auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
    float ave_time =
        broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});

    std::size_t flop      = std::size_t(5) * nchw[0] * nchw[1] * nchw[2] * nchw[3];
    std::size_t num_btype = sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) +
                            sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]);

    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_btype / 1.E6 / ave_time;

    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;

    bool pass = true;
    if(do_verification)
    {
        Tensor<BDataType> host_b(ab_lengths, ab_strides);
        using ReferenceElementwiseInstance = ck::tensor_operation::host::
            ReferenceElementwise<3, ADataType, BDataType, TrinaryAddUnaryScaleSquare>;
        auto ref_elementwise = ReferenceElementwiseInstance{};
        auto ref_invoker     = ref_elementwise.MakeInvoker();

        auto ref_argument = ref_elementwise.MakeArgument(
            as,
            host_b,
            TrinaryAddUnaryScaleSquare{
                BinaryAdd{}, BinaryAdd{}, unary_scale_op_a0, unary_scale_op_a1, unary_scale_op_a2});

        ref_invoker.Run(ref_argument);

        const double threshold = std::pow(2, -10) * 2;
        b_device_buf.FromDevice(b.mData.data());
        pass &= ck::utils::check_err(
            b.mData, host_b.mData, "Error: Incorrect results b", threshold, threshold);
    }

    return pass ? 0 : 1;
}
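As the comment in the file says, this new example computes B = alpha * A0^2 + beta * A1^2 + gamma * A2^2: each input runs through its own UnaryScaleSquare, and the two BinaryAdds fold the three results together. A scalar model of that composition; the left-to-right grouping of the adds is an assumption inferred from the template argument order:

#include <cstdio>

// Scalar model of TrinaryWithUnaryCombinedOp<BinaryAdd, BinaryAdd,
// UnaryScaleSquare, UnaryScaleSquare, UnaryScaleSquare>: each input gets its
// own square-and-scale, then the results are summed.
float trinary_add_scale_square(float a0, float a1, float a2,
                               float alpha, float beta, float gamma)
{
    float s0 = alpha * (a0 * a0);
    float s1 = beta * (a1 * a1);
    float s2 = gamma * (a2 * a2);
    return (s0 + s1) + s2; // two BinaryAdds; grouping assumed left-to-right
}

int main()
{
    // Same coefficients as the example: alpha = 3, beta = 2, gamma = 4.
    float b = trinary_add_scale_square(1.f, 2.f, 3.f, 3.f, 2.f, 4.f);
    std::printf("b = %f (expected 3*1 + 2*4 + 4*9 = 47)\n", b);
    return 0;
}

The verification tolerance of std::pow(2, -10) * 2 is twice the fp16 machine epsilon (half has 10 fraction bits), which leaves room for the extra rounding introduced by the fused squares, scales, and adds.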