"...resnet50_tensorflow.git" did not exist on "4dbdb4509e48b33f0eaa39ed9f7e8c059f9fb3b7"
Commit 9b3c4ac4 authored by Jun Liu's avatar Jun Liu
Browse files

Merge branch 'develop' into amd-develop

parents 1d784873 7843a8a7
@@ -202,7 +202,7 @@ endif()
 option(USE_BITINT_EXTENSION_INT4 "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
-option(USE_OPT_NAVI3X "Whether to enable LDS cumode and Wavefront32 mode for NAVI3X silicons." OFF)
+option(USE_OPT_GFX11 "Whether to enable LDS cumode and Wavefront32 mode for GFX11 silicons." OFF)
 if(USE_BITINT_EXTENSION_INT4)
     add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
@@ -210,10 +210,10 @@ if(USE_BITINT_EXTENSION_INT4)
     message("CK compiled with USE_BITINT_EXTENSION_INT4 set to ${USE_BITINT_EXTENSION_INT4}")
 endif()
-if(USE_OPT_NAVI3X)
+if(USE_OPT_GFX11)
     add_compile_options(-mcumode)
     add_compile_options(-mno-wavefrontsize64)
-    message("CK compiled with USE_OPT_NAVI3X set to ${USE_OPT_NAVI3X}")
+    message("CK compiled with USE_OPT_GFX11 set to ${USE_OPT_GFX11}")
 endif()
 ## Threads
...
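The renamed switch is consumed at configure time like any other CMake option. A minimal sketch of a configure invocation that exercises it (the build directory, GPU target, and extra flags are illustrative assumptions, not taken from this change):

```
# Hypothetical configure step; only USE_OPT_GFX11 and GPU_TARGETS appear in this diff,
# the remaining flags and paths are placeholders.
mkdir -p build && cd build
cmake -D GPU_TARGETS="gfx1101" \
      -D USE_OPT_GFX11=ON \
      -D CMAKE_BUILD_TYPE=Release \
      ..
```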
@@ -515,30 +515,25 @@ def Build_CK(Map conf=[:]){
     withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
         timeout(time: 24, unit: 'HOURS')
         {
-            //check whether running on Navi or MI300 node
-            def navi_node = 0
-            def mi300_node = 0
+            //check whether to run performance tests on this node
+            def do_perf_tests = 0
             sh 'rocminfo | tee rocminfo.log'
-            if ( runShell('grep -n "gfx1030" rocminfo.log') || runShell('grep -n "gfx1101" rocminfo.log') ){
-                navi_node = 1
-                echo "This is a Navi node"
-            }
-            if ( runShell('grep -n "gfx942" rocminfo.log') ){
-                mi300_node = 1
-                echo "This is MI300 node"
+            if ( runShell('grep -n "gfx1030" rocminfo.log') || runShell('grep -n "gfx1101" rocminfo.log') || runShell('grep -n "gfx942" rocminfo.log') ){
+                do_perf_tests = 1
+                echo "Stash profiler and run performance tests"
             }
             cmake_build(conf)
             dir("build"){
                 //run tests and examples
                 sh 'make -j check'
-                if (params.RUN_PERFORMANCE_TESTS && navi_node == 0 && mi300_node == 0 ){
+                if (params.RUN_PERFORMANCE_TESTS && do_perf_tests == 0 ){
                     //we only need the ckProfiler to run the performance tests, so we pack and stash it
-                    //do not stash profiler on Navi or MI300 nodes
+                    //do not stash profiler on nodes where we don't need to run performance tests
                     sh 'tar -zcvf ckProfiler.tar.gz bin/ckProfiler'
                     stash name: "ckProfiler.tar.gz"
                 }
-                if (params.RUN_FULL_QA && mi300_node == 0 ){
-                    // build deb packages for all MI100/200/300 targets and prepare to export
+                if (params.RUN_FULL_QA && do_perf_tests == 0 ){
+                    // build deb packages for all gfx9 targets and prepare to export
                     sh 'make -j package'
                     archiveArtifacts artifacts: 'composablekernel-ckprofiler_*.deb'
                     archiveArtifacts artifacts: 'composablekernel-tests_*.deb'
@@ -546,7 +541,7 @@ def Build_CK(Map conf=[:]){
                     stash name: "ckprofiler_0.2.0_amd64.deb"
                 }
             }
-            if (params.hipTensor_test && navi_node == 0 ){
+            if (params.hipTensor_test && do_perf_tests == 0 ){
                 //build and test hipTensor
                 sh """#!/bin/bash
                     rm -rf "${params.hipTensor_branch}".zip
@@ -814,7 +809,7 @@ pipeline {
         {
             parallel
             {
-                stage("Run Codegen Tests on MI200")
+                stage("Run Codegen Tests on gfx90a")
                 {
                     when {
                         beforeAgent true
@@ -865,7 +860,7 @@ pipeline {
                         cleanWs()
                     }
                 }
-                stage("Build CK and run Tests on MI300")
+                stage("Build CK and run Tests on gfx942")
                 {
                     when {
                         beforeAgent true
@@ -885,7 +880,7 @@ pipeline {
                         cleanWs()
                     }
                 }
-                stage("Build CK and run Tests on MI200")
+                stage("Build CK and run Tests on gfx90a")
                 {
                     when {
                         beforeAgent true
@@ -925,13 +920,13 @@ pipeline {
                        cleanWs()
                    }
                }
-                stage("Build CK and run Tests on Navi21")
+                stage("Build CK and run Tests on gfx1030")
                {
                    when {
                        beforeAgent true
                        expression { !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() }
                    }
-                    agent{ label rocmnode("navi21") }
+                    agent{ label rocmnode("gfx1030") }
                    environment{
                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """
                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
@@ -945,13 +940,13 @@ pipeline {
                        cleanWs()
                    }
                }
-                stage("Build CK and run Tests on Navi32")
+                stage("Build CK and run Tests on gfx1101")
                {
                    when {
                        beforeAgent true
                        expression { !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() }
                    }
-                    agent{ label rocmnode("navi32") }
+                    agent{ label rocmnode("gfx1101") }
                    environment{
                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """
                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
...
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include <cstdlib>
 #include <iomanip>
@@ -160,6 +160,10 @@ bool run_grouped_conv_bwd_weight(
         auto invoker_ptr    = op_ptr->MakeInvokerPointer();
         std::string op_name = op_ptr->GetTypeString();
+        const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+        SimpleDeviceMem workspace_dev(workspace_sz);
+        op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
+
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
...
@@ -181,4 +181,3 @@ int main(int argc, char* argv[])
                            {1, 1, 1} /*filter_dilations*/);
     return 0;
 }
-// MI100 Perf: 0.255178 ms, 1698.9 GB/s,
@@ -7,17 +7,3 @@
 #arg3: run kernel # of times (>1)
 ./bin/example_gemm_xdl 0 1 5
 ```
-Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
-```
-a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
-b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
-c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-arg.a_grid_desc_k0_m_k1_{512, 3840, 8}
-arg.b_grid_desc_k0_n_k1_{512, 4096, 8}
-arg.c_grid_desc_m_n_{ 3840, 4096}
-launch_and_time_kernel: grid_dim {480, 1, 1}, block_dim {256, 1, 1}
-Warm up
-Start running 5 times...
-Perf: 1.19685 ms, 107.657 TFlops, 78.8501 GB/s
-```
@@ -9,20 +9,3 @@
 #arg11 to 12: alpha, beta
 ./bin/example_gemm_bilinear_xdl_fp16 1 1 1 3840 4096 4096 4096 4096 4096 4096 0.5 0.5
 ```
-Result (MI100 @ 1502Mhz, 184.6TFlops peak FP16)
-```
-a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
-b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
-c0_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-arg.a_grid_desc_k0_m_k1_{512, 3840, 8}
-arg.b_grid_desc_k0_n_k1_{512, 4096, 8}
-arg.c0_grid_desc_m_n_{ 3840, 4096}
-arg.c_grid_desc_m_n_{ 3840, 4096}
-launch_and_time_kernel: grid_dim {480, 1, 1}, block_dim {256, 1, 1}
-Warm up
-Start running 1 times...
-Perf: 0.936965 ms, 137.517 TFlops, 102.959 GB/s
-error: 0
-max_diff: 0, 558.5, 558.5
-```
@@ -8,16 +8,3 @@
 #arg4 to 11: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, StrideE"
 ./bin/example_gemm_add_add_fastgelu_xdl_fp16 1 1 1
 ```
-Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
-```
-a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
-b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
-d0_m_n: dim 2, lengths {3840, 4096}, strides {0, 1}
-d1_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-e_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-launch_and_time_kernel: grid_dim {480, 1, 1}, block_dim {256, 1, 1}
-Warm up 1 time
-Start running 10 times...
-Perf: 1.26914 ms, 101.525 TFlops, 100.804 GB/s, DeviceGemmMultipleD_Xdl_CShuffle<256, 256, 128, 32, 8, 8>
-```
@@ -3,8 +3,7 @@ add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp)
 add_example_executable(example_convnd_fwd_xdl_bf16 convnd_fwd_xdl_bf16.cpp)
 add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)
 add_example_executable(example_convnd_fwd_xdl_fp8 convnd_fwd_xdl_fp8.cpp)
-# FIXME: re-enable this exampe as test when SWDEV-335738 is fixed
-add_example_executable_no_testing(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
+add_example_executable(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
 add_example_executable(example_convnd_fwd_xdl_bf8 convnd_fwd_xdl_bf8.cpp)
 add_example_executable(example_convnd_fwd_xdl_fp16_comp_fp8 convnd_fwd_xdl_fp16_comp_fp8.cpp)
 add_example_executable(example_convnd_fwd_xdl_fp8_bf8 convnd_fwd_xdl_fp8_bf8.cpp)
...
@@ -16,17 +16,3 @@
 # <right padding>, (ie RightPy, RightPx for 2D)
 ./bin/example_convnd_fwd_xdl 0 1 100
 ```
-Result (MI100 @ 1087Mhz, 33.4TFlops peak FP32)
-```
-input: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
-weights: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
-output: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
-arg.a_grid_desc_k0_m_k1_{432, 165888, 4}
-arg.b_grid_desc_k0_n_k1_{432, 256, 4}
-arg.c_grid_desc_m_n_{ 165888, 256}
-launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
-Warm up
-Start running 100 times...
-Perf: 4.43736 ms, 33.0753 TFlops, 150.357 GB/s
-```
@@ -7,19 +7,3 @@
 #arg3: run kernel # of times (>1)
 ./bin/example_grouped_gemm_xdl_fp16 0 1 5
 ```
-Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
-```
-gemm[0] a_m_k: dim 2, lengths {256, 64}, strides {64, 1} b_k_n: dim 2, lengths {64, 128}, strides {1, 64} c_m_n: dim 2, lengths {256, 128}, strides {128, 1}
-gemm[1] a_m_k: dim 2, lengths {512, 128}, strides {128, 1} b_k_n: dim 2, lengths {128, 256}, strides {1, 128} c_m_n: dim 2, lengths {512, 256}, strides {256, 1}
-gemm[2] a_m_k: dim 2, lengths {768, 192}, strides {192, 1} b_k_n: dim 2, lengths {192, 384}, strides {1, 192} c_m_n: dim 2, lengths {768, 384}, strides {384, 1}
-gemm[3] a_m_k: dim 2, lengths {1024, 256}, strides {256, 1} b_k_n: dim 2, lengths {256, 512}, strides {1, 256} c_m_n: dim 2, lengths {1024, 512}, strides {512, 1}
-group: 0 arg.a_grid_desc_k0_m_k1_{8, 256, 8}, arg.b_grid_desc_k0_n_k1_{8, 128, 8}, arg.c_grid_desc_m_n_{ 256, 128}
-group: 1 arg.a_grid_desc_k0_m_k1_{16, 512, 8}, arg.b_grid_desc_k0_n_k1_{16, 256, 8}, arg.c_grid_desc_m_n_{ 512, 256}
-group: 2 arg.a_grid_desc_k0_m_k1_{24, 768, 8}, arg.b_grid_desc_k0_n_k1_{24, 384, 8}, arg.c_grid_desc_m_n_{ 768, 384}
-group: 3 arg.a_grid_desc_k0_m_k1_{32, 1024, 8}, arg.b_grid_desc_k0_n_k1_{32, 512, 8}, arg.c_grid_desc_m_n_{ 1024, 512}
-launch_and_time_kernel: grid_dim {30, 1, 1}, block_dim {256, 1, 1}
-Warm up
-Start running 5 times...
-Perf: 0.037887 ms, 11.0706 TFlops, 90.8132 GB/s, DeviceGroupedGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2>
-```
@@ -92,9 +92,10 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
     auto group_count = problem_size.group_count;
     using KernelArguments = ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments<NumDs>;
+    using GemmDesc        = ck::tensor_operation::device::GemmDesc;
     // GEMM shape
-    std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
+    std::vector<GemmDesc> gemm_descs;
     std::vector<KernelArguments> ggemm_kargs;
     std::vector<void*> p_Cs;
     std::vector<const void*> p_As;
...
@@ -7,14 +7,3 @@
 #arg3: time kernel (0=no, 1=yes)
 ./bin/example_contraction_bilinear_xdl_fp32 1 1 1
 ```
-Result (MI100 @ dynammic freq, 46TFlops peak FP32)
-```
-a_ms_ks: dim 4, lengths {30, 128, 32, 64}, strides {524288, 4096, 128, 1}
-b_ks_ns: dim 4, lengths {32, 64, 32, 64}, strides {128, 1, 524288, 4096}
-c_ms_ns: dim 4, lengths {30, 128, 32, 64}, strides {524288, 4096, 128, 1}
-launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
-Warm up 1 time
-Start running 10 times...
-Perf: 0.843286 ms, 38.1985 TFlops, 94.5014 GB/s, DeviceContractionMultipleD_Xdl_CShuffle<256, 256, 128, 16, 4, 4>
-```
@@ -16,15 +16,3 @@ Following arguments (depending on number of spatial dims):
 ./bin/example_grouped_conv_fwd_bias_relu_add_xdl_fp16 1 1 1
 ```
-Result (MI100)
-```
-in: dim 5, lengths {1, 128, 192, 71, 71}, strides {192, 967872, 1, 13632, 192}
-wei: dim 5, lengths {1, 256, 192, 3, 3}, strides {442368, 1728, 1, 576, 192}
-bias: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
-residual: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
-out: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 331776, 1, 9216, 256}
-launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
-Warm up 1 time
-Start running 10 times...
-Perf: 1.55981 ms, 94.0927 TFlops, 213.868 GB/s, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 256, 16, Default>
-```
@@ -8,19 +8,3 @@
 #arg4 to 11: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, StrideE"
 ./bin/example_gemm_add_multiply_dl_fp16 1 1 1
 ```
-Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
-```
-a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
-b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1}
-d0_m_n: dim 2, lengths {3840, 4096}, strides {0, 1}
-d1_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-e_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-arg.a_grid_desc_k0_m0_m1_k1_{2048, 3840, 2}
-arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2}
-arg.e_grid_desc_m_n_{ 3840, 4096}
-launch_and_time_kernel: grid_dim {960, 1, 1}, block_dim {256, 1, 1}
-Warm up 1 time
-Start running 10 times...
-Perf: 3.99904 ms, 32.22 TFlops, 31.9913 GB/s, DeviceGemmMultipleD_Dl<256, 128, 128, 16, 2, 4, 4, 1>
-```
@@ -44,9 +44,9 @@ args:
  -range_v      per-tensor quantization range of v. used if squant=1. (default:16)
  -range_p      per-tensor quantization range of p [e^(s-m)]. used if squant=1. (default:1)
  -range_o      per-tensor quantization range of o (p*v). used if squant=1. (default:16)
- -squant       if using static quantization fusion or not. 0: original flow(not prefered) (default:0)
-               1: apply scale_p and scale_o with respect to P and O. calculate scale_s, scale_p,
-               scale_o according to range_q, range_k, range_v, range_p, range_o
+ -squant       if using static quantization fusion or not. auto: fp8 will default use squant, other will not (default:auto)
+               0: no static quant(not implemented) 1: apply scale_p and scale_o with respect to P and O.
+               calculate scale_s, scale_p, scale_o according to range_q, range_k, range_v, range_p, range_o
  -iperm        permute input (default:1)
               if true, will be b*h*s*d, else b*s*h*d
  -operm        permute output (default:1)
@@ -64,8 +64,11 @@ args:
  -vlayout      r for row-major(seqlen*hdim), c for col-major(hdim*seqlen) (default:r)
  -lse          0 not store lse, 1 store lse (default:0)
  -kname        if set to 1 will print kernel name (default:0)
- -init         init method. 0:random int, 1:random float, 2:trig float, 3:quantization (default:1)
+ -init         init method. ui, uniform random int, ni, normalized random int (default:uf)
+               uf, uniform random float, nf, normalized random float, tf, trig float, uf:q, quantization
  -seed         random seed used for initializing input tensors. 0 for non-deterministic seed (default:11939)
+ -warmup       number of iterations before benchmark the kernel (default:5)
+ -repeat       number of iterations to benchmark the kernel (default:20)
 ```
 Example: `./bin/tile_example_fmha_fwd -b=1 -h=16 -s=16384 -d=128` will run a fmha case with batch=1, nhead=16, sequence length=16384, hdim=128, fp16 case.
...
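Putting the updated options together, a hedged invocation that exercises the renamed `-init` values and the new `-warmup`/`-repeat` knobs might look like this (flag names come from the help text above; the chosen values are illustrative only):

```
# Sketch only: run the fp16 case with normalized-float init, explicit warm-up/repeat
# counts, and kernel-name printing enabled.
./bin/tile_example_fmha_fwd -b=1 -h=16 -s=16384 -d=128 \
    -init=nf -warmup=5 -repeat=20 -kname=1
```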
@@ -60,12 +60,14 @@ auto create_args(int argc, char* argv[])
         .insert("range_v", "16", "per-tensor quantization range of v. used if squant=1.")
         .insert("range_p", "1", "per-tensor quantization range of p [e^(s-m)]. used if squant=1.")
         .insert("range_o", "16", "per-tensor quantization range of o (p*v). used if squant=1.")
-        .insert(
-            "squant",
-            "0",
-            "if using static quantization fusion or not. 0: original flow(not prefered)\n"
-            "1: apply scale_p and scale_o with respect to P and O. calculate scale_s, scale_p,\n"
-            "scale_o according to range_q, range_k, range_v, range_p, range_o")
+        .insert("squant",
+                "auto",
+                "if using static quantization fusion or not. auto: fp8 will default use squant, "
+                "other will not\n"
+                "0: no static quant(not implemented) 1: apply scale_p and scale_o with respect to "
+                "P and O.\n"
+                "calculate scale_s, scale_p, scale_o according to range_q, range_k, range_v, "
+                "range_p, range_o")
         .insert("iperm",
                 "1",
                 "permute input\n"
@@ -92,8 +94,11 @@ auto create_args(int argc, char* argv[])
         .insert("vlayout", "r", "r for row-major(seqlen*hdim), c for col-major(hdim*seqlen)")
         .insert("lse", "0", "0 not store lse, 1 store lse")
         .insert("kname", "0", "if set to 1 will print kernel name")
-        .insert(
-            "init", "1", "init method. 0:random int, 1:random float, 2:trig float, 3:quantization")
+        .insert("init",
+                "uf",
+                "init method. ui, uniform random int, ni, normalized random int\n"
+                "uf, uniform random float, nf, normalized random float, tf, trig float, uf:q, "
+                "quantization")
         .insert("seed",
                 "11939",
                 "random seed used for initializing input tensors. 0 for "
@@ -107,7 +112,7 @@ auto create_args(int argc, char* argv[])
 // different threshold for different dtype
 template <typename DataType>
-auto get_elimit(int /*init_method*/)
+auto get_elimit(std::string /*init_method*/)
 {
     double rtol = 1e-3;
     double atol = 1e-3;
@@ -115,9 +120,15 @@ auto get_elimit(int /*init_method*/)
 }
 template <>
-auto get_elimit<ck_tile::bf16_t>(int init_method)
+auto get_elimit<ck_tile::bf16_t>(std::string init_method)
 {
-    if(init_method == 0)
+    if(init_method == "ui" || init_method == "ni")
+    {
+        double rtol = 1e-2;
+        double atol = 1e-2;
+        return ck_tile::make_tuple(rtol, atol);
+    }
+    else if(init_method == "nf")
     {
         double rtol = 1e-2;
         double atol = 1e-2;
@@ -132,9 +143,9 @@ auto get_elimit<ck_tile::bf16_t>(int init_method)
 }
 template <>
-auto get_elimit<ck_tile::fp8_t>(int init_method)
+auto get_elimit<ck_tile::fp8_t>(std::string init_method)
 {
-    if(init_method == 0)
+    if(init_method == "ui" || init_method == "ni")
     {
         unsigned max_rounding_point_distance = 0;
         double atol = 2e-3;
@@ -182,15 +193,18 @@ bool run(const ck_tile::ArgParser& arg_parser)
     if(scale_s == .0f)
         scale_s = 1.0 / ck_tile::sqrt(static_cast<float>(hdim_q)); // TODO: q ? v ?
-    bool squant = arg_parser.get_bool("squant");
-    if constexpr(!std::is_same_v<DataType, ck_tile::fp8_t>)
-    {
-        if(squant)
-        {
-            std::cerr << "static quantization only support fp8 for now" << std::endl;
-            return false;
-        }
-    }
+    std::string squant_str = arg_parser.get_str("squant");
+    bool squant = [&]() {
+        if(squant_str == "auto")
+        {
+            if(data_type == "fp8")
+                return true;
+            else
+                return false;
+        }
+        else
+            return atoi(squant_str.c_str()) != 0 ? true : false;
+    }();
     float range_q = arg_parser.get_float("range_q");
     float range_k = arg_parser.get_float("range_k");
@@ -217,7 +231,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     bias_info bias = bias_info::decode(arg_parser.get_str("bias"));
     mask_info mask = mask_info::decode(arg_parser.get_str("mask"), seqlen_q, seqlen_k);
-    int init_method = arg_parser.get_int("init");
+    std::string init_method = arg_parser.get_str("init");
     std::optional<uint32_t> seed = arg_parser.get_uint32("seed");
     if(*seed == 0)
     {
@@ -319,28 +333,43 @@ bool run(const ck_tile::ArgParser& arg_parser)
     ck_tile::HostTensor<ODataType> o_host(
         get_lengths(o_perm, shape_batch, nhead, shape_seqlen_q, hdim_v));
-    if(init_method == 0)
+    if(init_method == "ui" || init_method == "0")
     {
-        ck_tile::FillUniformDistributionIntegerValue<QDataType>{-2.f, 2.f, seed}(q_host);
-        ck_tile::FillUniformDistributionIntegerValue<KDataType>{-2.f, 2.f, seed}(k_host);
-        ck_tile::FillUniformDistributionIntegerValue<VDataType>{-2.f, 2.f, seed}(v_host);
-        ck_tile::FillUniformDistributionIntegerValue<BiasDataType>{-2.f, 2.f, seed}(bias_host);
+        ck_tile::FillUniformDistributionIntegerValue<QDataType>{-3.f, 3.f, seed}(q_host);
+        ck_tile::FillUniformDistributionIntegerValue<KDataType>{-3.f, 3.f, seed}(k_host);
+        ck_tile::FillUniformDistributionIntegerValue<VDataType>{-3.f, 3.f, seed}(v_host);
+        ck_tile::FillUniformDistributionIntegerValue<BiasDataType>{-3.f, 3.f, seed}(bias_host);
     }
-    else if(init_method == 1)
+    else if(init_method == "ni")
+    {
+        ck_tile::FillNormalDistributionIntegerValue<QDataType>{-3.f, 3.f, seed}(q_host);
+        ck_tile::FillNormalDistributionIntegerValue<KDataType>{-3.f, 3.f, seed}(k_host);
+        ck_tile::FillNormalDistributionIntegerValue<VDataType>{-3.f, 3.f, seed}(v_host);
+        ck_tile::FillNormalDistributionIntegerValue<BiasDataType>{-3.f, 3.f, seed}(bias_host);
+    }
+    else if(init_method == "uf" || init_method == "1")
     {
         ck_tile::FillUniformDistribution<QDataType>{0.f, 1.f, seed}(q_host);
         ck_tile::FillUniformDistribution<KDataType>{0.f, 1.f, seed}(k_host);
         ck_tile::FillUniformDistribution<VDataType>{0.f, 1.f, seed}(v_host);
         ck_tile::FillUniformDistribution<BiasDataType>{0.f, 1.f, seed}(bias_host);
     }
-    else if(init_method == 2)
+    else if(init_method == "nf")
+    {
+        ck_tile::FillNormalDistribution<QDataType>{0.f, 3.f, seed}(q_host);
+        ck_tile::FillNormalDistribution<KDataType>{0.f, 3.f, seed}(k_host);
+        ck_tile::FillNormalDistribution<VDataType>{0.f, 3.f, seed}(v_host);
+        ck_tile::FillNormalDistribution<BiasDataType>{0.f, 3.f, seed}(bias_host);
+    }
+    else if(init_method == "tf" || init_method == "2")
     {
         ck_tile::FillTrigValue<QDataType>{}(q_host);
         ck_tile::FillTrigValue<KDataType>{}(k_host);
         ck_tile::FillTrigValue<VDataType>{}(v_host);
         ck_tile::FillTrigValue<BiasDataType>{}(bias_host);
     }
-    else if(init_method == 3) // suitable for fp8 quantization
+    else if(init_method == "ufq" || init_method == "uf:q" ||
+            init_method == "3") // suitable for fp8 quantization
     {
         ck_tile::FillUniformDistribution<QDataType>{-dtype_max, dtype_max, seed}(q_host);
         ck_tile::FillUniformDistribution<KDataType>{-dtype_max, dtype_max, seed}(k_host);
...
@@ -4,12 +4,19 @@
 #pragma once
 #include "ck/config.h"
+#include "ck/utility/env.hpp"
 #ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"
 #endif
+// environment variable to enable logging:
+// export CK_LOGGING=ON or CK_LOGGING=1 or CK_LOGGING=ENABLED
+CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
+// to do: add various levels of logging with CK_LOG_LEVEL
 #define CK_TIME_KERNEL 1
 // constant address space for kernel parameter
@@ -225,14 +232,11 @@
 // workaround: compiler issue on gfx908
 #define CK_WORKAROUND_SWDEV_388832 1
-// flag to enable (1) or disable (0) the debugging output in some kernels
-#define DEBUG_LOG 0
 // denorm test fix, required to work around dissue
 #ifndef CK_WORKAROUND_DENORM_FIX
 #define CK_WORKAROUND_DENORM_FIX 0
 #else
-// enable only on MI200
+// enable only for gfx90a
 #define CK_WORKAROUND_DENORM_FIX = CK_WORKAROUND_DENORM_FIX && defined(__gfx90a__)
 #endif // CK_WORKAROUND_DENORM_FIX
...
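With the compile-time `DEBUG_LOG` flag removed, the per-launch diagnostics in the kernel-launch hunks below are now gated at run time by the `CK_LOGGING` environment variable declared above. A minimal usage sketch (the example binary is one already documented earlier in this change; the accepted values follow the comment next to the declaration):

```
# Enable the runtime logging switch and rerun an existing example;
# ON / 1 / ENABLED are all accepted.
export CK_LOGGING=1
./bin/example_gemm_xdl 0 1 5
```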
@@ -65,20 +65,20 @@ inline bool is_lds_direct_load_supported()
            ck::get_device_name() == "gfx941" || ck::get_device_name() == "gfx942";
 }
-inline bool is_navi1_supported()
+inline bool is_gfx101_supported()
 {
     return ck::get_device_name() == "gfx1010" || ck::get_device_name() == "gfx1011" ||
            ck::get_device_name() == "gfx1012";
 }
-inline bool is_navi2_supported()
+inline bool is_gfx103_supported()
 {
     return ck::get_device_name() == "gfx1030" || ck::get_device_name() == "gfx1031" ||
            ck::get_device_name() == "gfx1032" || ck::get_device_name() == "gfx1034" ||
            ck::get_device_name() == "gfx1035" || ck::get_device_name() == "gfx1036";
 }
-inline bool is_navi3_supported()
+inline bool is_gfx11_supported()
 {
     return ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" ||
            ck::get_device_name() == "gfx1102" || ck::get_device_name() == "gfx1103";
...
@@ -117,18 +117,19 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
 #define MEDIAN 1
     if(stream_config.time_kernel_)
     {
-#if DEBUG_LOG
-        printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
-               __func__,
-               grid_dim.x,
-               grid_dim.y,
-               grid_dim.z,
-               block_dim.x,
-               block_dim.y,
-               block_dim.z);
-        printf("Warm up %d times\n", stream_config.cold_niters_);
-#endif
+        if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
+        {
+            printf("%s: grid_dim {%u, %u, %u}, block_dim {%u, %u, %u} \n",
+                   __func__,
+                   grid_dim.x,
+                   grid_dim.y,
+                   grid_dim.z,
+                   block_dim.x,
+                   block_dim.y,
+                   block_dim.z);
+            printf("Warm up %d times\n", stream_config.cold_niters_);
+        }
         // warm up
         for(int i = 0; i < stream_config.cold_niters_; ++i)
         {
@@ -141,9 +142,10 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
         {
             return 0.0;
         }
-#if DEBUG_LOG
-        printf("Start running %d times...\n", nrepeat);
-#endif
+        if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
+        {
+            printf("Start running %d times...\n", nrepeat);
+        }
 #if MEDIAN
         std::set<float> times;
@@ -184,13 +186,14 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
             total_time += cur_time;
 #endif
-#if DEBUG_LOG
-            std::cout << "i: " << i << " cur_time: " << cur_time << std::endl;
-            printf("args.p_a_grid: %p, args.p_b_grid:%p\n",
-                   static_cast<const void*>(args.p_a_grid),
-                   static_cast<const void*>(args.p_b_grid));
-#endif
+            if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
+            {
+                std::cout << "i: " << i << " cur_time: " << cur_time << std::endl;
+                printf("args.p_a_grid: %p, args.p_b_grid:%p\n",
+                       static_cast<const void*>(args.p_a_grid),
+                       static_cast<const void*>(args.p_b_grid));
+            }
         }
 #if MEDIAN
...
@@ -20,18 +20,19 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
 #if CK_TIME_KERNEL
     if(stream_config.time_kernel_)
     {
-#if DEBUG_LOG
-        printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
-               __func__,
-               grid_dim.x,
-               grid_dim.y,
-               grid_dim.z,
-               block_dim.x,
-               block_dim.y,
-               block_dim.z);
-        printf("Warm up %d times\n", stream_config.cold_niters_);
-#endif
+        if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
+        {
+            printf("%s: grid_dim {%u, %u, %u}, block_dim {%u, %u, %u} \n",
+                   __func__,
+                   grid_dim.x,
+                   grid_dim.y,
+                   grid_dim.z,
+                   block_dim.x,
+                   block_dim.y,
+                   block_dim.z);
+            printf("Warm up %d times\n", stream_config.cold_niters_);
+        }
         // warm up
         for(int i = 0; i < stream_config.cold_niters_; ++i)
         {
@@ -40,9 +41,10 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
         }
         const int nrepeat = stream_config.nrepeat_;
-#if DEBUG_LOG
-        printf("Start running %d times...\n", nrepeat);
-#endif
+        if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
+        {
+            printf("Start running %d times...\n", nrepeat);
+        }
         hipEvent_t start, stop;
         hip_check_error(hipEventCreate(&start));
@@ -93,18 +95,19 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
 #if CK_TIME_KERNEL
     if(stream_config.time_kernel_)
     {
-#if DEBUG_LOG
-        printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
-               __func__,
-               grid_dim.x,
-               grid_dim.y,
-               grid_dim.z,
-               block_dim.x,
-               block_dim.y,
-               block_dim.z);
-        printf("Warm up %d times\n", stream_config.cold_niters_);
-#endif
+        if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
+        {
+            printf("%s: grid_dim {%u, %u, %u}, block_dim {%u, %u, %u} \n",
+                   __func__,
+                   grid_dim.x,
+                   grid_dim.y,
+                   grid_dim.z,
+                   block_dim.x,
+                   block_dim.y,
+                   block_dim.z);
+            printf("Warm up %d times\n", stream_config.cold_niters_);
+        }
         // warm up
         preprocess();
         for(int i = 0; i < stream_config.cold_niters_; ++i)
@@ -114,9 +117,10 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
         }
         const int nrepeat = stream_config.nrepeat_;
-#if DEBUG_LOG
-        printf("Start running %d times...\n", nrepeat);
-#endif
+        if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
+        {
+            printf("Start running %d times...\n", nrepeat);
+        }
         hipEvent_t start, stop;
         hip_check_error(hipEventCreate(&start));
...