"...resnet50_tensorflow.git" did not exist on "4dbdb4509e48b33f0eaa39ed9f7e8c059f9fb3b7"
Commit 9b3c4ac4 authored by Jun Liu's avatar Jun Liu
Browse files

Merge branch 'develop' into amd-develop

parents 1d784873 7843a8a7
@@ -202,7 +202,7 @@ endif()
 option(USE_BITINT_EXTENSION_INT4 "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
-option(USE_OPT_NAVI3X "Whether to enable LDS cumode and Wavefront32 mode for NAVI3X silicons." OFF)
+option(USE_OPT_GFX11 "Whether to enable LDS cumode and Wavefront32 mode for GFX11 silicons." OFF)
 if(USE_BITINT_EXTENSION_INT4)
     add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
@@ -210,10 +210,10 @@ if(USE_BITINT_EXTENSION_INT4)
     message("CK compiled with USE_BITINT_EXTENSION_INT4 set to ${USE_BITINT_EXTENSION_INT4}")
 endif()
-if(USE_OPT_NAVI3X)
+if(USE_OPT_GFX11)
     add_compile_options(-mcumode)
     add_compile_options(-mno-wavefrontsize64)
-    message("CK compiled with USE_OPT_NAVI3X set to ${USE_OPT_NAVI3X}")
+    message("CK compiled with USE_OPT_GFX11 set to ${USE_OPT_GFX11}")
 endif()
 ## Threads
...
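The renamed switch is consumed at configure time like any other CMake option. A minimal sketch of a configure invocation that exercises it (the build directory, GPU target, and extra flags are illustrative assumptions, not taken from this change):

```
# Hypothetical configure step; only USE_OPT_GFX11 and GPU_TARGETS appear in this diff,
# the remaining flags and paths are placeholders.
mkdir -p build && cd build
cmake -D GPU_TARGETS="gfx1101" \
      -D USE_OPT_GFX11=ON \
      -D CMAKE_BUILD_TYPE=Release \
      ..
```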
@@ -515,30 +515,25 @@ def Build_CK(Map conf=[:]){
     withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
         timeout(time: 24, unit: 'HOURS')
         {
-            //check whether running on Navi or MI300 node
-            def navi_node = 0
-            def mi300_node = 0
+            //check whether to run performance tests on this node
+            def do_perf_tests = 0
             sh 'rocminfo | tee rocminfo.log'
-            if ( runShell('grep -n "gfx1030" rocminfo.log') || runShell('grep -n "gfx1101" rocminfo.log') ){
-                navi_node = 1
-                echo "This is a Navi node"
-            }
-            if ( runShell('grep -n "gfx942" rocminfo.log') ){
-                mi300_node = 1
-                echo "This is MI300 node"
+            if ( runShell('grep -n "gfx1030" rocminfo.log') || runShell('grep -n "gfx1101" rocminfo.log') || runShell('grep -n "gfx942" rocminfo.log') ){
+                do_perf_tests = 1
+                echo "Stash profiler and run performance tests"
             }
             cmake_build(conf)
             dir("build"){
                 //run tests and examples
                 sh 'make -j check'
-                if (params.RUN_PERFORMANCE_TESTS && navi_node == 0 && mi300_node == 0 ){
+                if (params.RUN_PERFORMANCE_TESTS && do_perf_tests == 0 ){
                     //we only need the ckProfiler to run the performance tests, so we pack and stash it
-                    //do not stash profiler on Navi or MI300 nodes
+                    //do not stash profiler on nodes where we don't need to run performance tests
                     sh 'tar -zcvf ckProfiler.tar.gz bin/ckProfiler'
                     stash name: "ckProfiler.tar.gz"
                 }
-                if (params.RUN_FULL_QA && mi300_node == 0 ){
-                    // build deb packages for all MI100/200/300 targets and prepare to export
+                if (params.RUN_FULL_QA && do_perf_tests == 0 ){
+                    // build deb packages for all gfx9 targets and prepare to export
                     sh 'make -j package'
                     archiveArtifacts artifacts: 'composablekernel-ckprofiler_*.deb'
                     archiveArtifacts artifacts: 'composablekernel-tests_*.deb'
@@ -546,7 +541,7 @@ def Build_CK(Map conf=[:]){
                     stash name: "ckprofiler_0.2.0_amd64.deb"
                 }
             }
-            if (params.hipTensor_test && navi_node == 0 ){
+            if (params.hipTensor_test && do_perf_tests == 0 ){
                 //build and test hipTensor
                 sh """#!/bin/bash
                     rm -rf "${params.hipTensor_branch}".zip
@@ -814,7 +809,7 @@ pipeline {
         {
             parallel
             {
-                stage("Run Codegen Tests on MI200")
+                stage("Run Codegen Tests on gfx90a")
                 {
                     when {
                         beforeAgent true
@@ -865,7 +860,7 @@ pipeline {
                         cleanWs()
                     }
                 }
-                stage("Build CK and run Tests on MI300")
+                stage("Build CK and run Tests on gfx942")
                 {
                     when {
                         beforeAgent true
@@ -885,7 +880,7 @@ pipeline {
                         cleanWs()
                     }
                 }
-                stage("Build CK and run Tests on MI200")
+                stage("Build CK and run Tests on gfx90a")
                 {
                     when {
                         beforeAgent true
@@ -925,13 +920,13 @@ pipeline {
                        cleanWs()
                    }
                }
-                stage("Build CK and run Tests on Navi21")
+                stage("Build CK and run Tests on gfx1030")
                {
                    when {
                        beforeAgent true
                        expression { !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() }
                    }
-                    agent{ label rocmnode("navi21") }
+                    agent{ label rocmnode("gfx1030") }
                    environment{
                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """
                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
@@ -945,13 +940,13 @@ pipeline {
                        cleanWs()
                    }
                }
-                stage("Build CK and run Tests on Navi32")
+                stage("Build CK and run Tests on gfx1101")
                {
                    when {
                        beforeAgent true
                        expression { !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() }
                    }
-                    agent{ label rocmnode("navi32") }
+                    agent{ label rocmnode("gfx1101") }
                    environment{
                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """
                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
...
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include <cstdlib>
 #include <iomanip>
@@ -160,6 +160,10 @@ bool run_grouped_conv_bwd_weight(
         auto invoker_ptr    = op_ptr->MakeInvokerPointer();
         std::string op_name = op_ptr->GetTypeString();
+        const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+        SimpleDeviceMem workspace_dev(workspace_sz);
+        op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
+
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
...
@@ -181,4 +181,3 @@ int main(int argc, char* argv[])
                            {1, 1, 1} /*filter_dilations*/);
     return 0;
 }
-// MI100 Perf: 0.255178 ms, 1698.9 GB/s,
@@ -7,17 +7,3 @@
 #arg3: run kernel # of times (>1)
 ./bin/example_gemm_xdl 0 1 5
 ```
-Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
-```
-a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
-b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
-c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-arg.a_grid_desc_k0_m_k1_{512, 3840, 8}
-arg.b_grid_desc_k0_n_k1_{512, 4096, 8}
-arg.c_grid_desc_m_n_{ 3840, 4096}
-launch_and_time_kernel: grid_dim {480, 1, 1}, block_dim {256, 1, 1}
-Warm up
-Start running 5 times...
-Perf: 1.19685 ms, 107.657 TFlops, 78.8501 GB/s
-```
@@ -9,20 +9,3 @@
 #arg11 to 12: alpha, beta
 ./bin/example_gemm_bilinear_xdl_fp16 1 1 1 3840 4096 4096 4096 4096 4096 4096 0.5 0.5
 ```
-Result (MI100 @ 1502Mhz, 184.6TFlops peak FP16)
-```
-a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
-b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
-c0_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-arg.a_grid_desc_k0_m_k1_{512, 3840, 8}
-arg.b_grid_desc_k0_n_k1_{512, 4096, 8}
-arg.c0_grid_desc_m_n_{ 3840, 4096}
-arg.c_grid_desc_m_n_{ 3840, 4096}
-launch_and_time_kernel: grid_dim {480, 1, 1}, block_dim {256, 1, 1}
-Warm up
-Start running 1 times...
-Perf: 0.936965 ms, 137.517 TFlops, 102.959 GB/s
-error: 0
-max_diff: 0, 558.5, 558.5
-```
@@ -8,16 +8,3 @@
 #arg4 to 11: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, StrideE"
 ./bin/example_gemm_add_add_fastgelu_xdl_fp16 1 1 1
 ```
-Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
-```
-a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
-b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
-d0_m_n: dim 2, lengths {3840, 4096}, strides {0, 1}
-d1_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-e_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-launch_and_time_kernel: grid_dim {480, 1, 1}, block_dim {256, 1, 1}
-Warm up 1 time
-Start running 10 times...
-Perf: 1.26914 ms, 101.525 TFlops, 100.804 GB/s, DeviceGemmMultipleD_Xdl_CShuffle<256, 256, 128, 32, 8, 8>
-```
@@ -3,8 +3,7 @@ add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp)
 add_example_executable(example_convnd_fwd_xdl_bf16 convnd_fwd_xdl_bf16.cpp)
 add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)
 add_example_executable(example_convnd_fwd_xdl_fp8 convnd_fwd_xdl_fp8.cpp)
-# FIXME: re-enable this exampe as test when SWDEV-335738 is fixed
-add_example_executable_no_testing(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
+add_example_executable(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
 add_example_executable(example_convnd_fwd_xdl_bf8 convnd_fwd_xdl_bf8.cpp)
 add_example_executable(example_convnd_fwd_xdl_fp16_comp_fp8 convnd_fwd_xdl_fp16_comp_fp8.cpp)
 add_example_executable(example_convnd_fwd_xdl_fp8_bf8 convnd_fwd_xdl_fp8_bf8.cpp)
...
@@ -16,17 +16,3 @@
 # <right padding>, (ie RightPy, RightPx for 2D)
 ./bin/example_convnd_fwd_xdl 0 1 100
 ```
-Result (MI100 @ 1087Mhz, 33.4TFlops peak FP32)
-```
-input: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
-weights: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
-output: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
-arg.a_grid_desc_k0_m_k1_{432, 165888, 4}
-arg.b_grid_desc_k0_n_k1_{432, 256, 4}
-arg.c_grid_desc_m_n_{ 165888, 256}
-launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
-Warm up
-Start running 100 times...
-Perf: 4.43736 ms, 33.0753 TFlops, 150.357 GB/s
-```
@@ -7,19 +7,3 @@
 #arg3: run kernel # of times (>1)
 ./bin/example_grouped_gemm_xdl_fp16 0 1 5
 ```
-Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
-```
-gemm[0] a_m_k: dim 2, lengths {256, 64}, strides {64, 1} b_k_n: dim 2, lengths {64, 128}, strides {1, 64} c_m_n: dim 2, lengths {256, 128}, strides {128, 1}
-gemm[1] a_m_k: dim 2, lengths {512, 128}, strides {128, 1} b_k_n: dim 2, lengths {128, 256}, strides {1, 128} c_m_n: dim 2, lengths {512, 256}, strides {256, 1}
-gemm[2] a_m_k: dim 2, lengths {768, 192}, strides {192, 1} b_k_n: dim 2, lengths {192, 384}, strides {1, 192} c_m_n: dim 2, lengths {768, 384}, strides {384, 1}
-gemm[3] a_m_k: dim 2, lengths {1024, 256}, strides {256, 1} b_k_n: dim 2, lengths {256, 512}, strides {1, 256} c_m_n: dim 2, lengths {1024, 512}, strides {512, 1}
-group: 0 arg.a_grid_desc_k0_m_k1_{8, 256, 8}, arg.b_grid_desc_k0_n_k1_{8, 128, 8}, arg.c_grid_desc_m_n_{ 256, 128}
-group: 1 arg.a_grid_desc_k0_m_k1_{16, 512, 8}, arg.b_grid_desc_k0_n_k1_{16, 256, 8}, arg.c_grid_desc_m_n_{ 512, 256}
-group: 2 arg.a_grid_desc_k0_m_k1_{24, 768, 8}, arg.b_grid_desc_k0_n_k1_{24, 384, 8}, arg.c_grid_desc_m_n_{ 768, 384}
-group: 3 arg.a_grid_desc_k0_m_k1_{32, 1024, 8}, arg.b_grid_desc_k0_n_k1_{32, 512, 8}, arg.c_grid_desc_m_n_{ 1024, 512}
-launch_and_time_kernel: grid_dim {30, 1, 1}, block_dim {256, 1, 1}
-Warm up
-Start running 5 times...
-Perf: 0.037887 ms, 11.0706 TFlops, 90.8132 GB/s, DeviceGroupedGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2>
-```
@@ -92,9 +92,10 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
     auto group_count = problem_size.group_count;
     using KernelArguments = ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments<NumDs>;
+    using GemmDesc        = ck::tensor_operation::device::GemmDesc;
     // GEMM shape
-    std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
+    std::vector<GemmDesc> gemm_descs;
     std::vector<KernelArguments> ggemm_kargs;
     std::vector<void*> p_Cs;
     std::vector<const void*> p_As;
...
@@ -7,14 +7,3 @@
 #arg3: time kernel (0=no, 1=yes)
 ./bin/example_contraction_bilinear_xdl_fp32 1 1 1
 ```
-Result (MI100 @ dynammic freq, 46TFlops peak FP32)
-```
-a_ms_ks: dim 4, lengths {30, 128, 32, 64}, strides {524288, 4096, 128, 1}
-b_ks_ns: dim 4, lengths {32, 64, 32, 64}, strides {128, 1, 524288, 4096}
-c_ms_ns: dim 4, lengths {30, 128, 32, 64}, strides {524288, 4096, 128, 1}
-launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
-Warm up 1 time
-Start running 10 times...
-Perf: 0.843286 ms, 38.1985 TFlops, 94.5014 GB/s, DeviceContractionMultipleD_Xdl_CShuffle<256, 256, 128, 16, 4, 4>
-```
@@ -16,15 +16,3 @@ Following arguments (depending on number of spatial dims):
 ./bin/example_grouped_conv_fwd_bias_relu_add_xdl_fp16 1 1 1
 ```
-Result (MI100)
-```
-in: dim 5, lengths {1, 128, 192, 71, 71}, strides {192, 967872, 1, 13632, 192}
-wei: dim 5, lengths {1, 256, 192, 3, 3}, strides {442368, 1728, 1, 576, 192}
-bias: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
-residual: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
-out: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 331776, 1, 9216, 256}
-launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
-Warm up 1 time
-Start running 10 times...
-Perf: 1.55981 ms, 94.0927 TFlops, 213.868 GB/s, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 256, 16, Default>
-```
@@ -8,19 +8,3 @@
 #arg4 to 11: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, StrideE"
 ./bin/example_gemm_add_multiply_dl_fp16 1 1 1
 ```
-Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
-```
-a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
-b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1}
-d0_m_n: dim 2, lengths {3840, 4096}, strides {0, 1}
-d1_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-e_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-arg.a_grid_desc_k0_m0_m1_k1_{2048, 3840, 2}
-arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2}
-arg.e_grid_desc_m_n_{ 3840, 4096}
-launch_and_time_kernel: grid_dim {960, 1, 1}, block_dim {256, 1, 1}
-Warm up 1 time
-Start running 10 times...
-Perf: 3.99904 ms, 32.22 TFlops, 31.9913 GB/s, DeviceGemmMultipleD_Dl<256, 128, 128, 16, 2, 4, 4, 1>
-```
@@ -44,9 +44,9 @@ args:
  -range_v      per-tensor quantization range of v. used if squant=1. (default:16)
  -range_p      per-tensor quantization range of p [e^(s-m)]. used if squant=1. (default:1)
  -range_o      per-tensor quantization range of o (p*v). used if squant=1. (default:16)
- -squant       if using static quantization fusion or not. 0: original flow(not prefered) (default:0)
-               1: apply scale_p and scale_o with respect to P and O. calculate scale_s, scale_p,
-               scale_o according to range_q, range_k, range_v, range_p, range_o
+ -squant       if using static quantization fusion or not. auto: fp8 will default use squant, other will not (default:auto)
+               0: no static quant(not implemented) 1: apply scale_p and scale_o with respect to P and O.
+               calculate scale_s, scale_p, scale_o according to range_q, range_k, range_v, range_p, range_o
  -iperm        permute input (default:1)
               if true, will be b*h*s*d, else b*s*h*d
  -operm        permute output (default:1)
@@ -64,8 +64,11 @@ args:
  -vlayout      r for row-major(seqlen*hdim), c for col-major(hdim*seqlen) (default:r)
  -lse          0 not store lse, 1 store lse (default:0)
  -kname        if set to 1 will print kernel name (default:0)
- -init         init method. 0:random int, 1:random float, 2:trig float, 3:quantization (default:1)
+ -init         init method. ui, uniform random int, ni, normalized random int (default:uf)
+               uf, uniform random float, nf, normalized random float, tf, trig float, uf:q, quantization
  -seed         random seed used for initializing input tensors. 0 for non-deterministic seed (default:11939)
+ -warmup       number of iterations before benchmark the kernel (default:5)
+ -repeat       number of iterations to benchmark the kernel (default:20)
 ```
 Example: `./bin/tile_example_fmha_fwd -b=1 -h=16 -s=16384 -d=128` will run a fmha case with batch=1, nhead=16, sequence length=16384, hdim=128, fp16 case.
...
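Putting the updated options together, a hedged invocation that exercises the renamed `-init` values and the new `-warmup`/`-repeat` knobs might look like this (flag names come from the help text above; the chosen values are illustrative only):

```
# Sketch only: run the fp16 case with normalized-float init, explicit warm-up/repeat
# counts, and kernel-name printing enabled.
./bin/tile_example_fmha_fwd -b=1 -h=16 -s=16384 -d=128 \
    -init=nf -warmup=5 -repeat=20 -kname=1
```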
@@ -60,12 +60,14 @@ auto create_args(int argc, char* argv[])
         .insert("range_v", "16", "per-tensor quantization range of v. used if squant=1.")
         .insert("range_p", "1", "per-tensor quantization range of p [e^(s-m)]. used if squant=1.")
         .insert("range_o", "16", "per-tensor quantization range of o (p*v). used if squant=1.")
-        .insert(
-            "squant",
-            "0",
-            "if using static quantization fusion or not. 0: original flow(not prefered)\n"
-            "1: apply scale_p and scale_o with respect to P and O. calculate scale_s, scale_p,\n"
-            "scale_o according to range_q, range_k, range_v, range_p, range_o")
+        .insert("squant",
+                "auto",
+                "if using static quantization fusion or not. auto: fp8 will default use squant, "
+                "other will not\n"
+                "0: no static quant(not implemented) 1: apply scale_p and scale_o with respect to "
+                "P and O.\n"
+                "calculate scale_s, scale_p, scale_o according to range_q, range_k, range_v, "
+                "range_p, range_o")
         .insert("iperm",
                 "1",
                 "permute input\n"
@@ -92,8 +94,11 @@ auto create_args(int argc, char* argv[])
         .insert("vlayout", "r", "r for row-major(seqlen*hdim), c for col-major(hdim*seqlen)")
         .insert("lse", "0", "0 not store lse, 1 store lse")
         .insert("kname", "0", "if set to 1 will print kernel name")
-        .insert(
-            "init", "1", "init method. 0:random int, 1:random float, 2:trig float, 3:quantization")
+        .insert("init",
+                "uf",
+                "init method. ui, uniform random int, ni, normalized random int\n"
+                "uf, uniform random float, nf, normalized random float, tf, trig float, uf:q, "
+                "quantization")
         .insert("seed",
                 "11939",
                 "random seed used for initializing input tensors. 0 for "
@@ -107,7 +112,7 @@ auto create_args(int argc, char* argv[])
 // different threshold for different dtype
 template <typename DataType>
-auto get_elimit(int /*init_method*/)
+auto get_elimit(std::string /*init_method*/)
 {
     double rtol = 1e-3;
     double atol = 1e-3;
@@ -115,9 +120,15 @@ auto get_elimit(int /*init_method*/)
 }
 template <>
-auto get_elimit<ck_tile::bf16_t>(int init_method)
+auto get_elimit<ck_tile::bf16_t>(std::string init_method)
 {
-    if(init_method == 0)
+    if(init_method == "ui" || init_method == "ni")
+    {
+        double rtol = 1e-2;
+        double atol = 1e-2;
+        return ck_tile::make_tuple(rtol, atol);
+    }
+    else if(init_method == "nf")
     {
         double rtol = 1e-2;
         double atol = 1e-2;
@@ -132,9 +143,9 @@ auto get_elimit<ck_tile::bf16_t>(int init_method)
 }
 template <>
-auto get_elimit<ck_tile::fp8_t>(int init_method)
+auto get_elimit<ck_tile::fp8_t>(std::string init_method)
 {
-    if(init_method == 0)
+    if(init_method == "ui" || init_method == "ni")
     {
         unsigned max_rounding_point_distance = 0;
         double atol = 2e-3;
@@ -182,15 +193,18 @@ bool run(const ck_tile::ArgParser& arg_parser)
     if(scale_s == .0f)
         scale_s = 1.0 / ck_tile::sqrt(static_cast<float>(hdim_q)); // TODO: q ? v ?
-    bool squant = arg_parser.get_bool("squant");
-    if constexpr(!std::is_same_v<DataType, ck_tile::fp8_t>)
-    {
-        if(squant)
-        {
-            std::cerr << "static quantization only support fp8 for now" << std::endl;
-            return false;
-        }
-    }
+    std::string squant_str = arg_parser.get_str("squant");
+    bool squant = [&]() {
+        if(squant_str == "auto")
+        {
+            if(data_type == "fp8")
+                return true;
+            else
+                return false;
+        }
+        else
+            return atoi(squant_str.c_str()) != 0 ? true : false;
+    }();
     float range_q = arg_parser.get_float("range_q");
     float range_k = arg_parser.get_float("range_k");
@@ -217,7 +231,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     bias_info bias = bias_info::decode(arg_parser.get_str("bias"));
     mask_info mask = mask_info::decode(arg_parser.get_str("mask"), seqlen_q, seqlen_k);
-    int init_method = arg_parser.get_int("init");
+    std::string init_method = arg_parser.get_str("init");
     std::optional<uint32_t> seed = arg_parser.get_uint32("seed");
     if(*seed == 0)
     {
@@ -319,28 +333,43 @@ bool run(const ck_tile::ArgParser& arg_parser)
     ck_tile::HostTensor<ODataType> o_host(
         get_lengths(o_perm, shape_batch, nhead, shape_seqlen_q, hdim_v));
-    if(init_method == 0)
+    if(init_method == "ui" || init_method == "0")
     {
-        ck_tile::FillUniformDistributionIntegerValue<QDataType>{-2.f, 2.f, seed}(q_host);
-        ck_tile::FillUniformDistributionIntegerValue<KDataType>{-2.f, 2.f, seed}(k_host);
-        ck_tile::FillUniformDistributionIntegerValue<VDataType>{-2.f, 2.f, seed}(v_host);
-        ck_tile::FillUniformDistributionIntegerValue<BiasDataType>{-2.f, 2.f, seed}(bias_host);
+        ck_tile::FillUniformDistributionIntegerValue<QDataType>{-3.f, 3.f, seed}(q_host);
+        ck_tile::FillUniformDistributionIntegerValue<KDataType>{-3.f, 3.f, seed}(k_host);
+        ck_tile::FillUniformDistributionIntegerValue<VDataType>{-3.f, 3.f, seed}(v_host);
+        ck_tile::FillUniformDistributionIntegerValue<BiasDataType>{-3.f, 3.f, seed}(bias_host);
     }
-    else if(init_method == 1)
+    else if(init_method == "ni")
+    {
+        ck_tile::FillNormalDistributionIntegerValue<QDataType>{-3.f, 3.f, seed}(q_host);
+        ck_tile::FillNormalDistributionIntegerValue<KDataType>{-3.f, 3.f, seed}(k_host);
+        ck_tile::FillNormalDistributionIntegerValue<VDataType>{-3.f, 3.f, seed}(v_host);
+        ck_tile::FillNormalDistributionIntegerValue<BiasDataType>{-3.f, 3.f, seed}(bias_host);
+    }
+    else if(init_method == "uf" || init_method == "1")
     {
         ck_tile::FillUniformDistribution<QDataType>{0.f, 1.f, seed}(q_host);
         ck_tile::FillUniformDistribution<KDataType>{0.f, 1.f, seed}(k_host);
         ck_tile::FillUniformDistribution<VDataType>{0.f, 1.f, seed}(v_host);
         ck_tile::FillUniformDistribution<BiasDataType>{0.f, 1.f, seed}(bias_host);
     }
-    else if(init_method == 2)
+    else if(init_method == "nf")
+    {
+        ck_tile::FillNormalDistribution<QDataType>{0.f, 3.f, seed}(q_host);
+        ck_tile::FillNormalDistribution<KDataType>{0.f, 3.f, seed}(k_host);
+        ck_tile::FillNormalDistribution<VDataType>{0.f, 3.f, seed}(v_host);
+        ck_tile::FillNormalDistribution<BiasDataType>{0.f, 3.f, seed}(bias_host);
+    }
+    else if(init_method == "tf" || init_method == "2")
     {
         ck_tile::FillTrigValue<QDataType>{}(q_host);
         ck_tile::FillTrigValue<KDataType>{}(k_host);
         ck_tile::FillTrigValue<VDataType>{}(v_host);
         ck_tile::FillTrigValue<BiasDataType>{}(bias_host);
     }
-    else if(init_method == 3) // suitable for fp8 quantization
+    else if(init_method == "ufq" || init_method == "uf:q" ||
+            init_method == "3") // suitable for fp8 quantization
     {
         ck_tile::FillUniformDistribution<QDataType>{-dtype_max, dtype_max, seed}(q_host);
         ck_tile::FillUniformDistribution<KDataType>{-dtype_max, dtype_max, seed}(k_host);
...
@@ -4,12 +4,19 @@
 #pragma once
 #include "ck/config.h"
+#include "ck/utility/env.hpp"
 #ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"
 #endif
+// environment variable to enable logging:
+// export CK_LOGGING=ON or CK_LOGGING=1 or CK_LOGGING=ENABLED
+CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
+// to do: add various levels of logging with CK_LOG_LEVEL
 #define CK_TIME_KERNEL 1
 // constant address space for kernel parameter
@@ -225,14 +232,11 @@
 // workaround: compiler issue on gfx908
 #define CK_WORKAROUND_SWDEV_388832 1
-// flag to enable (1) or disable (0) the debugging output in some kernels
-#define DEBUG_LOG 0
 // denorm test fix, required to work around dissue
 #ifndef CK_WORKAROUND_DENORM_FIX
 #define CK_WORKAROUND_DENORM_FIX 0
 #else
-// enable only on MI200
+// enable only for gfx90a
 #define CK_WORKAROUND_DENORM_FIX = CK_WORKAROUND_DENORM_FIX && defined(__gfx90a__)
 #endif // CK_WORKAROUND_DENORM_FIX
...
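With the compile-time `DEBUG_LOG` flag removed, the per-launch diagnostics in the kernel-launch hunks below are now gated at run time by the `CK_LOGGING` environment variable declared above. A minimal usage sketch (the example binary is one already documented earlier in this change; the accepted values follow the comment next to the declaration):

```
# Enable the runtime logging switch and rerun an existing example;
# ON / 1 / ENABLED are all accepted.
export CK_LOGGING=1
./bin/example_gemm_xdl 0 1 5
```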
@@ -65,20 +65,20 @@ inline bool is_lds_direct_load_supported()
            ck::get_device_name() == "gfx941" || ck::get_device_name() == "gfx942";
 }
-inline bool is_navi1_supported()
+inline bool is_gfx101_supported()
 {
     return ck::get_device_name() == "gfx1010" || ck::get_device_name() == "gfx1011" ||
            ck::get_device_name() == "gfx1012";
 }
-inline bool is_navi2_supported()
+inline bool is_gfx103_supported()
 {
     return ck::get_device_name() == "gfx1030" || ck::get_device_name() == "gfx1031" ||
            ck::get_device_name() == "gfx1032" || ck::get_device_name() == "gfx1034" ||
            ck::get_device_name() == "gfx1035" || ck::get_device_name() == "gfx1036";
 }
-inline bool is_navi3_supported()
+inline bool is_gfx11_supported()
 {
     return ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" ||
            ck::get_device_name() == "gfx1102" || ck::get_device_name() == "gfx1103";
...
@@ -117,18 +117,19 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
 #define MEDIAN 1
     if(stream_config.time_kernel_)
     {
-#if DEBUG_LOG
-        printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
-               __func__,
-               grid_dim.x,
-               grid_dim.y,
-               grid_dim.z,
-               block_dim.x,
-               block_dim.y,
-               block_dim.z);
-        printf("Warm up %d times\n", stream_config.cold_niters_);
-#endif
+        if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
+        {
+            printf("%s: grid_dim {%u, %u, %u}, block_dim {%u, %u, %u} \n",
+                   __func__,
+                   grid_dim.x,
+                   grid_dim.y,
+                   grid_dim.z,
+                   block_dim.x,
+                   block_dim.y,
+                   block_dim.z);
+            printf("Warm up %d times\n", stream_config.cold_niters_);
+        }
         // warm up
         for(int i = 0; i < stream_config.cold_niters_; ++i)
         {
@@ -141,9 +142,10 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
         {
             return 0.0;
         }
-#if DEBUG_LOG
-        printf("Start running %d times...\n", nrepeat);
-#endif
+        if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
+        {
+            printf("Start running %d times...\n", nrepeat);
+        }
 #if MEDIAN
         std::set<float> times;
@@ -184,13 +186,14 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
             total_time += cur_time;
 #endif
-#if DEBUG_LOG
-            std::cout << "i: " << i << " cur_time: " << cur_time << std::endl;
-            printf("args.p_a_grid: %p, args.p_b_grid:%p\n",
-                   static_cast<const void*>(args.p_a_grid),
-                   static_cast<const void*>(args.p_b_grid));
-#endif
+            if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
+            {
+                std::cout << "i: " << i << " cur_time: " << cur_time << std::endl;
+                printf("args.p_a_grid: %p, args.p_b_grid:%p\n",
+                       static_cast<const void*>(args.p_a_grid),
+                       static_cast<const void*>(args.p_b_grid));
+            }
         }
 #if MEDIAN
...
@@ -20,18 +20,19 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
 #if CK_TIME_KERNEL
     if(stream_config.time_kernel_)
     {
-#if DEBUG_LOG
-        printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
-               __func__,
-               grid_dim.x,
-               grid_dim.y,
-               grid_dim.z,
-               block_dim.x,
-               block_dim.y,
-               block_dim.z);
-        printf("Warm up %d times\n", stream_config.cold_niters_);
-#endif
+        if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
+        {
+            printf("%s: grid_dim {%u, %u, %u}, block_dim {%u, %u, %u} \n",
+                   __func__,
+                   grid_dim.x,
+                   grid_dim.y,
+                   grid_dim.z,
+                   block_dim.x,
+                   block_dim.y,
+                   block_dim.z);
+            printf("Warm up %d times\n", stream_config.cold_niters_);
+        }
         // warm up
         for(int i = 0; i < stream_config.cold_niters_; ++i)
         {
@@ -40,9 +41,10 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
         }
         const int nrepeat = stream_config.nrepeat_;
-#if DEBUG_LOG
-        printf("Start running %d times...\n", nrepeat);
-#endif
+        if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
+        {
+            printf("Start running %d times...\n", nrepeat);
+        }
         hipEvent_t start, stop;
         hip_check_error(hipEventCreate(&start));
@@ -93,18 +95,19 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
 #if CK_TIME_KERNEL
     if(stream_config.time_kernel_)
     {
-#if DEBUG_LOG
-        printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
-               __func__,
-               grid_dim.x,
-               grid_dim.y,
-               grid_dim.z,
-               block_dim.x,
-               block_dim.y,
-               block_dim.z);
-        printf("Warm up %d times\n", stream_config.cold_niters_);
-#endif
+        if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
+        {
+            printf("%s: grid_dim {%u, %u, %u}, block_dim {%u, %u, %u} \n",
+                   __func__,
+                   grid_dim.x,
+                   grid_dim.y,
+                   grid_dim.z,
+                   block_dim.x,
+                   block_dim.y,
+                   block_dim.z);
+            printf("Warm up %d times\n", stream_config.cold_niters_);
+        }
         // warm up
         preprocess();
         for(int i = 0; i < stream_config.cold_niters_; ++i)
@@ -114,9 +117,10 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
         }
         const int nrepeat = stream_config.nrepeat_;
-#if DEBUG_LOG
-        printf("Start running %d times...\n", nrepeat);
-#endif
+        if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
+        {
+            printf("Start running %d times...\n", nrepeat);
+        }
         hipEvent_t start, stop;
         hip_check_error(hipEventCreate(&start));
...