Unverified commit 760ea189, authored by arai713, committed by GitHub

Merge branch 'develop' into codegen_hiprtc

parents c87aa6c8 cb8c7f42
@@ -116,7 +116,7 @@ ENV compiler_commit=$compiler_commit
 RUN sh -c "echo compiler version = '$compiler_version'" && \
     sh -c "echo compiler commit = '$compiler_commit'"
-RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline-open" ] ) && [ "$compiler_commit" = "" ]; then \
+RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" = "" ]; then \
         git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
         cd llvm-project && mkdir build && cd build && \
         cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
@@ -124,7 +124,7 @@ RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd
     else echo "using the release compiler"; \
     fi
-RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline-open" ] ) && [ "$compiler_commit" != "" ]; then \
+RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" != "" ]; then \
         git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
         cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
         cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
...
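As a usage note, here is a hypothetical invocation that exercises the branch selection in the Dockerfile hunk above; the image tag and commit sha are placeholders, not values from this commit:

    # Build the image against a pinned commit of the renamed
    # amd-mainline branch; <sha> is a placeholder.
    docker build . \
        --build-arg compiler_version=amd-mainline \
        --build-arg compiler_commit=<sha> \
        -t ck-dev:amd-mainline-llvm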
@@ -133,7 +133,7 @@ def buildDocker(install_prefix){
     def image_name = getDockerImageName()
     echo "Building Docker for ${image_name}"
     def dockerArgs = "--squash --build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' --build-arg DISABLE_CACHE='git rev-parse ${params.COMPILER_VERSION}' "
-    if(params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){
+    if(params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){
         dockerArgs = dockerArgs + " --no-cache "
     }
     echo "Build Args: ${dockerArgs}"
@@ -358,7 +358,7 @@ def buildHipClangJob(Map conf=[:]){
         dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
     }
     def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-    if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){
+    if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){
         dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
     }
     def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3')
@@ -549,7 +549,7 @@ def Build_CK(Map conf=[:]){
         dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
     }
     def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-    if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){
+    if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){
         dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
     }
     if(params.BUILD_LEGACY_OS){
@@ -737,7 +737,7 @@ def process_results(Map conf=[:]){
 CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true
 0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true;RUN_CODEGEN_TESTS=true
 0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
-0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
+0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
 0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false
 0 13 * * * % BUILD_LEGACY_OS=true''' : ""
@@ -765,7 +765,7 @@ pipeline {
         string(
             name: 'COMPILER_VERSION',
             defaultValue: '',
-            description: 'Specify which version of compiler to use: release, amd-staging, amd-mainline-open, or leave blank (default).')
+            description: 'Specify which version of compiler to use: release, amd-staging, amd-mainline, or leave blank (default).')
         string(
             name: 'COMPILER_COMMIT',
             defaultValue: '',
...
@@ -18,7 +18,7 @@ function (add_smoothquant_example TARGET_NAME MAIN_SRC)
     target_compile_options(${TARGET_NAME} PRIVATE ${COMPILE_OPTIONS})
 endfunction(add_smoothquant_example TARGET_NAME MAIN_SRC)
-file(GLOB INSTANCE_SRCS instances/*.cpp)
-add_smoothquant_example(tile_smoothquant smoothquant.cpp ${INSTANCE_SRCS})
 add_smoothquant_example(tile_example_smoothquant example_smoothquant.cpp)
+file(GLOB INSTANCE_SRCS instances/*.cpp)
+add_smoothquant_example(tile_smoothquant smoothquant.cpp ${INSTANCE_SRCS})
@@ -269,15 +269,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
                         a_thread_desc_,
                         make_tuple(m0, I0, k, I0),
                         a_thread_buf);
-                    static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        b_thread_copy_.Run(
-                            b_block_desc_n0_n1_n2_k,
-                            make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
-                            b_block_buf,
-                            b_thread_desc_,
-                            make_tuple(n0, I0, k, I0),
-                            b_thread_buf);
-                    });
+                });
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                       make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                       b_block_buf,
+                                       b_thread_desc_,
+                                       make_tuple(n0, I0, k, I0),
+                                       b_thread_buf);
                 });
             });
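This hunk, and each kernel hunk that follows, makes the same structural change: the NRepeat copy loop for the B tile is hoisted out of the MRepeat loop, so b_thread_copy_.Run executes NRepeat times per k step instead of MRepeat * NRepeat times, presumably cutting redundant LDS-to-register B reads. A minimal sketch of the control-flow change (arguments elided; not the actual class code):

    // Before: the B copy loop is nested in the M loop,
    // so each B sub-tile is copied MRepeat times per k step.
    static_for<0, MRepeat, 1>{}([&](auto m0) {
        a_thread_copy_.Run(/* A sub-tile (m0, k) */);
        static_for<0, NRepeat, 1>{}([&](auto n0) {
            b_thread_copy_.Run(/* B sub-tile (n0, k) */);
        });
    });

    // After: sibling loops, one copy per B sub-tile per k step.
    static_for<0, MRepeat, 1>{}([&](auto m0) {
        a_thread_copy_.Run(/* A sub-tile (m0, k) */);
    });
    static_for<0, NRepeat, 1>{}([&](auto n0) {
        b_thread_copy_.Run(/* B sub-tile (n0, k) */);
    });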
@@ -341,14 +340,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
                         a_thread_desc_,
                         make_tuple(m0, I0, k, I0),
                         a_thread_buf);
-                    static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
-                                           make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
-                                           b_block_buf,
-                                           b_thread_desc_,
-                                           make_tuple(n0, I0, k, I0),
-                                           b_thread_buf);
-                    });
+                });
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                       make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                       b_block_buf,
+                                       b_thread_desc_,
+                                       make_tuple(n0, I0, k, I0),
+                                       b_thread_buf);
                 });
             });
@@ -396,14 +395,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
                         a_thread_desc_,
                         make_tuple(m0, I0, k, I0),
                         a_thread_buf);
-                    static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
-                                           make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
-                                           b_block_buf,
-                                           b_thread_desc_,
-                                           make_tuple(n0, I0, k, I0),
-                                           b_thread_buf);
-                    });
+                });
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                       make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                       b_block_buf,
+                                       b_thread_desc_,
+                                       make_tuple(n0, I0, k, I0),
+                                       b_thread_buf);
                 });
             });
@@ -447,14 +446,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
                         a_thread_desc_,
                         make_tuple(m0, I0, k, I0),
                         a_thread_buf);
-                    static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
-                                           make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
-                                           b_block_buf,
-                                           b_thread_desc_,
-                                           make_tuple(n0, I0, k, I0),
-                                           b_thread_buf);
-                    });
+                });
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                       make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                       b_block_buf,
+                                       b_thread_desc_,
+                                       make_tuple(n0, I0, k, I0),
+                                       b_thread_buf);
                 });
             });
@@ -760,15 +759,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
                         a_thread_desc_,
                         make_tuple(m0, I0, k0, I0),
                         a_thread_buf);
-                    static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        b_thread_copy_.Run(
-                            b_block_desc_n0_n1_n2_k,
-                            make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
-                            b_block_buf,
-                            b_thread_desc_,
-                            make_tuple(n0, I0, k0, I0),
-                            b_thread_buf);
-                    });
+                });
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                       make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
+                                       b_block_buf,
+                                       b_thread_desc_,
+                                       make_tuple(n0, I0, k0, I0),
+                                       b_thread_buf);
                 });
                 __builtin_amdgcn_sched_barrier(0);
                 // NOTE: Synchronize threads in a workgroup at the start of each MAC
@@ -866,14 +864,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
                         a_thread_desc_,
                         make_tuple(m0, I0, k0, I0),
                         a_thread_buf);
-                    static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
-                                           make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
-                                           b_block_buf,
-                                           b_thread_desc_,
-                                           make_tuple(n0, I0, k0, I0),
-                                           b_thread_buf);
-                    });
+                });
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                       make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
+                                       b_block_buf,
+                                       b_thread_desc_,
+                                       make_tuple(n0, I0, k0, I0),
+                                       b_thread_buf);
                 });
                 __builtin_amdgcn_sched_barrier(0);
@@ -942,14 +940,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
                         a_thread_desc_,
                         make_tuple(m0, I0, k0, I0),
                         a_thread_buf);
-                    static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
-                                           make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
-                                           b_block_buf,
-                                           b_thread_desc_,
-                                           make_tuple(n0, I0, k0, I0),
-                                           b_thread_buf);
-                    });
+                });
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                       make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
+                                       b_block_buf,
+                                       b_thread_desc_,
+                                       make_tuple(n0, I0, k0, I0),
+                                       b_thread_buf);
                 });
                 __builtin_amdgcn_sched_barrier(0);
@@ -1018,14 +1016,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
                         a_thread_desc_,
                         make_tuple(m0, I0, k0, I0),
                         a_thread_buf);
-                    static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
-                                           make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
-                                           b_block_buf,
-                                           b_thread_desc_,
-                                           make_tuple(n0, I0, k0, I0),
-                                           b_thread_buf);
-                    });
+                });
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                       make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
+                                       b_block_buf,
+                                       b_thread_desc_,
+                                       make_tuple(n0, I0, k0, I0),
+                                       b_thread_buf);
                 });
                 __builtin_amdgcn_sched_barrier(0);
...
@@ -305,14 +305,14 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
                         a_thread_desc_,
                         make_tuple(m0, I0, k, I0),
                         a_thread_bufs(I0));
-                    static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
-                                           make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
-                                           b_block_buf.At(I0),
-                                           b_thread_desc_,
-                                           make_tuple(n0, I0, k, I0),
-                                           b_thread_bufs(I0));
-                    });
+                });
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                       make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                       b_block_buf.At(I0),
+                                       b_thread_desc_,
+                                       make_tuple(n0, I0, k, I0),
+                                       b_thread_bufs(I0));
                 });
             });
@@ -356,15 +356,14 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
                         a_thread_desc_,
                         make_tuple(m0, I0, k, I0),
                         a_thread_bufs(lds_read_reg_buf));
-                    static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        b_thread_copy_.Run(
-                            b_block_desc_n0_n1_n2_k,
-                            make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
-                            b_block_buf.At(lds_read_buf),
-                            b_thread_desc_,
-                            make_tuple(n0, I0, k, I0),
-                            b_thread_bufs(lds_read_reg_buf));
-                    });
+                });
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                       make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                       b_block_buf.At(lds_read_buf),
+                                       b_thread_desc_,
+                                       make_tuple(n0, I0, k, I0),
+                                       b_thread_bufs(lds_read_reg_buf));
                 });
             });
@@ -437,14 +436,14 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
                         a_thread_desc_,
                         make_tuple(m0, I0, k, I0),
                         a_thread_bufs(lds_read_reg_buf));
-                    static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
-                                           make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
-                                           b_block_buf.At(lds_read_buf),
-                                           b_thread_desc_,
-                                           make_tuple(n0, I0, k, I0),
-                                           b_thread_bufs(lds_read_reg_buf));
-                    });
+                });
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                       make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                       b_block_buf.At(lds_read_buf),
+                                       b_thread_desc_,
+                                       make_tuple(n0, I0, k, I0),
+                                       b_thread_bufs(lds_read_reg_buf));
                 });
             });
@@ -496,14 +495,14 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
                         a_thread_desc_,
                         make_tuple(m0, I0, k, I0),
                         a_thread_bufs(lds_read_reg_buf));
-                    static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
-                                           make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
-                                           b_block_buf.At(lds_read_buf),
-                                           b_thread_desc_,
-                                           make_tuple(n0, I0, k, I0),
-                                           b_thread_bufs(lds_read_reg_buf));
-                    });
+                });
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                       make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                       b_block_buf.At(lds_read_buf),
+                                       b_thread_desc_,
+                                       make_tuple(n0, I0, k, I0),
+                                       b_thread_bufs(lds_read_reg_buf));
                 });
             });
...
@@ -28,8 +28,9 @@ struct AddRmsnorm2dRdquantFwdPipelineOnePass
     static constexpr bool kSaveX = Problem::kSaveX;
     static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
     static constexpr bool kPadM = false; // TODO - BlockAddRmsnorm2dRdquantFwdProblem::kPadM
     static constexpr bool kPadN = Problem::kPadN;
+    static constexpr bool UseMax3 = true; // TODO - Move to trait

     static constexpr const char* name = []() {
         if constexpr(kNeedCrossWarpSync)
@@ -69,9 +70,16 @@ struct AddRmsnorm2dRdquantFwdPipelineOnePass
         auto reduce_square_sum_func = ReduceOp::SquareAdd{};
         auto reduce_sum_func = ReduceOp::Add{};
         auto reduce_absmax_func = ReduceOp::AbsMax{};
+        auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) {
+            float rtn;
+            asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)"
+                         : "=v"(rtn)
+                         : "v"(acc_), "v"(v_0_), "v"(v_1_));
+            return rtn;
+        };
         auto reduce_max_func = ReduceOp::Max{};
         auto block_reduce2d = Policy::template GetBlockReduce2d<Problem>();
         auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
         auto block_reduce2d_cross_warp_sync =
             Policy::template GetBlockReduce2dCrossWarpSync<Problem>();
@@ -116,8 +124,23 @@ struct AddRmsnorm2dRdquantFwdPipelineOnePass
         });

         // compute absmax, each-thread->cross-lane->cross-warp
-        auto absmax = block_reduce2d(
-            y, reduce_absmax_func.GetIdentityValue<ComputeDataType>(), reduce_absmax_func);
+        auto absmax = [&]() {
+            constexpr auto x_size_per_row =
+                x.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(number<1>{});
+            if constexpr(UseMax3 && std::is_same_v<ComputeDataType, float> &&
+                         x_size_per_row % 2 == 0)
+            {
+                return block_reduce2d(y,
+                                      reduce_absmax_func.GetIdentityValue<ComputeDataType>(),
+                                      reduce_absmax3_func,
+                                      sequence<1, 2>{});
+            }
+            else
+            {
+                return block_reduce2d(
+                    y, reduce_absmax_func.GetIdentityValue<ComputeDataType>(), reduce_absmax_func);
+            }
+        }();
         block_reduce2d_sync(absmax, reduce_max_func);
         block_reduce2d_cross_warp_sync(absmax, smem, reduce_max_func);
...
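The reduce_absmax3_func introduced above folds two elements per reduction step into a single v_max3_f32 instruction, with the abs() applied via source modifiers rather than separate ops; the sequence<1, 2>{} argument tells block_reduce2d to feed two y-values per step, which is why the fast path also requires the per-row tile length to be even and ComputeDataType to be float. A plain-C++ reference for what one step computes (a sketch only; it does not model the NaN behavior of the hardware instruction):

    #include <algorithm>
    #include <cmath>

    // One ternary reduction step: acc' = max(acc, |v0|, |v1|).
    float reduce_absmax3_ref(float acc, float v0, float v1)
    {
        return std::max({acc, std::fabs(v0), std::fabs(v1)});
    }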
@@ -28,8 +28,9 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass
     static constexpr bool kSaveX = Problem::kSaveX;
     static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
     static constexpr bool kPadM = false; // TODO - BlockAddRmsnorm2dRdquantFwdProblem::kPadM
     static constexpr bool kPadN = Problem::kPadN;
+    static constexpr bool UseMax3 = true; // TODO - Move to trait

     static constexpr const char* name = []() {
         if constexpr(kNeedCrossWarpSync)
@@ -76,9 +77,16 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass
         auto reduce_square_sum_func = ReduceOp::SquareAdd{};
         auto reduce_sum_func = ReduceOp::Add{};
         auto reduce_absmax_func = ReduceOp::AbsMax{};
+        auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) {
+            float rtn;
+            asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)"
+                         : "=v"(rtn)
+                         : "v"(acc_), "v"(v_0_), "v"(v_1_));
+            return rtn;
+        };
         auto reduce_max_func = ReduceOp::Max{};
         auto block_reduce2d = Policy::template GetBlockReduce2d<Problem>();
         auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
         auto block_reduce2d_cross_warp_sync =
             Policy::template GetBlockReduce2dCrossWarpSync<Problem>();
@@ -177,7 +185,13 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass
                 y(idx) = type_convert<ComputeDataType>(y_);
             });

-            block_reduce2d(y, absmax, reduce_absmax_func);
+            constexpr auto x_size_per_row =
+                x.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(number<1>{});
+            if constexpr(UseMax3 && std::is_same_v<ComputeDataType, float> &&
+                         x_size_per_row % 2 == 0)
+                block_reduce2d(y, absmax, reduce_absmax3_func, sequence<1, 2>{});
+            else
+                block_reduce2d(y, absmax, reduce_absmax_func);

             if constexpr(kSaveX)
                 move_tile_window(x_window, {0, -Block_N});
...
@@ -25,6 +25,7 @@ struct SmoothquantPipelineOnePass
     static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
     static constexpr bool kPadM = false; // TODO - BlockSmoothquantProblem::kPadM
     static constexpr bool kPadN = Problem::kPadN;
+    static constexpr bool UseMax3 = true; // TODO - Move to trait

     static constexpr const char* name = []() {
         if constexpr(kNeedCrossWarpSync)
@@ -52,7 +53,15 @@ struct SmoothquantPipelineOnePass
             xscale_window_, Policy::template MakeXScaleBlockTileDistribution<Problem>());

         auto reduce_absmax_func = ReduceOp::AbsMax{};
+        auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) {
+            float rtn;
+            asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)"
+                         : "=v"(rtn)
+                         : "v"(acc_), "v"(v_0_), "v"(v_1_));
+            return rtn;
+        };
         auto reduce_max_func = ReduceOp::Max{};
         auto block_reduce2d = Policy::template GetBlockReduce2d<Problem>();
         auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
         auto block_reduce2d_cross_warp_sync =
@@ -68,8 +77,23 @@ struct SmoothquantPipelineOnePass
                                     xscale);

         // compute absmax, cross-lane->cross-warp
-        auto absmax = block_reduce2d(
-            y, reduce_absmax_func.GetIdentityValue<ComputeDataType>(), reduce_absmax_func);
+        auto absmax = [&]() {
+            constexpr auto x_size_per_row =
+                x.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(number<1>{});
+            if constexpr(UseMax3 && std::is_same_v<ComputeDataType, float> &&
+                         x_size_per_row % 2 == 0)
+            {
+                return block_reduce2d(y,
+                                      reduce_absmax_func.GetIdentityValue<ComputeDataType>(),
+                                      reduce_absmax3_func,
+                                      sequence<1, 2>{});
+            }
+            else
+            {
+                return block_reduce2d(
+                    y, reduce_absmax_func.GetIdentityValue<ComputeDataType>(), reduce_absmax_func);
+            }
+        }();
         block_reduce2d_sync(absmax, reduce_max_func);
         block_reduce2d_cross_warp_sync(absmax, smem, reduce_max_func);
...
@@ -25,6 +25,7 @@ struct SmoothquantPipelineTwoPass
     static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
     static constexpr bool kPadM = false; // TODO - BlockSmoothquantProblem::kPadM
     static constexpr bool kPadN = Problem::kPadN;
+    static constexpr bool UseMax3 = true; // TODO - Move to trait

     static constexpr const char* name = []() {
         if constexpr(kNeedCrossWarpSync)
@@ -56,6 +57,13 @@ struct SmoothquantPipelineTwoPass
             __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));

         auto reduce_absmax_func = ReduceOp::AbsMax{};
+        auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) {
+            float rtn;
+            asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)"
+                         : "=v"(rtn)
+                         : "v"(acc_), "v"(v_0_), "v"(v_1_));
+            return rtn;
+        };
         auto reduce_max_func = ReduceOp::Max{};
         auto block_reduce2d = Policy::template GetBlockReduce2d<Problem>();
         auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
@@ -77,7 +85,13 @@ struct SmoothquantPipelineTwoPass
                                         x,
                                         xscale);

-            block_reduce2d(y, absmax, reduce_absmax_func);
+            constexpr auto x_size_per_row =
+                x.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(number<1>{});
+            if constexpr(UseMax3 && std::is_same_v<ComputeDataType, float> &&
+                         x_size_per_row % 2 == 0)
+                block_reduce2d(y, absmax, reduce_absmax3_func, sequence<1, 2>{});
+            else
+                block_reduce2d(y, absmax, reduce_absmax_func);

             move_tile_window(x_window, {0, Block_N});
             move_tile_window(xscale_window, {Block_N});
...