Unverified commit 56b3bf58, authored by Paul Fultz II and committed by GitHub

Improve layernorm performance (#613)

* Use increment instead of division to compute register offset

* Formatting

* Limit layernorm to 1024 elements

* Formatting

* Add verification to driver

* Formatting

* Remove early return

* Use block_size 256

* Vectorize the kernel

* Formatting

* Convert to vector type

* Add layernorm tests

* Formatting

* Formatting

* Refactor layernorm to run both algos

* Formatting

* Fix compile error

* Fix tidy warnings

* Formatting

* Add layernorm function

* Formatting
parent 1ca3c133
@@ -33,6 +33,7 @@ add_library(migraphx
simplify_algebra.cpp
simplify_reshapes.cpp
value.cpp
verify_args.cpp
json.cpp
opt/memory_coloring.cpp
opt/memory_coloring_impl.cpp
......
@@ -115,11 +115,34 @@ struct loader
}
};
struct program_params
{
std::vector<std::string> fill0{};
std::vector<std::string> fill1{};
void parse(argument_parser& ap)
{
ap(fill0, {"--fill0"}, ap.help("Fill parameter with 0s"), ap.append());
ap(fill1, {"--fill1"}, ap.help("Fill parameter with 1s"), ap.append());
}
auto generate(const program& p, bool use_gpu)
{
program::parameter_map m;
for(auto&& s : fill0)
m[s] = fill_argument(p.get_parameter_shape(s), 0);
for(auto&& s : fill1)
m[s] = fill_argument(p.get_parameter_shape(s), 1);
fill_param_map(m, p, use_gpu);
return m;
}
};
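The `--fill0`/`--fill1` handling is now factored into `program_params` and shared by the compile and verify commands. A minimal sketch of using it programmatically, assuming `p` is an already-loaded migraphx::program (the parameter names here are hypothetical):

    program_params params;
    params.fill0 = {"attention_mask"};   // same effect as --fill0 attention_mask
    params.fill1 = {"token_type_ids"};   // same effect as --fill1 token_type_ids
    auto inputs = params.generate(p, /*use_gpu=*/false); // remaining inputs are generated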
struct compiler
{
static const int q_fp16 = 1;
static const int q_int8 = 2;
loader l;
program_params parameters;
bool gpu = true;
bool offload_copy = false;
int quantize = 0;
@@ -129,6 +152,7 @@ struct compiler
void parse(argument_parser& ap)
{
l.parse(ap);
parameters.parse(ap);
ap(gpu, {"--gpu"}, ap.help("Compile on the gpu"), ap.set_value(true));
ap(gpu, {"--cpu"}, ap.help("Compile on the cpu"), ap.set_value(false));
ap(offload_copy,
@@ -137,20 +161,11 @@ struct compiler
ap.set_value(true));
ap(quantize, {"--fp16"}, ap.help("Quantize for fp16"), ap.set_value(q_fp16));
ap(quantize, {"--int8"}, ap.help("Quantize for int8"), ap.set_value(q_int8));
ap(fill0, {"--fill0"}, ap.help("Fill parameter with 0s"), ap.append());
ap(fill1, {"--fill1"}, ap.help("Fill parameter with 1s"), ap.append());
}
auto params(const program& p, bool use_gpu = true)
{
bool gpu_flag = use_gpu && gpu && !offload_copy;
program::parameter_map m;
for(auto&& s : fill0)
m[s] = fill_argument(p.get_parameter_shape(s), 0);
for(auto&& s : fill1)
m[s] = fill_argument(p.get_parameter_shape(s), 1);
fill_param_map(m, p, gpu_flag);
return m;
return parameters.generate(p, use_gpu && gpu && !offload_copy);
}
program compile()
@@ -228,6 +243,7 @@ struct params : command<params>
struct verify : command<verify>
{
loader l;
program_params parameters;
double tolerance = 80;
bool per_instruction = false;
bool reduce = false;
@@ -235,6 +251,7 @@ struct verify : command<verify>
void parse(argument_parser& ap)
{
l.parse(ap);
parameters.parse(ap);
ap(offload_copy,
{"--enable-offload-copy"},
ap.help("Enable implicit offload copying"),
@@ -255,17 +272,19 @@ struct verify : command<verify>
compile_options options;
options.offload_copy = offload_copy;
auto m = parameters.generate(p, false);
if(per_instruction)
{
verify_instructions(p, options, tolerance);
}
else if(reduce)
{
verify_reduced_program(p, options, tolerance);
verify_reduced_program(p, options, m, tolerance);
}
else
{
verify_program(l.file, p, options, tolerance);
verify_program(l.file, p, options, m, tolerance);
}
}
};
......
@@ -11,13 +11,19 @@ namespace migraphx {
namespace driver {
inline namespace MIGRAPHX_INLINE_NS {
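// Hash the parameter name into a seed so each input gets distinct, reproducible data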
template <class T>
auto get_hash(const T& x)
{
return std::hash<T>{}(x);
}
program::parameter_map fill_param_map(program::parameter_map& m, const program& p, bool gpu)
{
for(auto&& x : p.get_parameter_shapes())
{
argument& arg = m[x.first];
if(arg.empty())
arg = generate_argument(x.second);
arg = generate_argument(x.second, get_hash(x.first));
#ifdef HAVE_GPU
if(gpu)
arg = gpu::to_gpu(arg);
@@ -35,12 +41,12 @@ program::parameter_map create_param_map(const program& p, bool gpu)
{
#ifdef HAVE_GPU
if(gpu)
m[x.first] = gpu::to_gpu(generate_argument(x.second));
m[x.first] = gpu::to_gpu(generate_argument(x.second, get_hash(x.first)));
else
#else
(void)gpu;
#endif
m[x.first] = generate_argument(x.second);
m[x.first] = generate_argument(x.second, get_hash(x.first));
}
return m;
}
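Seeding `generate_argument` with a hash of the parameter name means two parameters with the same shape no longer receive identical data, while runs stay deterministic. A small sketch of the effect (shape chosen arbitrarily):

    auto s = migraphx::shape{migraphx::shape::float_type, {4}};
    auto a = migraphx::generate_argument(s, std::hash<std::string>{}("x"));
    auto b = migraphx::generate_argument(s, std::hash<std::string>{}("y"));
    // a and b differ (different seeds), but each is stable from run to run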
......
#include "verify.hpp"
#include "perf.hpp"
#include <migraphx/cpu/target.hpp>
#include <migraphx/generate.hpp>
@@ -15,26 +16,16 @@ namespace migraphx {
namespace driver {
inline namespace MIGRAPHX_INLINE_NS {
template <class T>
auto get_hash(const T& x)
{
return std::hash<T>{}(x);
}
std::vector<argument> run_cpu(program p)
std::vector<argument> run_cpu(program p, const program::parameter_map& inputs)
{
p.compile(cpu::target{});
program::parameter_map m;
for(auto&& x : p.get_parameter_shapes())
{
m[x.first] = generate_argument(x.second, get_hash(x.first));
}
auto out = p.eval(m);
auto out = p.eval(inputs);
std::cout << p << std::endl;
return out;
}
std::vector<argument> run_gpu(program p, const compile_options& options)
std::vector<argument>
run_gpu(program p, const compile_options& options, const program::parameter_map& inputs)
{
#ifdef HAVE_GPU
p.compile(gpu::target{}, options);
@@ -42,7 +33,7 @@ std::vector<argument> run_gpu(program p, const compile_options& options)
program::parameter_map m;
for(auto&& x : p.get_parameter_shapes())
{
auto arg = generate_argument(x.second, get_hash(x.first));
auto arg = inputs.count(x.first) == 0 ? generate_argument(x.second) : inputs.at(x.first);
m[x.first] = options.offload_copy ? arg : gpu::to_gpu(arg);
}
auto gpu_out = p.eval(m);
@@ -56,6 +47,7 @@ std::vector<argument> run_gpu(program p, const compile_options& options)
#else
(void)p;
(void)options;
(void)inputs;
MIGRAPHX_THROW("Gpu unsupported!");
#endif
}
@@ -63,10 +55,11 @@ std::vector<argument> run_gpu(program p, const compile_options& options)
void verify_program(const std::string& name,
const program& p,
compile_options options,
const program::parameter_map& inputs,
double tolerance)
{
auto x = run_cpu(p);
auto y = run_gpu(p, options);
auto x = run_cpu(p, inputs);
auto y = run_gpu(p, options, inputs);
std::size_t output_num = x.size();
for(std::size_t i = 0; i < output_num; ++i)
@@ -105,7 +98,7 @@ void verify_instructions(const program& prog, compile_options options, double to
{
std::cout << "Verify: " << ins.name() << std::endl;
std::cout << p << std::endl;
verify_program(ins.name(), p, options, tolerance);
verify_program(ins.name(), p, options, create_param_map(p, false), tolerance);
}
catch(...)
{
@@ -115,21 +108,28 @@ void verify_instructions(const program& prog, compile_options options, double to
}
}
void verify_reduced(program p, int n, compile_options options, double tolerance)
void verify_reduced(program p,
int n,
compile_options options,
const program::parameter_map& inputs,
double tolerance)
{
auto last = std::prev(p.end(), n + 1);
p.remove_instructions(last, p.end());
std::cout << "Verify: " << std::endl;
std::cout << p << std::endl;
verify_program(std::to_string(n), p, options, tolerance);
verify_program(std::to_string(n), p, options, inputs, tolerance);
}
void verify_reduced_program(const program& p, compile_options options, double tolerance)
void verify_reduced_program(const program& p,
compile_options options,
const program::parameter_map& inputs,
double tolerance)
{
auto n = std::distance(p.begin(), p.end());
for(std::size_t i = 0; i < n; i++)
{
verify_reduced(p, i, options, tolerance);
verify_reduced(p, i, options, inputs, tolerance);
}
}
......
@@ -11,14 +11,16 @@ std::vector<argument> run_cpu(program p);
std::vector<argument> run_gpu(program p);
void verify_program(const std::string& name,
const program& p,
compile_options options = compile_options{},
double tolerance = 100);
compile_options options = compile_options{},
const program::parameter_map& inputs = {},
double tolerance = 100);
void verify_instructions(const program& prog,
compile_options options = compile_options{},
double tolerance = 80);
void verify_reduced_program(const program& p,
compile_options options = compile_options{},
double tolerance = 80);
compile_options options = compile_options{},
const program::parameter_map& inputs = {},
double tolerance = 80);
} // namespace MIGRAPHX_INLINE_NS
} // namespace driver
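With the new `inputs` parameter a caller can pin particular arguments while the rest are generated. A usage sketch (the program and parameter names here are made up):

    migraphx::shape s{migraphx::shape::float_type, {1, 4, 24}};
    migraphx::program::parameter_map inputs;
    inputs["mask"] = migraphx::fill_argument(s, 1);
    verify_program("example", p, migraphx::compile_options{}, inputs, 80);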
......
@@ -8,81 +8,10 @@
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
inline bool verify_args(const std::string& name,
const argument& cpu_arg,
const argument& gpu_arg,
double tolerance = 80)
{
bool passed = true;
visit_all(cpu_arg, gpu_arg)([&](auto cpu, auto gpu) {
double error;
passed = verify_range(cpu, gpu, tolerance, &error);
if(not passed)
{
// TODO: Check for nans
std::cout << "FAILED: " << name << std::endl;
std::cout << "error: " << error << std::endl;
if(cpu.size() < 32)
std::cout << "cpu:" << cpu << std::endl;
if(gpu.size() < 32)
std::cout << "gpu:" << gpu << std::endl;
if(range_zero(cpu))
std::cout << "Cpu data is all zeros" << std::endl;
if(range_zero(gpu))
std::cout << "Gpu data is all zeros" << std::endl;
auto mxdiff = max_diff(cpu, gpu);
std::cout << "Max diff: " << mxdiff << std::endl;
auto idx = mismatch_idx(cpu, gpu, float_equal);
if(idx < range_distance(cpu))
{
std::cout << "Mismatch at " << idx << ": " << cpu[idx] << " != " << gpu[idx]
<< std::endl;
}
auto cpu_nan_idx = find_idx(cpu, not_finite);
if(cpu_nan_idx >= 0)
std::cout << "Non finite number found in cpu at " << cpu_nan_idx << ": "
<< cpu[cpu_nan_idx] << std::endl;
auto gpu_nan_idx = find_idx(gpu, not_finite);
if(gpu_nan_idx >= 0)
std::cout << "Non finite number found in gpu at " << gpu_nan_idx << ": "
<< gpu[gpu_nan_idx] << std::endl;
std::cout << std::endl;
}
else
{
if(range_zero(cpu))
std::cout << "Cpu data is all zeros" << std::endl;
if(range_zero(gpu))
std::cout << "Gpu data is all zeros" << std::endl;
// auto mxdiff = max_diff(cpu, gpu);
// std::cout << "Max diff: " << mxdiff << std::endl;
// auto idx = mismatch_idx(cpu, gpu, float_equal);
// if(idx < range_distance(cpu))
// {
// std::cout << "Mismatch at " << idx << ": " << cpu[idx] << " != " << gpu[idx]
// << std::endl;
// }
auto cpu_nan_idx = find_idx(cpu, not_finite);
if(cpu_nan_idx >= 0)
std::cout << "Non finite number found in cpu at " << cpu_nan_idx << ": "
<< cpu[cpu_nan_idx] << std::endl;
auto gpu_nan_idx = find_idx(gpu, not_finite);
if(gpu_nan_idx >= 0)
std::cout << "Non finite number found in gpu at " << gpu_nan_idx << ": "
<< gpu[gpu_nan_idx] << std::endl;
// std::cout << std::endl;
}
});
return passed;
}
bool verify_args(const std::string& name,
const argument& cpu_arg,
const argument& gpu_arg,
double tolerance = 80);
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......
@@ -8,57 +8,132 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
template <class T>
struct vector_type
{
};
template <class T, index_int N>
struct vector_type<vec<T, N>>
{
using type = T;
};
template <class T>
using vector_type_t = typename vector_type<T>::type;
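The trait just recovers the element type of a device vector; illustratively:

    static_assert(std::is_same<vector_type_t<vec<float, 4>>, float>{}, "");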
// m = x - mean(x)
// m / sqrt(mean(m ^ 2) + 1e-12)
void layernorm(hipStream_t stream, const argument& result, const argument& arg1)
template <index_int N>
void layernorm_vec_impl(hipStream_t stream,
const argument& result,
const argument& arg1,
index_int nelements,
index_int relements)
{
auto relements = arg1.get_shape().lens().back();
assert(relements <= 1024);
auto nelements = result.get_shape().elements() / relements;
auto input_shape = arg1.get_shape();
auto output_shape = result.get_shape();
auto reduce_output_lens(output_shape.lens());
reduce_output_lens.back() = 1;
hip_vec_visit_all<N>(result, arg1)([&](auto output, auto input) {
using value_type = typename decltype(input)::value_type;
const auto relements_v = relements / N;
const std::size_t max_block_size = 256;
const std::size_t block_size = compute_block_size(relements_v, max_block_size);
const std::size_t block_size_div = encode_divisor(block_size);
assert(relements_v <= block_size);
gs_launch(stream, nelements * block_size, block_size)([=](auto i, auto idx) __device__ {
const auto out_idx = fast_div(i, block_size_div);
const auto base_idx = out_idx * relements_v;
const auto input_idx = base_idx + idx.local;
const bool in_range = idx.local < relements_v;
auto mean = [&](auto z) {
auto psum = block_reduce<max_block_size>(
idx, sum{}, value_type(0), relements_v, [=](auto) { return z; });
vector_type_t<value_type> sum = 0;
for(index_int k = 0; k < N; k++)
sum += psum[k];
return sum / relements;
};
std::vector<index_int> reduce_lens = get_reduce_lens(input_shape.lens(), reduce_output_lens);
// m = x - mean(x)
value_type x = in_range ? input.data()[input_idx] : 0;
value_type m = x - mean(x);
// mean(m ^ 2) + 1e-12
value_type r = mean(m * m) + value_type(1e-12);
// rsqrt(mean(m ^ 2) + 1e-12)
value_type d = 0;
for(index_int k = 0; k < N; k++)
d[k] = ::rsqrt(r[k]);
// m * rsqrt(mean(m ^ 2) + 1e-12)
if(in_range)
output.data()[input_idx] = m * d;
});
});
}
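Both kernels replace the per-iteration `i / block_size` with `fast_div(i, block_size_div)`, multiplying by a reciprocal precomputed once on the host. A minimal sketch of the idea, not MIGraphX's actual encode_divisor/fast_div:

    #include <cstdint>

    // Precompute ceil(2^32 / divisor); one multiply and a shift then replace the divide.
    constexpr std::uint64_t encode_divisor(std::uint32_t divisor) // divisor > 0
    {
        return (std::uint64_t{1} << 32) / divisor + 1;
    }

    // Exact whenever x * divisor < 2^32; wider intermediates cover the full range.
    constexpr std::uint32_t fast_div(std::uint32_t x, std::uint64_t encoded)
    {
        return static_cast<std::uint32_t>((x * encoded) >> 32);
    }

    static_assert(fast_div(1000, encode_divisor(256)) == 1000 / 256, "");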
void layernorm_impl(hipStream_t stream,
const argument& result,
const argument& arg1,
index_int nelements,
index_int relements)
{
hip_visit_all(result, arg1)([&](auto output, auto input) {
using value_type = typename decltype(input)::value_type;
const std::size_t max_block_size = 256;
const std::size_t block_size = compute_block_size(relements, max_block_size);
const std::size_t block_size_div = encode_divisor(block_size);
assert(relements <= block_size);
gs_launch(stream, nelements * block_size, block_size)([=](auto i, auto idx) __device__ {
const auto out_idx = i / block_size;
const auto base_idx = out_idx * relements;
value_type x_data[4];
auto x = [&](auto j) -> value_type& {
return x_data[fast_div(j - idx.local, block_size_div)];
};
idx.local_stride(relements,
[&](auto j) __device__ { x(j) = input.data()[base_idx + j]; });
const auto out_idx = fast_div(i, block_size_div);
const auto base_idx = out_idx * relements;
const auto input_idx = base_idx + idx.local;
const bool in_range = idx.local < relements;
auto m = block_reduce<max_block_size>(
idx, sum{}, 0, relements, [&](auto j) __device__ { return x(j); }) /
relements;
idx.local_stride(relements, [&](auto j) __device__ { x(j) = x(j) - m; });
auto mean = [&](auto z) {
return block_reduce<max_block_size>(idx,
sum{},
value_type(0),
relements,
[=](auto) { return in_range ? z : 0; }) /
relements;
};
auto r = block_reduce<max_block_size>(
idx, sum{}, 0, relements, [&](auto j) __device__ { return x(j) * x(j); }) /
relements;
// m = x - mean(x)
value_type x = in_range ? input.data()[input_idx] : 0;
value_type m = x - mean(x);
idx.local_stride(relements, [&](auto j) __device__ {
output.data()[base_idx + j] = x(j) * ::rsqrt(r + 1e-12);
});
// mean(m ^ 2) + 1e-12
value_type r = mean(m * m) + 1e-12;
// m * rsqrt(mean(m ^ 2) + 1e-12)
if(in_range)
output.data()[input_idx] = m * ::rsqrt(r);
});
});
}
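Both kernels lean on `block_reduce` for the row-wide sums. A simplified sketch of such a primitive, not MIGraphX's implementation (assumes blockDim.x is a power of two and at most MaxBlock):

    template <unsigned MaxBlock, class T, class Op, class F>
    __device__ T block_reduce_sketch(unsigned local, Op op, T init, unsigned n, F f)
    {
        __shared__ T buf[MaxBlock];
        T v = init;
        for(unsigned j = local; j < n; j += blockDim.x) // strided partial reduction
            v = op(v, f(j));
        buf[local] = v;
        __syncthreads();
        for(unsigned s = blockDim.x / 2; s > 0; s /= 2) // shared-memory tree
        {
            if(local < s)
                buf[local] = op(buf[local], buf[local + s]);
            __syncthreads();
        }
        return buf[0]; // every thread sees the block-wide result
    }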
void layernorm(hipStream_t stream, const argument& result, const argument& arg1)
{
auto relements = arg1.get_shape().lens().back();
auto nelements = result.get_shape().elements() / relements;
auto output_shape = result.get_shape();
auto reduce_output_lens(output_shape.lens());
reduce_output_lens.back() = 1;
if((relements % 4) == 0)
layernorm_vec_impl<4>(stream, result, arg1, nelements, relements);
else if(relements < 256)
layernorm_impl(stream, result, arg1, nelements, relements);
else
MIGRAPHX_THROW("No kernel for layernorm");
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
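Dispatch picks the vectorized kernel whenever the row length is a multiple of 4 (up to 1024 elements, per the assert) and falls back to the scalar kernel below 256. For reference, the math both kernels implement, as a plain host-side sketch over one row of relements values:

    #include <cmath>
    #include <vector>

    // y = (x - mean(x)) * rsqrt(mean((x - mean(x))^2) + 1e-12)
    std::vector<float> layernorm_row(const std::vector<float>& x)
    {
        float mean = 0;
        for(float v : x)
            mean += v;
        mean /= x.size();
        float var = 0; // mean of squared deviations
        for(float v : x)
            var += (v - mean) * (v - mean);
        var /= x.size();
        const float d = 1.0f / std::sqrt(var + 1e-12f);
        std::vector<float> y(x.size());
        for(std::size_t i = 0; i < x.size(); i++)
            y[i] = (x[i] - mean) * d;
        return y;
    }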
......
@@ -233,6 +233,8 @@ MIGRAPHX_REGISTER_OP(hip_add_tanh)
struct hip_layernorm : unary_device<hip_layernorm, &device::layernorm>
{
// Empty finalize to skip dimension reduction
void finalize(context&, const shape&, const std::vector<shape>&) {}
};
MIGRAPHX_REGISTER_OP(hip_layernorm)
@@ -326,6 +328,15 @@ struct find_layernorm
auto x_ins = r.instructions["x"];
auto args = ins->inputs();
// We don't fuse for non-standard layouts
if(not x_ins->get_shape().standard())
return;
auto relements = x_ins->get_shape().lens().back();
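// Stay within the kernel limits: the vectorized path needs relements % 4 == 0
// (up to 1024 elements); the scalar path covers the rest below 256.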
if(relements > 1024 or (relements % 4 != 0 and relements > 256))
return;
p.replace_instruction(ins, hip_layernorm{}, x_ins, args.back());
}
};
......
#include <migraphx/verify_args.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
bool verify_args(const std::string& name,
const argument& cpu_arg,
const argument& gpu_arg,
double tolerance)
{
bool passed = true;
visit_all(cpu_arg, gpu_arg)([&](auto cpu, auto gpu) {
double error;
passed = verify_range(cpu, gpu, tolerance, &error);
if(not passed)
{
// TODO: Check for nans
std::cout << "FAILED: " << name << std::endl;
std::cout << "error: " << error << std::endl;
if(cpu.size() < 32)
std::cout << "cpu:" << cpu << std::endl;
if(gpu.size() < 32)
std::cout << "gpu:" << gpu << std::endl;
if(range_zero(cpu))
std::cout << "Cpu data is all zeros" << std::endl;
if(range_zero(gpu))
std::cout << "Gpu data is all zeros" << std::endl;
auto mxdiff = max_diff(cpu, gpu);
std::cout << "Max diff: " << mxdiff << std::endl;
auto idx = mismatch_idx(cpu, gpu, float_equal);
if(idx < range_distance(cpu))
{
std::cout << "Mismatch at " << idx << ": " << cpu[idx] << " != " << gpu[idx]
<< std::endl;
}
auto cpu_nan_idx = find_idx(cpu, not_finite);
if(cpu_nan_idx >= 0)
std::cout << "Non finite number found in cpu at " << cpu_nan_idx << ": "
<< cpu[cpu_nan_idx] << std::endl;
auto gpu_nan_idx = find_idx(gpu, not_finite);
if(gpu_nan_idx >= 0)
std::cout << "Non finite number found in gpu at " << gpu_nan_idx << ": "
<< gpu[gpu_nan_idx] << std::endl;
std::cout << std::endl;
}
else
{
if(range_zero(cpu))
std::cout << "Cpu data is all zeros" << std::endl;
if(range_zero(gpu))
std::cout << "Gpu data is all zeros" << std::endl;
// auto mxdiff = max_diff(cpu, gpu);
// std::cout << "Max diff: " << mxdiff << std::endl;
// auto idx = mismatch_idx(cpu, gpu, float_equal);
// if(idx < range_distance(cpu))
// {
// std::cout << "Mismatch at " << idx << ": " << cpu[idx] << " != " << gpu[idx]
// << std::endl;
// }
auto cpu_nan_idx = find_idx(cpu, not_finite);
if(cpu_nan_idx >= 0)
std::cout << "Non finite number found in cpu at " << cpu_nan_idx << ": "
<< cpu[cpu_nan_idx] << std::endl;
auto gpu_nan_idx = find_idx(gpu, not_finite);
if(gpu_nan_idx >= 0)
std::cout << "Non finite number found in gpu at " << gpu_nan_idx << ": "
<< gpu[gpu_nan_idx] << std::endl;
// std::cout << std::endl;
}
});
return passed;
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
@@ -1028,33 +1028,50 @@ struct test_triadd_tanh : verify_program<test_triadd_tanh>
}
};
migraphx::instruction_ref add_layernorm(migraphx::program& p, std::vector<size_t> dims)
{
auto x = p.add_parameter("x", migraphx::shape{migraphx::shape::float_type, dims});
auto scale =
p.add_parameter("scale", migraphx::shape{migraphx::shape::float_type, {dims.back()}});
auto bias =
p.add_parameter("bias", migraphx::shape{migraphx::shape::float_type, {dims.back()}});
auto epsilon = p.add_literal(1e-12f);
auto exponent = p.add_literal(2.0f);
auto mean = p.add_instruction(migraphx::op::reduce_mean({2}), x);
auto mean_mbcast = p.add_instruction(migraphx::op::multibroadcast{{dims}}, mean);
auto sub = p.add_instruction(migraphx::op::sub{}, x, mean_mbcast);
auto exponent_mbcast = p.add_instruction(migraphx::op::multibroadcast{{dims}}, exponent);
auto pow = p.add_instruction(migraphx::op::pow{}, sub, exponent_mbcast);
auto var = p.add_instruction(migraphx::op::reduce_mean({2}), pow);
auto epsilon_mbcast =
p.add_instruction(migraphx::op::multibroadcast{{1, dims.at(1), 1}}, epsilon);
auto add_epsilon = p.add_instruction(migraphx::op::add{}, var, epsilon_mbcast);
auto sqrt = p.add_instruction(migraphx::op::sqrt{}, add_epsilon);
auto sqrt_mbcast = p.add_instruction(migraphx::op::multibroadcast{dims}, sqrt);
auto div = p.add_instruction(migraphx::op::div{}, sub, sqrt_mbcast);
auto scale_mbcast = p.add_instruction(migraphx::op::multibroadcast{dims}, scale);
auto mul = p.add_instruction(migraphx::op::mul{}, scale_mbcast, div);
auto bias_mbcast = p.add_instruction(migraphx::op::multibroadcast{dims}, bias);
return p.add_instruction(migraphx::op::add{}, mul, bias_mbcast);
}
struct test_layernorm : verify_program<test_layernorm>
{
migraphx::program create_program() const
{
migraphx::program p;
std::vector<size_t> dims{1, 1, 5};
auto x = p.add_parameter("x", migraphx::shape{migraphx::shape::float_type, dims});
auto scale = p.add_parameter("scale", migraphx::shape{migraphx::shape::float_type, {5}});
auto bias = p.add_parameter("bias", migraphx::shape{migraphx::shape::float_type, {5}});
auto epsilon = p.add_literal(1e-12f);
auto exponent = p.add_literal(migraphx::literal{
migraphx::shape{migraphx::shape::float_type, {1, 1, 5}}, {2, 2, 2, 2, 2}});
auto mean = p.add_instruction(migraphx::op::reduce_mean({2}), x);
auto mean_mbcast = p.add_instruction(migraphx::op::multibroadcast{{dims}}, mean);
auto sub = p.add_instruction(migraphx::op::sub{}, x, mean_mbcast);
auto pow = p.add_instruction(migraphx::op::pow{}, sub, exponent);
auto var = p.add_instruction(migraphx::op::reduce_mean({2}), pow);
auto epsilon_mbcast = p.add_instruction(migraphx::op::multibroadcast{{1, 1, 1}}, epsilon);
auto add_epsilon = p.add_instruction(migraphx::op::add{}, var, epsilon_mbcast);
auto sqrt = p.add_instruction(migraphx::op::sqrt{}, add_epsilon);
auto sqrt_mbcast = p.add_instruction(migraphx::op::multibroadcast{dims}, sqrt);
auto div = p.add_instruction(migraphx::op::div{}, sub, sqrt_mbcast);
auto scale_mbcast = p.add_instruction(migraphx::op::multibroadcast{dims}, scale);
auto mul = p.add_instruction(migraphx::op::mul{}, scale_mbcast, div);
auto bias_mbcast = p.add_instruction(migraphx::op::multibroadcast{dims}, bias);
p.add_instruction(migraphx::op::add{}, mul, bias_mbcast);
add_layernorm(p, {1, 1, 5});
return p;
}
};
struct test_layernorm2 : verify_program<test_layernorm2>
{
migraphx::program create_program() const
{
migraphx::program p;
add_layernorm(p, {1, 4, 24});
return p;
}
};
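A further case one might add on the same pattern (hypothetical, not part of this commit), exercising the widest row the vectorized kernel accepts:

    struct test_layernorm_1024 : verify_program<test_layernorm_1024>
    {
        migraphx::program create_program() const
        {
            migraphx::program p;
            add_layernorm(p, {1, 2, 1024}); // 1024 % 4 == 0 and <= 1024
            return p;
        }
    };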
......