Adjust block_size for Navi: use the device wavefront size instead of a hard-coded 64 as the minimum block size

1e02e941 · Paul · 308db690 · 1e02e941 · 1e02e941 · 1e02e941
Commit 1e02e941 authored Nov 13, 2023 by Paul
4 changed files
--- a/src/targets/gpu/compile_hip_code_object.cpp
+++ b/src/targets/gpu/compile_hip_code_object.cpp
@@ -164,9 +164,9 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over)
    };
 }
-std::size_t compute_block_size(std::size_t n, std::size_t max_block_size)
+std::size_t compute_block_size(context& ctx, std::size_t n, std::size_t max_block_size)
 {
-    const std::size_t min_block_size = 64;
+    const std::size_t min_block_size = ctx.get_current_device().get_wavefront_size();
    auto block_size                  = (((n - 1) / min_block_size + 1)) * min_block_size;
    return std::min(std::max(min_block_size, block_size), max_block_size);
 }

--- a/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp
@@ -72,7 +72,7 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over = 1);
 MIGRAPHX_GPU_EXPORT operation compile_hip_code_object(const std::string& content,
                                                      hip_compile_options options);
-MIGRAPHX_GPU_EXPORT std::size_t compute_block_size(std::size_t n,
+MIGRAPHX_GPU_EXPORT std::size_t compute_block_size(context& ctx, std::size_t n,
                                                   std::size_t max_block_size = 1024);
 MIGRAPHX_GPU_EXPORT std::string generate_make_shape(const shape& s);

--- a/src/targets/gpu/jit/reduce.cpp
+++ b/src/targets/gpu/jit/reduce.cpp
@@ -166,7 +166,7 @@ struct simple_reduce_compiler : compiler<simple_reduce_compiler>
            auto relements  = get_reduce_elements(options.virtual_inputs) / vec.size;
            if(algo == "block")
            {
-                auto block_size = compute_block_size(relements, 256);
+                auto block_size = compute_block_size(ctx, relements, 256);
                if(relements >= block_size * 256)
                    algo = "block_large";
                options.set_launch_params(
@@ -274,7 +274,7 @@ struct fused_reduce_compiler : compiler<fused_reduce_compiler>
            auto relements  = reduction_shape.elements() / vec.size;
            if(algo == "block")
            {
-                auto block_size = compute_block_size(relements, 256);
+                auto block_size = compute_block_size(ctx, relements, 256);
                if(relements >= block_size * 256)
                    algo = "block_large";
                options.set_launch_params(

--- a/src/targets/gpu/jit/softmax.cpp
+++ b/src/targets/gpu/jit/softmax.cpp
@@ -75,7 +75,7 @@ struct softmax_compiler : compiler<softmax_compiler>
        }
        auto relements  = inputs[0].lens()[axis] / vec.size;
        auto nelements  = (inputs.back().elements() / inputs[0].lens()[axis]);
-        auto block_size = compute_block_size(relements, 256);
+        auto block_size = compute_block_size(ctx, relements, 256);
        hip_compile_options options;
        options.set_launch_params(
            v, compute_global_for(ctx, nelements * block_size, 256), block_size);