Move kernel launch to a seperate header

ed9ff879 · Paul · 0bedc5e8 · ed9ff879 · ed9ff879 · ed9ff879
Commit ed9ff879 authored Aug 23, 2018 by Paul
3 changed files
--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -16,7 +16,7 @@ add_library(migraph_device
 rocm_clang_tidy_check(migraph_device)
 target_link_libraries(migraph_device migraph hip::device)
 target_include_directories(migraph_device PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
-target_include_directories(migraph_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/device>)
+target_include_directories(migraph_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/device/include>)

 add_library(migraph_gpu
    eliminate_allocation.cpp

--- a/src/targets/gpu/device/contiguous.cpp
+++ b/src/targets/gpu/device/contiguous.cpp

-#include <hip/hip_runtime.h>
 #include <migraph/gpu/device/contiguous.hpp>
+#include <migraph/gpu/device/launch.hpp>

 namespace migraph {
 namespace gpu {
 namespace device {

-struct index
-{
-    std::size_t global;
-    std::size_t local;
-    std::size_t group;
-};
-
-template <class F>
-__global__ void launcher(F f)
-{
-    index idx{blockIdx.x * blockDim.x + threadIdx.x, threadIdx.x, blockIdx.x};
-    f(idx);
-}
-
-auto launch(std::size_t global, std::size_t local)
-{
-    return [=](auto f) {
-        assert(local > 0);
-        assert(global > 0);
-        using f_type = decltype(f);
-        dim3 nblocks(global / local);
-        dim3 nthreads(local);
-        hipLaunchKernelGGL((launcher<f_type>), nblocks, nthreads, 0, nullptr, f);
-    };
-}
-
 template <class F>
 void visit_tensor_size(std::size_t n, F f)
 {
@@ -86,7 +60,7 @@ struct hip_tensor_descriptor
        for(size_t i = 0; i < NDim; i++)
            strides[i] = strides_ext[i];
    }
-    __device__ __host__ hip_index<NDim> multi(size_t idx)
+    __device__ __host__ hip_index<NDim> multi(size_t idx) const
    {
        hip_index<NDim> result{};
        size_t tidx = idx;
@@ -97,7 +71,7 @@ struct hip_tensor_descriptor
        }
        return result;
    }
-    __device__ __host__ size_t linear(hip_index<NDim> s)
+    __device__ __host__ size_t linear(hip_index<NDim> s) const
    {
        size_t idx = 0;
        for(size_t i = 0; i < NDim; i++)
@@ -117,16 +91,9 @@ void contiguous(shape output_shape, argument arg, argument result)
            hip_tensor_descriptor<ndim> at_desc(output_shape.lens(), output_shape.strides());
            auto* a             = input.data();
            auto* at            = output.data();
-            auto nelements      = s.elements();
-            std::size_t nlocal  = 512;
-            std::size_t nglobal = 512 * nlocal;
-
-            launch(nglobal, nlocal)([=](auto idx) mutable {
-                for(size_t i = idx.global; i < nelements; i += nglobal)
-                {
-                    size_t lidx = a_desc.linear(at_desc.multi(i));
-                    at[i]       = a[lidx];
-                }
+            gs_launch(s.elements())([=](auto i) {
+                size_t lidx = a_desc.linear(at_desc.multi(i));
+                at[i]       = a[lidx];
            });
        });
    });

--- a/src/targets/gpu/device/include/migraph/gpu/device/launch.hpp
+++ b/src/targets/gpu/device/include/migraph/gpu/device/launch.hpp
+#ifndef MIGRAPH_GUARD_RTGLIB_DEVICE_LAUNCH_HPP
+#define MIGRAPH_GUARD_RTGLIB_DEVICE_LAUNCH_HPP
+
+#include <hip/hip_runtime.h>
+
+namespace migraph {
+namespace gpu {
+namespace device {
+
+struct index
+{
+    std::size_t global;
+    std::size_t local;
+    std::size_t group;
+};
+
+template <class F>
+__global__ void launcher(F f)
+{
+    index idx{blockIdx.x * blockDim.x + threadIdx.x, threadIdx.x, blockIdx.x};
+    f(idx);
+}
+
+auto launch(std::size_t global, std::size_t local)
+{
+    return [=](auto f) {
+        assert(local > 0);
+        assert(global > 0);
+        using f_type = decltype(f);
+        dim3 nblocks(global / local);
+        dim3 nthreads(local);
+        hipLaunchKernelGGL((launcher<f_type>), nblocks, nthreads, 0, nullptr, f);
+    };
+}
+
+auto gs_launch(std::size_t n, std::size_t local = 512)
+{
+    std::size_t groups = 1 + n / local;
+    std::size_t nglobal = std::min<std::size_t>(512, groups) * local;
+
+    return [=](auto f) {
+        launch(nglobal, local)([=](auto idx) {
+            for(size_t i = idx.global; i < n; i += nglobal)
+            {
+                f(i);
+            }
+        });
+    };
+}
+
+} // namespace device
+} // namespace gpu
+} // namespace migraph
+
+#endif