Merge branch 'attn-bwd-dropout' into attn-fwd-train-dropout

cb2d4dbb · ltqin · 989e3d10 · 0e7aeef5 · cb2d4dbb · cb2d4dbb
Commit cb2d4dbb authored Feb 10, 2023 by ltqin
9 changed files
--- a/include/ck/utility/statically_indexed_array_multi_index.hpp
+++ b/include/ck/utility/statically_indexed_array_multi_index.hpp
@@ -100,6 +100,17 @@ __host__ __device__ constexpr auto operator*(const Tuple<Xs...>& x, const Y& y)
    return r;
 }
+// MultiIndex = MultiIndex * Number<N>
+// Element-wise product of the tuple x with the Number<N> operand y.
+template <typename... Xs, index_t N>
+__host__ __device__ constexpr auto operator*(const Tuple<Xs...>& x, const Number<N>& y)
+{
+    // The return value is whatever generate_tuple builds from the per-index products
+    // x[idx] * y, for as many indices as x has elements (sizeof...(Xs)). The return
+    // type is deduced rather than spelled as Tuple<Xs...>.
+    return generate_tuple([&](auto idx) { return x[idx] * y; }, Number<sizeof...(Xs)>{});
+}
 // MultiIndex = scalar * MultiIndex
 template <typename... Xs,
          typename Y,

--- a/include/ck/utility/thread_group.hpp
+++ b/include/ck/utility/thread_group.hpp
@@ -19,4 +19,37 @@ struct ThisThreadBlock
    __device__ static index_t GetThreadId() { return get_thread_local_1d_id(); }
 };
+// Thread-group descriptor that carries a pair of wave coordinates (mwave, nwave)
+// and can test whether that pair lies inside a rectangular range of waves.
+// ThreadPerBlock is the thread count reported by GetNumOfThread().
+// NOTE(review): mwave/nwave are presumably the wave index along the M and N
+// dimensions of the block tile -- confirm against the call sites.
+template <index_t ThreadPerBlock>
+struct SubThreadBlock
+{
+    static constexpr index_t kNumThread_ = ThreadPerBlock;
+    // Records the wave coordinates that IsBelong() later compares against.
+    __device__ SubThreadBlock(int mwave, int nwave) : mwave_(mwave), nwave_(nwave) {}
+    // Returns the ThreadPerBlock template argument.
+    __device__ static constexpr index_t GetNumOfThread() { return kNumThread_; }
+    // Returns true when the stored coordinates fall inside both ranges.
+    // Each range is indexed with I0 (lower bound, inclusive) and I1 (upper bound,
+    // exclusive), i.e. the test is
+    //   mwave_range[I0] <= mwave_ < mwave_range[I1]  and
+    //   nwave_range[I0] <= nwave_ < nwave_range[I1].
+    // Const-qualified because it only reads members; constexpr alone does not make
+    // a member function const, so without it the check is unusable on const objects.
+    template <typename TupleArg1, typename TupleArg2>
+    __device__ constexpr bool IsBelong(const TupleArg1& mwave_range,
+                                       const TupleArg2& nwave_range) const
+    {
+        // Short-circuit order matches the original if/else ladder: m lower, m upper,
+        // n lower, n upper.
+        return !(mwave_ < mwave_range[I0]) && !(mwave_ >= mwave_range[I1]) &&
+               !(nwave_ < nwave_range[I0]) && !(nwave_ >= nwave_range[I1]);
+    }
+    // Returns get_thread_local_1d_id(), the same value ThisThreadBlock::GetThreadId()
+    // returns; the id is not rebased to the sub-block.
+    __device__ static index_t GetThreadId() { return get_thread_local_1d_id(); }
+    private:
+    index_t mwave_, nwave_;
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+};
 } // namespace ck
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_dropout.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_dropout.hpp
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -58,3 +58,4 @@ add_subdirectory(batchnorm)
 if(GPU_TARGETS MATCHES "gfx1100") # wmma_op is only added when GPU_TARGETS matches gfx1100
    add_subdirectory(wmma_op)
 endif()
+add_subdirectory(host_tensor) # added unconditionally; pulls in test/host_tensor/CMakeLists.txt
--- a/test/host_tensor/CMakeLists.txt
+++ b/test/host_tensor/CMakeLists.txt
+add_gtest_executable(test_host_tensor test_host_tensor.cpp) # test target built from test_host_tensor.cpp
+target_link_libraries(test_host_tensor PRIVATE utility) # links the test against the `utility` target
\ No newline at end of file
--- a/test/host_tensor/test_host_tensor.cpp
+++ b/test/host_tensor/test_host_tensor.cpp
--- a/test/softmax/CMakeLists.txt
+++ b/test/softmax/CMakeLists.txt
--- a/test/softmax/test_softmax_host_ref.cpp
+++ b/test/softmax/test_softmax_host_ref.cpp