Commit 78be5944 authored by Paul's avatar Paul
Browse files

Imprve indexing and syncthreads

parent ecfb0b72
...@@ -43,6 +43,7 @@ inline namespace MIGRAPHX_INLINE_NS { ...@@ -43,6 +43,7 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu { namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DEBUG); MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DEBUG);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DEBUG_SYM);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_OPTIMIZE); MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_OPTIMIZE);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_ASM); MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_ASM);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_SRC); MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_SRC);
...@@ -227,6 +228,8 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std ...@@ -227,6 +228,8 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
if(params.find("-std=") == std::string::npos) if(params.find("-std=") == std::string::npos)
params += " --std=c++17"; params += " --std=c++17";
params += " -fno-gpu-rdc"; params += " -fno-gpu-rdc";
if(enabled(MIGRAPHX_GPU_DEBUG_SYM{}))
params += " -g";
params += " -c"; params += " -c";
if(is_hcc_compiler()) if(is_hcc_compiler())
{ {
......
...@@ -76,6 +76,7 @@ namespace migraphx { ...@@ -76,6 +76,7 @@ namespace migraphx {
template <class T, index_int N> template <class T, index_int N>
struct array struct array
{ {
using type = T;
T d[N]; T d[N];
constexpr T& operator[](index_int i) constexpr T& operator[](index_int i)
{ {
...@@ -208,6 +209,15 @@ struct integral_const_array : array<T, sizeof...(Xs)> ...@@ -208,6 +209,15 @@ struct integral_const_array : array<T, sizeof...(Xs)>
MIGRAPHX_DEVICE_CONSTEXPR integral_const_array() : base_array({Xs...}) {} MIGRAPHX_DEVICE_CONSTEXPR integral_const_array() : base_array({Xs...}) {}
}; };
template<class F>
constexpr auto return_const_array(F f)
{
constexpr const auto a = f();
return sequence(a.size(), [=](auto... is) {
return integral_const_array<typename decltype(a)::type, a[is]...>{};
});
}
template <class T, T... Xs, class F> template <class T, T... Xs, class F>
constexpr auto transform(integral_const_array<T, Xs...>, F f) constexpr auto transform(integral_const_array<T, Xs...>, F f)
{ {
......
...@@ -111,6 +111,12 @@ struct gens<1> : seq<0> ...@@ -111,6 +111,12 @@ struct gens<1> : seq<0>
{ {
}; };
template <class F, index_int... Ns>
constexpr void repeat_c_impl(F f, seq<Ns...>)
{
swallow{(f(std::integral_constant<std::size_t, Ns>{}), 0)...};
}
template <class F, index_int... Ns> template <class F, index_int... Ns>
constexpr auto sequence_c_impl(F&& f, seq<Ns...>) constexpr auto sequence_c_impl(F&& f, seq<Ns...>)
{ {
...@@ -143,6 +149,18 @@ constexpr auto sequence(IntegerConstant ic, F&& f) ...@@ -143,6 +149,18 @@ constexpr auto sequence(IntegerConstant ic, F&& f)
return sequence_c<ic>(f); return sequence_c<ic>(f);
} }
template <std::size_t N, class F>
constexpr void repeat_c(F f)
{
detail::repeat_c_impl(f, detail::gens<N>{});
}
template <class IntegerConstant, class F>
constexpr auto repeat(IntegerConstant ic, F&& f)
{
return repeat_c<ic>(f);
}
template <class F, class G> template <class F, class G>
constexpr auto by(F f, G g) constexpr auto by(F f, G g)
{ {
......
...@@ -54,8 +54,8 @@ struct index ...@@ -54,8 +54,8 @@ struct index
} }
#endif #endif
template <class F> template <class N, class F>
__device__ void global_stride(index_int n, F f) const __device__ void global_stride(N n, F f) const
{ {
const auto stride = nglobal(); const auto stride = nglobal();
for(index_int i = global; i < n; i += stride) for(index_int i = global; i < n; i += stride)
...@@ -64,8 +64,8 @@ struct index ...@@ -64,8 +64,8 @@ struct index
} }
} }
template <class F> template <class N, class F>
__device__ void local_stride(index_int n, F f) const __device__ void local_stride(N n, F f) const
{ {
const auto stride = nlocal(); const auto stride = nlocal();
for(index_int i = local; i < n; i += stride) for(index_int i = local; i < n; i += stride)
......
...@@ -186,7 +186,8 @@ __device__ auto auto_preload(index idx) ...@@ -186,7 +186,8 @@ __device__ auto auto_preload(index idx)
{ {
return make_transform([=](auto f, auto... xs) { return make_transform([=](auto f, auto... xs) {
auto invoke = [=](auto... ys) { auto invoke = [=](auto... ys) {
__syncthreads(); if constexpr((Bs or ...))
__syncthreads();
f(ys...); f(ys...);
}; };
join(invoke, preload_copy<Bs>(idx, xs)...); join(invoke, preload_copy<Bs>(idx, xs)...);
......
...@@ -74,6 +74,20 @@ struct shape ...@@ -74,6 +74,20 @@ struct shape
constexpr auto standard() const { return packed() and not transposed(); } constexpr auto standard() const { return packed() and not transposed(); }
static constexpr auto compute_standard_strides()
{
return return_const_array([] {
index_array result{};
index_int s = 1;
for(diff_int is = result.size() - 1; is >= 0; is--)
{
result[is] = s;
s *= Lens{}[is];
}
return result;
});
}
constexpr index_int index(index_array x) const { return x.dot(strides); } constexpr index_int index(index_array x) const { return x.dot(strides); }
constexpr index_int index(std::initializer_list<index_int> x) const constexpr index_int index(std::initializer_list<index_int> x) const
...@@ -86,8 +100,7 @@ struct shape ...@@ -86,8 +100,7 @@ struct shape
constexpr index_int index(index_int i) const constexpr index_int index(index_int i) const
{ {
if(this->standard()) if constexpr(shape{}.standard())
{
MIGRAPHX_ASSERT(i == compute_index(i)); MIGRAPHX_ASSERT(i == compute_index(i));
return i; return i;
} }
...@@ -99,19 +112,21 @@ struct shape ...@@ -99,19 +112,21 @@ struct shape
constexpr index_int compute_index(index_int i) const constexpr index_int compute_index(index_int i) const
{ {
const auto rank = this->lens.size(); constexpr auto sstrides = compute_standard_strides();
index_int s = 1; constexpr auto rank = Lens{}.size();
index_int result = 0; index_int result = 0;
for(index_int j = 0; j < rank; j++) repeat(rank, [&](auto j) {
{ constexpr auto k = rank - j - 1;
const index_int k = rank - j - 1; constexpr auto stride = Strides{}[k];
const index_int stride = this->strides[k]; constexpr auto len = Lens{}[k];
const index_int len = this->lens[k]; if constexpr(stride != 0)
const index_int slen = s * len; {
const index_int idx = (i % slen) / s; constexpr auto s = sstrides[k];
result += stride * idx; constexpr auto slen = s * len;
s = slen; auto idx = (i % slen) / s;
} result += stride * idx;
}
});
return result; return result;
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment