Commit e9143cd7 authored by Paul

Remove unused code

parent e4f5508c
@@ -32,214 +32,6 @@ auto nary_nonstandard_impl(hipStream_t stream, F f, argument result, Arguments..
    });
}
template <class F>
void trinary_broadcast_vec_impl(hipStream_t stream,
                                F f,
                                const argument& result,
                                const argument& arg1,
                                const argument& arg2,
                                const argument& arg3)
{
    const auto& output_shape = result.get_shape();
    const auto& b_shape = arg3.get_shape();
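    // The broadcast argument is assumed to have exactly one non-zero stride;
    // its position is the broadcast axis.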
    auto bdim =
        std::distance(b_shape.strides().begin(),
                      std::find_if(b_shape.strides().begin(), b_shape.strides().end(), [](auto x) {
                          return x != 0;
                      }));
    auto bdim_len = output_shape.lens()[bdim];
    auto bdim_stride = output_shape.strides()[bdim];
    auto bdim_next_stride = bdim_stride * bdim_len;
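    // For a flat output index i, the coordinate along the broadcast axis is
    // (i % bdim_next_stride) / bdim_stride; the kernel uses it to index the LDS copy.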
    visit_all(result, arg1, arg2, arg3)([&](auto output, auto input1, auto input2, auto input3) {
        using type = device_type<std::remove_cv_t<typename decltype(output)::value_type>>;
        auto* xp = as_vec<4>(device_cast(input1.data()));
        auto* yp = as_vec<4>(device_cast(input2.data()));
        auto* zp = as_vec<4>(device_cast(input3.data()));
        auto* outp = as_vec<4>(device_cast(output.data()));
        const std::size_t vec_size = 4;
        const std::size_t nlocal = 1024;
        const std::size_t nglobal = 256 * nlocal;
        const std::size_t n = output.size() / vec_size;
        const std::size_t bdim_vec_len = bdim_len / vec_size;
        launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
            MIGRAPHX_DEVICE_SHARED vec<type, 4> buffer[2048 / vec_size];
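            // Fixed-size LDS staging buffer: assumes bdim_len <= 2048 elements.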
            // Load bias into LDS
            for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
            {
                buffer[i] = zp[i];
            }
            __syncthreads();
            auto* bp = as_pointer(buffer);
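            // View the vectorized LDS buffer as scalars; each output vector maps
            // to a single broadcast value (this assumes the broadcast axis is not
            // the vectorized innermost axis, so all four lanes share one value).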
            // Process the data
            for(size_t i = idx.global; i < n; i += nglobal)
            {
                auto bidx = ((i * vec_size) % bdim_next_stride) / bdim_stride;
                auto b = bp[bidx];
                vec<type, 4> x = xp[i];
                vec<type, 4> y = yp[i];
                vec<type, 4> out = outp[i];
                for(std::size_t j = 0; j < vec_size; j++)
                {
                    out[j] = f(x[j], y[j], b);
                }
                outp[i] = out;
            }
        });
    });
}
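As an aside, the broadcast index arithmetic used by these kernels can be verified on the host. A minimal standalone sketch, with a made-up row-major output shape {2, 3, 4} broadcast along axis 1 (all values illustrative, not taken from the diff):

#include <cstddef>
#include <iostream>

int main()
{
    // Hypothetical row-major output shape {2, 3, 4}, broadcast axis 1.
    const std::size_t lens[]           = {2, 3, 4};
    const std::size_t strides[]        = {12, 4, 1};
    const std::size_t bdim             = 1;
    const std::size_t bdim_len         = lens[bdim];             // 3
    const std::size_t bdim_stride      = strides[bdim];          // 4
    const std::size_t bdim_next_stride = bdim_stride * bdim_len; // 12
    for(std::size_t i = 0; i < 24; i++)
    {
        // Same formula as the kernels above: recover the coordinate along
        // the broadcast axis from the flat output index.
        std::size_t bidx = (i % bdim_next_stride) / bdim_stride;
        std::cout << "i=" << i << " -> bidx=" << bidx << "\n";
    }
    return 0;
}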
template <class F>
void trinary_broadcast_impl(hipStream_t stream,
                            F f,
                            const argument& result,
                            const argument& arg1,
                            const argument& arg2,
                            const argument& arg3)
{
    const auto& output_shape = result.get_shape();
    const auto& b_shape = arg3.get_shape();
    auto bdim =
        std::distance(b_shape.strides().begin(),
                      std::find_if(b_shape.strides().begin(), b_shape.strides().end(), [](auto x) {
                          return x != 0;
                      }));
    auto bdim_len = output_shape.lens()[bdim];
    auto bdim_stride = output_shape.strides()[bdim];
    auto bdim_next_stride = bdim_stride * bdim_len;
    visit_all(result, arg1, arg2, arg3)([&](auto output, auto input1, auto input2, auto input3) {
        using type = device_type<std::remove_cv_t<typename decltype(output)::value_type>>;
        auto* xp = device_cast(input1.data());
        auto* yp = device_cast(input2.data());
        auto* zp = device_cast(input3.data());
        auto* outp = device_cast(output.data());
        const std::size_t nlocal = 1024;
        const std::size_t nglobal = 256 * nlocal;
        const std::size_t n = output.size();
        launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
            MIGRAPHX_DEVICE_SHARED type buffer[2048];
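            // Scalar variant of the same staging scheme; again assumes bdim_len <= 2048.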
            // Load bias into LDS
            for(size_t i = idx.local; i < bdim_len; i += nlocal)
            {
                buffer[i] = zp[i];
            }
            __syncthreads();
            // Process the data
            for(size_t i = idx.global; i < n; i += nglobal)
            {
                auto bidx = (i % bdim_next_stride) / bdim_stride;
                auto b = buffer[bidx];
                type x = xp[i];
                type y = yp[i];
                outp[i] = f(x, y, b);
            }
        });
    });
}
template <class F>
void binary_broadcast_vec_impl(
    hipStream_t stream, F f, const argument& result, const argument& arg1, const argument& arg2)
{
    const auto& output_shape = result.get_shape();
    const auto& b_shape = arg2.get_shape();
    auto bdim =
        std::distance(b_shape.strides().begin(),
                      std::find_if(b_shape.strides().begin(), b_shape.strides().end(), [](auto x) {
                          return x != 0;
                      }));
    auto bdim_len = output_shape.lens()[bdim];
    auto bdim_stride = output_shape.strides()[bdim];
    auto bdim_next_stride = bdim_stride * bdim_len;
    visit_all(result, arg1, arg2)([&](auto output, auto input1, auto input2) {
        using type = device_type<std::remove_cv_t<typename decltype(output)::value_type>>;
        auto* xp = as_vec<4>(device_cast(input1.data()));
        auto* yp = as_vec<4>(device_cast(input2.data()));
        auto* outp = as_vec<4>(device_cast(output.data()));
        const std::size_t vec_size = 4;
        const std::size_t nlocal = 1024;
        const std::size_t nglobal = 256 * nlocal;
        const std::size_t n = output.size() / vec_size;
        const std::size_t bdim_vec_len = bdim_len / vec_size;
        launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
            MIGRAPHX_DEVICE_SHARED vec<type, 4> buffer[2048 / vec_size];
            // Load bias into LDS
            for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
            {
                buffer[i] = yp[i];
            }
            __syncthreads();
            auto* bp = as_pointer(buffer);
            // Process the data
            for(size_t i = idx.global; i < n; i += nglobal)
            {
                auto bidx = ((i * vec_size) % bdim_next_stride) / bdim_stride;
                auto b = bp[bidx];
                vec<type, 4> x = xp[i];
                vec<type, 4> out = outp[i];
                for(std::size_t j = 0; j < vec_size; j++)
                {
                    out[j] = f(x[j], b);
                }
                outp[i] = out;
            }
        });
    });
}
template <class F>
void binary_broadcast_impl(
    hipStream_t stream, F f, const argument& result, const argument& arg1, const argument& arg2)
{
    const auto& output_shape = result.get_shape();
    const auto& b_shape = arg2.get_shape();
    auto bdim =
        std::distance(b_shape.strides().begin(),
                      std::find_if(b_shape.strides().begin(), b_shape.strides().end(), [](auto x) {
                          return x != 0;
                      }));
    auto bdim_len = output_shape.lens()[bdim];
    auto bdim_stride = output_shape.strides()[bdim];
    auto bdim_next_stride = bdim_stride * bdim_len;
    visit_all(result, arg1, arg2)([&](auto output, auto input1, auto input2) {
        using type = device_type<std::remove_cv_t<typename decltype(output)::value_type>>;
        auto* xp = device_cast(input1.data());
        auto* yp = device_cast(input2.data());
        auto* outp = device_cast(output.data());
        const std::size_t nlocal = 1024;
        const std::size_t nglobal = 256 * nlocal;
        const std::size_t n = output.size();
        launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
            MIGRAPHX_DEVICE_SHARED type buffer[2048];
            // Load bias into LDS
            for(size_t i = idx.local; i < bdim_len; i += nlocal)
            {
                buffer[i] = yp[i];
            }
            __syncthreads();
            // Process the data
            for(size_t i = idx.global; i < n; i += nglobal)
            {
                auto bidx = (i % bdim_next_stride) / bdim_stride;
                auto b = buffer[bidx];
                type x = xp[i];
                outp[i] = f(x, b);
            }
        });
    });
}
template <class F, class... Arguments>
void nary_broadcast_vec_impl(
    hipStream_t stream, F f, argument result, argument barg, Arguments... args)
@@ -357,8 +149,8 @@ template <class F, class... Arguments>
void nary_standard_impl(hipStream_t stream, F f, argument result, Arguments... args)
{
    std::size_t nelements = result.get_shape().elements();
-    hip_visit_all(result, args...)([&](auto output, auto... inputs) {
-        gs_launch(stream, nelements)([=](auto i) { output.data()[i] = f(inputs.data()[i]...); });
+    hip_pointer_visit_all(result, args...)([&](auto output, auto... inputs) {
+        gs_launch(stream, nelements)([=](auto i) { output[i] = f(inputs[i]...); });
    });
}
...
@@ -284,6 +284,14 @@ auto hip_vec_visit_all(T&& x, Ts&&... xs)
    };
}
template <class T, class... Ts>
auto hip_pointer_visit_all(T&& x, Ts&&... xs)
{
    return [&](auto f) {
        visit_all(x, xs...)([&](auto... vs) { f(device_cast(vs.data())...); });
    };
}
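The new helper hands the callback one raw device pointer per argument (via device_cast of each tensor's data()), which is what lets the updated nary_standard_impl above index output[i] directly instead of going through tensor views. A minimal usage sketch under the same MIGraphX device headers (the add functor and the arguments a and b are hypothetical, not part of this diff):

// Hypothetical: elementwise add of two arguments with standard shapes.
// stream, result, a, b are assumed in scope (hipStream_t and migraphx::argument).
hip_pointer_visit_all(result, a, b)([&](auto out, auto x, auto y) {
    gs_launch(stream, result.get_shape().elements())(
        [=](auto i) { out[i] = x[i] + y[i]; });
});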
template <std::size_t N, class T>
auto hip_visit_all(const std::vector<T>& x)
{
...