Commit 3c95b34d authored by Shucai Xiao

Merge branch 'layernorm_half2' into branch_for_ort2

parents af110526 789f86fb
......@@ -58,17 +58,20 @@ void module_pass_manager_apply(const T& x, module_pass_manager& mpm)
} // namespace detail
/*
* Type-erased interface for:
*
* struct pass
* {
* std::string name() const;
* void apply(module_pass_manager & mpm) const;
* void apply(program & p) const;
* };
*
*/
#ifdef TYPE_ERASED_DECLARATION
// Type-erased interface for:
struct pass
{
//
std::string name() const;
// (optional)
void apply(module_pass_manager& mpm) const;
// (optional)
void apply(program& p) const;
};
#else
struct pass
{
......@@ -303,6 +306,7 @@ inline const ValueType& any_cast(const pass& x)
throw std::bad_cast();
return *y;
}
#endif
#endif
......
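The regenerated comment block above documents the type-erased `pass` interface: any struct providing `name()` plus at least one of the `apply` overloads can be stored in a `pass`. A minimal sketch, assuming illustrative struct names and include paths that are not part of this commit:

#include <migraphx/pass.hpp>
#include <migraphx/pass_manager.hpp> // module_pass_manager (header path assumed)

struct trim_dead_code_pass // illustrative name
{
    std::string name() const { return "trim_dead_code"; }
    // apply(program&) is marked "(optional)" above, so only this overload is provided
    void apply(migraphx::module_pass_manager& mpm) const
    {
        // mutate the module owned by mpm here
    }
};

migraphx::pass p = trim_dead_code_pass{}; // concrete type erased behind the pass interface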
......@@ -26,30 +26,35 @@ struct schedule_model
/// Get the number of concurrent instructions allowed
std::size_t concurrency() const;
/// Schedule a concurrent instruction
void sched(module& p, instruction_ref ins, std::size_t n) const;
void sched(module& m, instruction_ref ins, std::size_t n) const;
// Insert necessary waits before an instruction
void wait(module& p, instruction_ref ins, std::size_t wait_id) const;
void wait(module& m, instruction_ref ins, std::size_t wait_id) const;
// Insert necessary records after an instruction
void record(module& p, instruction_ref ins, std::size_t wait_id) const;
void record(module& m, instruction_ref ins, std::size_t wait_id) const;
/// Compute weights for an operation
std::size_t weight(const operation& op) const;
};
#else
/*
* Type-erased interface for:
*
* struct schedule_model
* {
* std::size_t concurrency() const;
* void sched(module& p,instruction_ref ins,std::size_t n) const;
* void wait(module& p,instruction_ref ins,std::size_t wait_id) const;
* void record(module& p,instruction_ref ins,std::size_t wait_id) const;
* std::size_t weight(const operation& op) const;
* };
*
*/
#ifdef TYPE_ERASED_DECLARATION
// Type-erased interface for:
struct schedule_model
{
//
std::size_t concurrency() const;
//
void sched(module& m, instruction_ref ins, std::size_t n) const;
//
void wait(module& m, instruction_ref ins, std::size_t wait_id) const;
//
void record(module& m, instruction_ref ins, std::size_t wait_id) const;
//
std::size_t weight(const operation& op) const;
};
#else
struct schedule_model
{
......@@ -120,22 +125,22 @@ struct schedule_model
return (*this).private_detail_te_get_handle().concurrency();
}
void sched(module& p, instruction_ref ins, std::size_t n) const
void sched(module& m, instruction_ref ins, std::size_t n) const
{
assert((*this).private_detail_te_handle_mem_var);
(*this).private_detail_te_get_handle().sched(p, ins, n);
(*this).private_detail_te_get_handle().sched(m, ins, n);
}
void wait(module& p, instruction_ref ins, std::size_t wait_id) const
void wait(module& m, instruction_ref ins, std::size_t wait_id) const
{
assert((*this).private_detail_te_handle_mem_var);
(*this).private_detail_te_get_handle().wait(p, ins, wait_id);
(*this).private_detail_te_get_handle().wait(m, ins, wait_id);
}
void record(module& p, instruction_ref ins, std::size_t wait_id) const
void record(module& m, instruction_ref ins, std::size_t wait_id) const
{
assert((*this).private_detail_te_handle_mem_var);
(*this).private_detail_te_get_handle().record(p, ins, wait_id);
(*this).private_detail_te_get_handle().record(m, ins, wait_id);
}
std::size_t weight(const operation& op) const
......@@ -159,9 +164,9 @@ struct schedule_model
virtual const std::type_info& type() const = 0;
virtual std::size_t concurrency() const = 0;
virtual void sched(module& p, instruction_ref ins, std::size_t n) const = 0;
virtual void wait(module& p, instruction_ref ins, std::size_t wait_id) const = 0;
virtual void record(module& p, instruction_ref ins, std::size_t wait_id) const = 0;
virtual void sched(module& m, instruction_ref ins, std::size_t n) const = 0;
virtual void wait(module& m, instruction_ref ins, std::size_t wait_id) const = 0;
virtual void record(module& m, instruction_ref ins, std::size_t wait_id) const = 0;
virtual std::size_t weight(const operation& op) const = 0;
};
......@@ -195,22 +200,22 @@ struct schedule_model
std::size_t concurrency() const override { return private_detail_te_value.concurrency(); }
void sched(module& p, instruction_ref ins, std::size_t n) const override
void sched(module& m, instruction_ref ins, std::size_t n) const override
{
private_detail_te_value.sched(p, ins, n);
private_detail_te_value.sched(m, ins, n);
}
void wait(module& p, instruction_ref ins, std::size_t wait_id) const override
void wait(module& m, instruction_ref ins, std::size_t wait_id) const override
{
private_detail_te_value.wait(p, ins, wait_id);
private_detail_te_value.wait(m, ins, wait_id);
}
void record(module& p, instruction_ref ins, std::size_t wait_id) const override
void record(module& m, instruction_ref ins, std::size_t wait_id) const override
{
private_detail_te_value.record(p, ins, wait_id);
private_detail_te_value.record(m, ins, wait_id);
}
std::size_t weight(const operation& op) const override
......@@ -283,6 +288,7 @@ inline const ValueType& any_cast(const schedule_model& x)
throw std::bad_cast();
return *y;
}
#endif
#endif
......
......@@ -36,20 +36,26 @@ struct stream_model
#else
/*
* Type-erased interface for:
*
* struct stream_model
* {
* std::size_t get_nstream() const;
* std::size_t get_stream(instruction_ref ins) const;
* std::size_t get_event_id(instruction_ref ins) const;
* bool has_stream(instruction_ref ins) const;
* bool is_record(instruction_ref ins) const;
* bool is_wait(instruction_ref ins) const;
* };
*
*/
#ifdef TYPE_ERASED_DECLARATION
// Type-erased interface for:
struct stream_model
{
//
std::size_t get_nstream() const;
//
std::size_t get_stream(instruction_ref ins) const;
//
std::size_t get_event_id(instruction_ref ins) const;
//
bool has_stream(instruction_ref ins) const;
//
bool is_record(instruction_ref ins) const;
//
bool is_wait(instruction_ref ins) const;
};
#else
struct stream_model
{
......@@ -296,6 +302,7 @@ inline const ValueType& any_cast(const stream_model& x)
throw std::bad_cast();
return *y;
}
#endif
#endif
......
......@@ -82,20 +82,26 @@ argument copy_from_target(T&, const argument& arg)
return arg;
}
/*
* Type-erased interface for:
*
* struct target
* {
* std::string name() const;
* std::vector<pass> get_passes(context& ctx,const compile_options& options) const;
* context get_context() const;
* argument copy_to(const argument& input) const;
* argument copy_from(const argument& input) const;
* argument allocate(const shape& s) const;
* };
*
*/
#ifdef TYPE_ERASED_DECLARATION
// Type-erased interface for:
struct target
{
//
std::string name() const;
//
std::vector<pass> get_passes(context& ctx, const compile_options& options) const;
//
context get_context() const;
// (optional)
argument copy_to(const argument& input) const;
// (optional)
argument copy_from(const argument& input) const;
// (optional)
argument allocate(const shape& s) const;
};
#else
struct target
{
......@@ -382,6 +388,7 @@ inline const ValueType& any_cast(const target& x)
throw std::bad_cast();
return *y;
}
#endif
#endif
......
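The target interface above marks copy_to, copy_from, and allocate as "(optional)"; the copy_from_target helper at the top of this hunk is a pass-through that presumably serves as the fallback when the optional members are absent. A hedged sketch of a do-nothing target supplying only the required members (all names illustrative, not from this commit):

struct null_target // illustrative
{
    std::string name() const { return "null"; }
    std::vector<migraphx::pass>
    get_passes(migraphx::context&, const migraphx::compile_options&) const
    {
        return {};
    }
    migraphx::context get_context() const { return {}; } // default-constructed, type-erased context
};

// a real backend would then register it:
// migraphx::register_target(null_target{});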
......@@ -178,6 +178,7 @@ struct value
value(std::nullptr_t);
value(const char* i);
value(const std::string& pkey, const char* i);
#define MIGRAPHX_VALUE_GENERATE_DECL_METHODS(vt, cpp_type) \
value(cpp_type i); \
......@@ -188,6 +189,12 @@ struct value
const cpp_type* if_##vt() const;
MIGRAPHX_VISIT_VALUE_TYPES(MIGRAPHX_VALUE_GENERATE_DECL_METHODS)
template <class T>
using literal_to_string = std::conditional_t<(std::is_convertible<T, const char*>{} and
std::is_convertible<T, std::string>{}),
std::string,
T>;
template <class T>
using pick_numeric = std::conditional_t<
std::is_floating_point<T>{},
......@@ -246,6 +253,7 @@ struct value
return *this = from_values(rhs); // NOLINT
}
value& operator=(const char* c);
value& operator=(std::nullptr_t);
value& operator=(const std::initializer_list<value>& i);
......@@ -370,11 +378,11 @@ struct value
}
template <class To>
To value_or(const To& default_value) const
literal_to_string<To> value_or(const To& default_value) const
{
if(this->is_null())
return default_value;
return to<To>();
return to<literal_to_string<To>>();
}
template <class To>
......@@ -390,12 +398,12 @@ struct value
}
template <class To>
To get(const std::string& pkey, const To& default_value) const
literal_to_string<To> get(const std::string& pkey, const To& default_value) const
{
const auto* v = find(pkey);
if(v == this->end())
return default_value;
return v->to<To>();
return v->to<literal_to_string<To>>();
}
template <class To>
......@@ -408,10 +416,11 @@ struct value
}
template <class To>
std::vector<To> get(const std::string& pkey,
const std::initializer_list<To>& default_value) const
std::vector<literal_to_string<To>> get(const std::string& pkey,
const std::initializer_list<To>& default_value) const
{
return get<std::vector<To>>(pkey, default_value);
return get(pkey,
std::vector<literal_to_string<To>>{default_value.begin(), default_value.end()});
}
friend bool operator==(const value& x, const value& y);
......
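literal_to_string is what makes the pybind11 hunk below legal: when the default argument is a string literal, the deduced result type of value_or/get is promoted from const char* to std::string. A small hedged illustration (the keys and values are chosen for the example only):

// value(const std::string& pkey, const char* i), declared above, builds a keyed entry
migraphx::value v = {{"type", "half"}};

std::string t = v.get("type", "float");     // -> "half"
std::string d = v.get("missing", "float");  // -> "float"; the result is deduced as
                                            //    std::string, not const char*, via literal_to_string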
......@@ -213,7 +213,7 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
py::class_<migraphx::shape>(m, "shape")
.def(py::init([](py::kwargs kwargs) {
auto v = migraphx::to_value(kwargs);
auto t = migraphx::shape::parse_type(v.get("type", std::string{"float"}));
auto t = migraphx::shape::parse_type(v.get("type", "float"));
auto lens = v.get<std::size_t>("lens", {1});
if(v.contains("strides"))
return migraphx::shape(t, lens, v.at("strides").to_vector<std::size_t>());
......
#include <migraphx/register_target.hpp>
#include <unordered_map>
#include <migraphx/register_target.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -11,7 +11,17 @@ std::unordered_map<std::string, target>& target_map()
}
void register_target(const target& t) { target_map()[t.name()] = t; }
target make_target(const std::string& name) { return target_map().at(name); }
target make_target(const std::string& name)
{
const auto it = target_map().find(name);
if(it == target_map().end())
{
MIGRAPHX_THROW("Requested target '" + name + "' is not enabled or not supported");
}
return it->second;
}
std::vector<std::string> get_targets()
{
std::vector<std::string> result;
......
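make_target now reports a missing backend explicitly instead of surfacing the bare std::out_of_range from unordered_map::at. A hedged usage sketch (the target name is illustrative, and the exception type is an assumption about what MIGRAPHX_THROW produces):

#include <migraphx/register_target.hpp>
#include <iostream>

int main()
{
    try
    {
        auto t = migraphx::make_target("nonexistent"); // illustrative, unregistered name
    }
    catch(const migraphx::exception& e) // assumed: MIGRAPHX_THROW throws migraphx::exception
    {
        // "Requested target 'nonexistent' is not enabled or not supported"
        std::cerr << e.what() << std::endl;
    }
}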
......@@ -119,6 +119,7 @@ target_link_libraries(kernel_file_check compile_for_gpu)
rocm_clang_tidy_check(kernel_file_check)
file(GLOB JIT_GPU_SRCS ${CONFIGURE_DEPENDS} ${CMAKE_CURRENT_SOURCE_DIR}/jit/*.cpp)
add_library(migraphx_gpu
abs.cpp
analyze_streams.cpp
......@@ -131,9 +132,7 @@ add_library(migraphx_gpu
compile_ops.cpp
compile_hip.cpp
compile_hip_code_object.cpp
compile_pointwise.cpp
compile_roialign.cpp
compile_scatternd.cpp
compiler.cpp
concat.cpp
convert.cpp
convolution.cpp
......@@ -171,6 +170,7 @@ add_library(migraphx_gpu
target.cpp
topk.cpp
write_literals.cpp
${JIT_GPU_SRCS}
)
set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
......@@ -331,6 +331,12 @@ target_compile_definitions(migraphx_gpu PRIVATE
"-DMIGRAPHX_EXTRACT_KERNEL=${MIGRAPHX_EXTRACT_KERNEL}"
"-DMIGRAPHX_USE_HIPRTC=0"
)
if(DEFINED CMAKE_CXX_COMPILER_LAUNCHER)
execute_process(COMMAND which ${CMAKE_CXX_COMPILER_LAUNCHER} OUTPUT_VARIABLE MIGRAPHX_HIP_COMPILER_LAUNCHER)
string(STRIP "${MIGRAPHX_HIP_COMPILER_LAUNCHER}" MIGRAPHX_HIP_COMPILER_LAUNCHER)
target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_HIP_COMPILER_LAUNCHER=${MIGRAPHX_HIP_COMPILER_LAUNCHER}")
endif()
endif()
# Check miopen find mode api
......
......@@ -178,6 +178,12 @@ bool is_hip_clang_compiler()
return result;
}
bool has_compiler_launcher()
{
static const auto result = fs::exists(MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER_LAUNCHER));
return result;
}
std::vector<std::vector<char>>
compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std::string& arch)
{
......@@ -210,6 +216,10 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
src_compiler compiler;
compiler.flags = params;
compiler.compiler = MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER);
#ifdef MIGRAPHX_HIP_COMPILER_LAUNCHER
if(has_compiler_launcher())
compiler.launcher = MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER_LAUNCHER);
#endif
if(is_hcc_compiler())
compiler.process = [&](const fs::path& obj_path) -> fs::path {
......@@ -238,14 +248,6 @@ std::string enum_params(std::size_t count, std::string param)
return join_strings(items, ",");
}
std::size_t compute_global(std::size_t n, std::size_t local)
{
std::size_t groups = (n + local - 1) / local;
// max possible number of blocks is set to 1B (1,073,741,824)
std::size_t nglobal = std::min<std::size_t>(1073741824, groups) * local;
return nglobal;
}
#endif // MIGRAPHX_USE_HIPRTC
} // namespace gpu
......
......@@ -93,6 +93,32 @@ const std::vector<std::string>& compiler_warnings()
return warnings;
}
void hip_compile_options::set_launch_params(
const value& v,
const std::function<std::size_t(std::size_t local)>& compute_global,
std::size_t default_local)
{
local = v.get("local", default_local);
if(v.contains("global"))
global = v.at("global").to<std::size_t>();
else
global = compute_global(local);
}
std::function<std::size_t(std::size_t local)>
compute_global_for(context& ctx, std::size_t n, std::size_t over)
{
assert(over > 0);
std::size_t max_global = ctx.get_current_device().get_cu_count() *
ctx.get_current_device().get_max_workitems_per_cu();
return [n, over, max_global](std::size_t local) {
std::size_t groups = (n + local - 1) / local;
std::size_t max_blocks = max_global / local;
std::size_t nglobal = std::min(max_blocks * over, groups) * local;
return nglobal;
};
}
operation compile_hip_code_object(const std::string& content, hip_compile_options options)
{
std::vector<src_file> srcs;
......
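compute_global_for replaces the removed fixed-limit compute_global with a per-device cap: max_global is the CU count times the maximum work-items per CU, and the returned lambda never requests more than `over` times that many work-items. A worked example with made-up device numbers (120 CUs, 2048 work-items per CU, so max_global = 245760):

// Illustrative only; n and the device limits are assumptions, not queried values.
auto cg = compute_global_for(ctx, /*n=*/1'000'000, /*over=*/1);

// For local = 1024:
//   groups     = (1'000'000 + 1023) / 1024  = 977
//   max_blocks = 245'760 / 1024             = 240
//   nglobal    = min(240 * 1, 977) * 1024   = 245'760
std::size_t nglobal = cg(1024);

Passing over > 1 simply scales the cap, allowing that many times the resident block count.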
......@@ -6,7 +6,7 @@
#include <migraphx/par_for.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/op/identity.hpp>
#include <migraphx/gpu/compile_pointwise.hpp>
#include <migraphx/gpu/compiler.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -40,35 +40,9 @@ struct precompile_op
MIGRAPHX_REGISTER_OP(precompile_op);
struct pointwise_compiler
{
std::string name() const { return "pointwise"; }
operation apply(context& ctx, instruction_ref ins, const operation&) const
{
assert(not ins->module_inputs().empty());
auto* pm = ins->module_inputs().front();
return compile_pointwise(ctx, to_shapes(ins->inputs()), *pm);
}
};
using compiler_function = std::function<operation(context&, instruction_ref, operation)>;
template <class T>
compiler_function make_compiler_function(T x)
{
return {[=](auto&&... xs) { return x.apply(xs...); }};
}
template <class... Ts>
std::unordered_map<std::string, compiler_function> make_compilers(Ts... xs)
{
return {{xs.name(), make_compiler_function(xs)}...};
}
struct compiled_result
{
operation op;
compiler_replace replace;
instruction_ref ins;
};
......@@ -82,7 +56,6 @@ void par_compile(std::size_t n, F f)
void compile_ops::apply(module& m) const
{
auto compilers = make_compilers(pointwise_compiler{});
std::vector<std::function<compiled_result()>> compiles;
for(auto ins : iterator_for(m))
......@@ -90,15 +63,15 @@ void compile_ops::apply(module& m) const
if(ins->name() != "gpu::precompile_op")
continue;
operation preop = any_cast<precompile_op>(ins->get_operator()).op;
assert(contains(compilers, preop.name()));
auto c = compilers[preop.name()];
compiles.emplace_back([=]() -> compiled_result { return {c(*ctx, ins, preop), ins}; });
compiles.emplace_back([=]() -> compiled_result {
return {compile(*ctx, ins, preop), ins};
});
}
std::vector<compiled_result> results(compiles.size());
par_compile(compiles.size(), [&](auto i) { results[i] = compiles[i](); });
for(const auto& cr : results)
{
m.replace_instruction(cr.ins, cr.op, cr.ins->inputs());
cr.replace(m, cr.ins);
}
}
......
#include <migraphx/gpu/compile_pointwise.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/cpp_generator.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/eliminate_common_subexpression.hpp>
#include <migraphx/module.hpp>
#include <migraphx/pass_manager.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
static const char* const pointwise_kernel = R"__migraphx__(
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/pointwise.hpp>
#include <args.hpp>
namespace migraphx {
${preamble}
extern "C" {
__global__ void kernel(${params})
{
pointwise(${lambda}, ${args});
}
}
} // namespace migraphx
int main() {}
)__migraphx__";
operation compile_pointwise(context&,
const std::vector<shape>& inputs,
const std::string& lambda,
const std::string& preamble)
{
hip_compile_options options;
options.global = compute_global(inputs.front().elements());
options.local = 1024;
options.inputs = inputs;
options.output = inputs.back();
options.virtual_inputs = reduce_dims(inputs);
options.params = "-Wno-float-equal";
auto src = interpolate_string(pointwise_kernel,
{{"params", enum_params(inputs.size(), "void * private_p")},
{"args", enum_params(inputs.size(), "private_p")},
{"lambda", lambda},
{"preamble", preamble}});
return compile_hip_code_object(src, options);
}
operation compile_pointwise(context& ctx, const std::vector<shape>& inputs, module m)
{
run_passes(m, {eliminate_common_subexpression{}, dead_code_elimination{}});
cpp_generator g;
g.fmap([](const std::string& fname) { return "migraphx::" + fname; });
g.add_point_op("where", "${function:where}(${0}, ${1}, ${2})");
g.add_point_op("prelu", "${function:where}(${0} < 0, ${0} * ${1}, ${0})");
g.add_point_op("sign", "${function:where}(${0} > 0, 1, ${function:where}(${0} < 0, -1, 0))");
g.add_point_op("equal", "migraphx::abs(${0} == ${1})");
g.add_point_op("less", "migraphx::abs(${0} < ${1})");
g.add_point_op("greater", "migraphx::abs(${0} > ${1})");
g.add_point_op("not", "migraphx::abs(not ${0})");
// Add explicit conversions
g.fresult(
[](const shape& s) { return "migraphx::convert<" + shape::cpp_type(s.type()) + ">"; });
auto name =
g.create_function(g.generate_module(m).set_attributes({"__device__"}).set_generic_types(m));
return compile_pointwise((ctx), inputs, "MIGRAPHX_LIFT(" + name + ")", g.str());
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/compiler.hpp>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
auto& compiler_map()
{
static std::unordered_map<std::string, compiler_compile> m; // NOLINT
return m;
}
auto& compiler_op_map()
{
static std::unordered_map<std::string, compiler_compile_op> m; // NOLINT
return m;
}
void register_compiler(const std::string& name, compiler_compile c, compiler_compile_op cop)
{
compiler_map()[name] = std::move(c);
compiler_op_map()[name] = std::move(cop);
}
bool has_compiler_for(const std::string& name) { return compiler_map().count(name) > 0; }
compiler_replace compile(context& ctx, instruction_ref ins, const operation& op)
{
return compiler_map().at(op.name())(ctx, ins, op);
}
operation
compile_op(const std::string& name, context& ctx, const std::vector<shape>& inputs, const value& v)
{
return compiler_op_map().at(name)(ctx, inputs, v);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
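This registry is what compile_ops consults above in place of its removed hard-coded compiler map. A hedged sketch of the lookup path, mirroring the compile_ops.cpp hunk (names follow that hunk, with ctx taken as a context& and error handling omitted):

operation preop = any_cast<precompile_op>(ins->get_operator()).op;
if(has_compiler_for(preop.name()))
{
    compiler_replace cr = compile(ctx, ins, preop); // dispatch through compiler_map()
    cr(m, ins);                                     // apply the replacement to module m
}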
......@@ -215,36 +215,22 @@ __device__ __half2 block_reduce_half2(
// m = x - mean(x)
// m / sqrt(mean(m ^ 2) + 1e-12)
__global__ void triadd_layernorm_kernel_half2(
void* in1, void* in2, void* in3, void* data_out, index_int batch_item_num, index_int block_size)
__device__ void layernorm_kernel_half2(__half2* in_data,
__half2* in_data_reduce,
__half2* out,
index_int batch_item_num,
index_int block_size,
float rbatch_num)
{
__half2* input1 = reinterpret_cast<__half2*>(in1);
__half2* input2 = reinterpret_cast<__half2*>(in2);
__half2* input3 = reinterpret_cast<__half2*>(in3);
__half2* output = reinterpret_cast<__half2*>(data_out);
auto rnum = __float2half2_rn(1.0f / batch_item_num);
batch_item_num /= 2;
auto rnum = __float2half2_rn(rbatch_num);
extern MIGRAPHX_DEVICE_SHARED __half2 buffer2[];
__half2* in_data_reduce = buffer2;
__half2* in_data = buffer2 + batch_item_num;
int start = blockIdx.x * batch_item_num;
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
int idx = i + start;
in_data[i] = __hadd2(__hadd2(input1[idx], input2[idx]), input3[idx]);
in_data_reduce[i] = in_data[i];
// in_data_reduce[i] = __hmul2(in_data[i], rnum);
}
auto m =
block_reduce_half2(in_data_reduce, batch_item_num, threadIdx.x, block_size, half2_sum{});
m = __hmul2(m, rnum);
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
in_data[i] = __hsub2(in_data[i], m);
// in_data_reduce[i] = __hmul2(__hmul2(in_data[i], in_data[i]), rnum);
in_data[i] = __hsub2(in_data[i], m);
in_data_reduce[i] = __hmul2(in_data[i], in_data[i]);
}
......@@ -255,11 +241,36 @@ __global__ void triadd_layernorm_kernel_half2(
auto r = __hadd2(m, eps);
r = h2rsqrt(r);
int start = blockIdx.x * batch_item_num;
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
int idx = i + start;
out[idx] = __hmul2(in_data[i], r);
}
}
__global__ void triadd_layernorm_half2(
void* in1, void* in2, void* in3, void* data_out, index_int batch_item_num, index_int block_size)
{
__half2* input1 = reinterpret_cast<__half2*>(in1);
__half2* input2 = reinterpret_cast<__half2*>(in2);
__half2* input3 = reinterpret_cast<__half2*>(in3);
__half2* output = reinterpret_cast<__half2*>(data_out);
float rnum = 1.0f / batch_item_num;
batch_item_num /= 2;
extern MIGRAPHX_DEVICE_SHARED __half2 buffer2[];
__half2* in_data_reduce = buffer2;
__half2* in_data = buffer2 + batch_item_num;
int start = blockIdx.x * batch_item_num;
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
int idx = i + start;
output[idx] = __hmul2(in_data[i], r);
int idx = i + start;
in_data[i] = __hadd2(__hadd2(input1[idx], input2[idx]), input3[idx]);
in_data_reduce[i] = in_data[i];
}
layernorm_kernel_half2(in_data, in_data_reduce, output, batch_item_num, block_size, rnum);
}
template <class T>
......@@ -281,105 +292,59 @@ block_reduce_half(T* buffer, index_int batch_item_num, index_int tid, index_int
// m = x - mean(x)
// m / sqrt(mean(m ^ 2) + 1e-12)
__global__ void triadd_layernorm_kernel_half(
void* in1, void* in2, void* in3, void* data_out, index_int batch_item_num, index_int block_size)
__device__ void layernorm_kernel_half(__half* in_data,
__half* in_data_reduce,
__half* out,
index_int batch_item_num,
index_int block_size,
float rnum)
{
__half* input1 = reinterpret_cast<__half*>(in1);
__half* input2 = reinterpret_cast<__half*>(in2);
__half* input3 = reinterpret_cast<__half*>(in3);
__half* output = reinterpret_cast<__half*>(data_out);
extern MIGRAPHX_DEVICE_SHARED __half bufferh[];
__half* in_data_reduce = bufferh;
__half* in_data = bufferh + batch_item_num;
auto m = block_reduce_half(in_data_reduce, batch_item_num, threadIdx.x, block_size);
m *= rnum;
int start = blockIdx.x * batch_item_num;
auto rnum = 1.0f / batch_item_num;
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
int idx = i + start;
in_data[i] = __float2half(__half2float(input1[idx]) + __half2float(input2[idx]) +
__half2float(input3[idx]));
in_data_reduce[i] = __float2half(__half2float(in_data[i]) * __half2float(rnum));
in_data[i] = __float2half(__half2float(in_data[i]) - __half2float(m));
in_data_reduce[i] = __float2half(__half2float(in_data[i]) * __half2float(in_data[i]));
}
auto m = block_reduce_half(in_data_reduce, batch_item_num, threadIdx.x, block_size);
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
in_data[i] = __float2half(__half2float(in_data[i]) - __half2float(m));
in_data_reduce[i] =
__float2half(__half2float(in_data[i]) * __half2float(in_data[i]) * __half2float(rnum));
}
m = block_reduce_half(in_data_reduce, batch_item_num, threadIdx.x, block_size);
m *= rnum;
m += 1.0e-12f;
m = __float2half(
__half2float(block_reduce_half(in_data_reduce, batch_item_num, threadIdx.x, block_size)) +
1.0e-12f);
auto r = __float2half(rsqrt(__half2float(m)));
int start = blockIdx.x * batch_item_num;
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
int idx = i + start;
output[idx] = __float2half(__half2float(in_data[i]) * __half2float(r));
}
}
template <class T>
__device__ T block_reduce(T* buffer, index_int batch_item_num, index_int tid, index_int block_size)
{
__syncthreads();
for(index_int s = block_size; s > 0; s >>= 1)
{
if(tid < s and tid + s < batch_item_num)
{
buffer[tid] = buffer[tid] + buffer[tid + s];
}
__syncthreads();
int idx = i + start;
out[idx] = __float2half(__half2float(in_data[i]) * __half2float(r));
}
return buffer[0];
}
// m = x - mean(x)
// m / sqrt(mean(m ^ 2) + 1e-12)
template <class T>
__global__ void triadd_layernorm_kernel(
__global__ void triadd_layernorm_half(
void* in1, void* in2, void* in3, void* data_out, index_int batch_item_num, index_int block_size)
{
T* input1 = reinterpret_cast<T*>(in1);
T* input2 = reinterpret_cast<T*>(in2);
T* input3 = reinterpret_cast<T*>(in3);
T* output = reinterpret_cast<T*>(data_out);
extern MIGRAPHX_DEVICE_SHARED T buffer[];
T* in_data_reduce = buffer;
T* in_data = buffer + batch_item_num;
__half* input1 = reinterpret_cast<__half*>(in1);
__half* input2 = reinterpret_cast<__half*>(in2);
__half* input3 = reinterpret_cast<__half*>(in3);
__half* output = reinterpret_cast<__half*>(data_out);
float rnum = 1.0f / batch_item_num;
extern MIGRAPHX_DEVICE_SHARED __half bufferh[];
__half* in_data_reduce = bufferh;
__half* in_data = bufferh + batch_item_num;
int start = blockIdx.x * batch_item_num;
auto rnum = 1.0f / batch_item_num;
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
int idx = i + start;
in_data[i] = input1[idx] + input2[idx] + input3[idx];
in_data_reduce[i] = in_data[i];
// in_data_reduce[i] = __half2float(in_data[i]) * rnum;
}
auto m = block_reduce(in_data_reduce, batch_item_num, threadIdx.x, block_size);
m = m * rnum;
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
in_data[i] = in_data[i] - m;
in_data_reduce[i] = in_data[i] * in_data[i];
// in_data_reduce[i] = __half2float(in_data[i] * in_data[i]) * rnum;
int idx = i + start;
in_data[i] = __float2half(__half2float(input1[idx]) + __half2float(input2[idx]) +
__half2float(input3[idx]));
}
m = block_reduce(in_data_reduce, batch_item_num, threadIdx.x, block_size);
m = m * rnum + 1.0e-12f;
auto r = rsqrt(m);
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
int idx = i + start;
// output[idx] = __half2float(in_data[i]) * r;
output[idx] = in_data[i] * r;
}
layernorm_kernel_half(in_data, in_data_reduce, output, batch_item_num, block_size, rnum);
}
void triadd_layernorm(hipStream_t stream,
......@@ -393,12 +358,17 @@ void triadd_layernorm(hipStream_t stream,
auto batch_item_num = in_s.lens().back();
if(type == shape::half_type and (batch_item_num % 2) == 0)
{
auto half2_block_size = compute_block_size(batch_item_num, 1024);
auto block_size = compute_block_size(batch_item_num, 1024);
int block_num = in_s.elements() / batch_item_num;
int shared_size = batch_item_num * 2 * in_s.type_size();
half2_block_size = half2_block_size / 4;
triadd_layernorm_kernel_half2<<<block_num, half2_block_size, shared_size, stream>>>(
auto half2_block_size = block_size / 4;
triadd_layernorm_half2<<<block_num, half2_block_size, shared_size, stream>>>(
arg1.data(), arg2.data(), arg3.data(), result.data(), batch_item_num, half2_block_size);
// auto half_block_size = block_size / 2;
// triadd_layernorm_half2<<<block_num, half_block_size, shared_size, stream>>>(
// arg1.data(), arg2.data(), arg3.data(), result.data(), batch_item_num,
// half_block_size);
}
else
{
......@@ -409,11 +379,11 @@ void triadd_layernorm(hipStream_t stream,
}
__global__ void
layernorm_kernel_half2(void* in1, void* data_out, index_int batch_item_num, index_int block_size)
layernorm_half2(void* in1, void* data_out, index_int batch_item_num, index_int block_size)
{
__half2* input1 = reinterpret_cast<__half2*>(in1);
__half2* output = reinterpret_cast<__half2*>(data_out);
auto rnum = __float2half2_rn(1.0f / batch_item_num);
float rnum = 1.0f / batch_item_num;
batch_item_num /= 2;
extern MIGRAPHX_DEVICE_SHARED __half2 buffer2[];
__half2* in_data_reduce = buffer2;
......@@ -427,28 +397,28 @@ layernorm_kernel_half2(void* in1, void* data_out, index_int batch_item_num, inde
in_data_reduce[i] = in_data[i];
}
auto m =
block_reduce_half2(in_data_reduce, batch_item_num, threadIdx.x, block_size, half2_sum{});
m = __hmul2(m, rnum);
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
in_data[i] = __hsub2(in_data[i], m);
in_data_reduce[i] = __hmul2(in_data[i], in_data[i]);
}
m = block_reduce_half2(in_data_reduce, batch_item_num, threadIdx.x, block_size, half2_sum{});
m = __hmul2(m, rnum);
layernorm_kernel_half2(in_data, in_data_reduce, output, batch_item_num, block_size, rnum);
}
auto eps = __float2half2_rn(1.0e-12f);
auto r = __hadd2(m, eps);
r = h2rsqrt(r);
__global__ void
layernorm_half(void* in1, void* data_out, index_int batch_item_num, index_int block_size)
{
__half* input1 = reinterpret_cast<__half*>(in1);
__half* output = reinterpret_cast<__half*>(data_out);
float rnum = 1.0f / batch_item_num;
extern MIGRAPHX_DEVICE_SHARED __half buffer3[];
__half* in_data_reduce = buffer3;
__half* in_data = buffer3 + batch_item_num;
int start = blockIdx.x * batch_item_num;
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
int idx = i + start;
output[idx] = __hmul2(in_data[i], r);
int idx = i + start;
in_data[i] = input1[idx];
in_data_reduce[i] = in_data[i];
}
layernorm_kernel_half(in_data, in_data_reduce, output, batch_item_num, block_size, rnum);
}
void layernorm(hipStream_t stream, const argument& result, const argument& arg1)
......@@ -458,12 +428,16 @@ void layernorm(hipStream_t stream, const argument& result, const argument& arg1)
auto batch_item_num = in_s.lens().back();
if(type == shape::half_type and (batch_item_num % 2) == 0)
{
auto half2_block_size = compute_block_size(batch_item_num, 1024);
auto block_size = compute_block_size(batch_item_num, 1024);
int block_num = in_s.elements() / batch_item_num;
int shared_size = batch_item_num * 2 * in_s.type_size();
half2_block_size = half2_block_size / 4;
layernorm_kernel_half2<<<block_num, half2_block_size, shared_size, stream>>>(
auto half2_block_size = block_size / 4;
layernorm_half2<<<block_num, half2_block_size, shared_size, stream>>>(
arg1.data(), result.data(), batch_item_num, half2_block_size);
// auto half_block_size = block_size / 2;
// layernorm_half2<<<block_num, half_block_size, shared_size, stream>>>(
// arg1.data(), result.data(), batch_item_num, half_block_size);
}
else
{
......
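The host-side launch arithmetic is easiest to see with concrete numbers. Assuming a hypothetical row length of batch_item_num = 1024 half values (2 bytes each): shared_size = 1024 * 2 * 2 = 4096 bytes, enough for the two row-sized staging buffers (in_data_reduce and in_data); on the half2 path batch_item_num is halved to 512, so the same 4096 bytes hold two buffers of 512 __half2 each, and the kernel is launched with block_size / 4 threads per block, each thread striding over several __half2 elements of its row.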
file(GLOB GPU_DRIVER_SRCS ${CONFIGURE_DEPENDS} ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
add_executable(gpu-driver
action.cpp
compile_pointwise.cpp
main.cpp
parser.cpp
perf.cpp
run_op.cpp
${GPU_DRIVER_SRCS}
)
target_include_directories(gpu-driver PRIVATE include)
target_link_libraries(gpu-driver PRIVATE migraphx_gpu)
#include <migraphx/gpu/driver/action.hpp>
#include <migraphx/gpu/driver/perf.hpp>
#include <migraphx/gpu/compile_pointwise.hpp>
#include <migraphx/gpu/compiler.hpp>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
......@@ -8,13 +8,13 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace driver {
struct compile_pointwise : action<compile_pointwise>
struct compile_op : action<compile_op>
{
static void apply(const parser& p, const value& v)
{
context ctx;
auto inputs = p.parse_shapes(v.at("inputs"));
auto op = gpu::compile_pointwise(ctx, inputs, v.at("lambda").to<std::string>());
auto op = gpu::compile_op(v.at("name").to<std::string>(), ctx, inputs, v);
double t = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
std::cout << op << ": " << t << "ms" << std::endl;
}
......
......@@ -17,8 +17,6 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
std::string enum_params(std::size_t count, std::string param);
std::size_t compute_global(std::size_t n, std::size_t local = 1024);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......
......@@ -8,6 +8,8 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
struct hip_compile_options
{
std::size_t global;
......@@ -17,8 +19,24 @@ struct hip_compile_options
std::string kernel_name = "kernel";
std::string params = "";
std::vector<shape> virtual_inputs = {};
/**
 * @brief Set the launch parameters, allowing v to override the default values
 *
 * @param v A value which may contain "global" and/or "local" keys that override the default
 * global and local sizes
 * @param compute_global A function used to compute the global size from the local size
 * @param default_local The default local size to use if it is missing from the v parameter
*/
void set_launch_params(const value& v,
const std::function<std::size_t(std::size_t local)>& compute_global,
std::size_t default_local = 1024);
};
/// Compute the global launch size for n elements, capped at a target-specific upper limit
std::function<std::size_t(std::size_t local)>
compute_global_for(context& ctx, std::size_t n, std::size_t over = 1);
operation compile_hip_code_object(const std::string& content, hip_compile_options options);
} // namespace gpu
......
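Together, set_launch_params and compute_global_for let a compiler take its launch configuration from the incoming value instead of hard-coding it. A hedged usage sketch (v, ctx, and out_elements stand in for whatever the caller already has in scope):

hip_compile_options options;
// v may carry {"local": 256} and/or an explicit "global"; anything missing
// falls back to default_local and to compute_global_for's per-device cap.
options.set_launch_params(v, compute_global_for(ctx, out_elements), 1024);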
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_POINTWISE_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_POINTWISE_HPP
#include <migraphx/config.hpp>
#include <migraphx/operation.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct module;
namespace gpu {
struct context;
operation compile_pointwise(context& ctx,
const std::vector<shape>& inputs,
const std::string& lambda,
const std::string& preamble = "");
operation compile_pointwise(context& ctx, const std::vector<shape>& inputs, module m);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_POINTWISE_HPP
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP
#include <migraphx/config.hpp>
#include <migraphx/operation.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
operation compile_roialign(context& ctx, const std::vector<shape>& io_shapes, const value& val);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP