Unverified commit 56b3bf58, authored by Paul Fultz II and committed by GitHub

Improve layernorm performance (#613)

* Use increment instead of division to compute register offset

* Formatting

* Limit layernorm to 1024 elements

* Formatting

* Add verification to driver

* Formatting

* Remove early return

* Use block_size 256

* Vectorize the kernel

* Formatting

* Convert to vector type

* Add layernorm tests

* Formatting

* Formatting

* Refactor layernorm to run both algos

* Formatting

* Fix compile error

* Fix tidy warnings

* Formatting

* Add layernorm function

* Formatting
parent 1ca3c133
@@ -33,6 +33,7 @@ add_library(migraphx
simplify_algebra.cpp
simplify_reshapes.cpp
value.cpp
verify_args.cpp
json.cpp
opt/memory_coloring.cpp
opt/memory_coloring_impl.cpp
......
@@ -115,11 +115,34 @@ struct loader
}
};
struct program_params
{
std::vector<std::string> fill0{};
std::vector<std::string> fill1{};
void parse(argument_parser& ap)
{
ap(fill0, {"--fill0"}, ap.help("Fill parameter with 0s"), ap.append());
ap(fill1, {"--fill1"}, ap.help("Fill parameter with 1s"), ap.append());
}
auto generate(const program& p, bool use_gpu)
{
program::parameter_map m;
for(auto&& s : fill0)
m[s] = fill_argument(p.get_parameter_shape(s), 0);
for(auto&& s : fill1)
m[s] = fill_argument(p.get_parameter_shape(s), 1);
fill_param_map(m, p, use_gpu);
return m;
}
};
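The `--fill0`/`--fill1` handling is now factored into `program_params` and shared by the compile and verify commands. A minimal sketch of using it programmatically, assuming `p` is an already-loaded migraphx::program (the parameter names here are hypothetical):

    program_params params;
    params.fill0 = {"attention_mask"};   // same effect as --fill0 attention_mask
    params.fill1 = {"token_type_ids"};   // same effect as --fill1 token_type_ids
    auto inputs = params.generate(p, /*use_gpu=*/false); // remaining inputs are generated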
struct compiler
{
static const int q_fp16 = 1;
static const int q_int8 = 2;
loader l;
program_params parameters;
bool gpu = true;
bool offload_copy = false;
int quantize = 0;
@@ -129,6 +152,7 @@ struct compiler
void parse(argument_parser& ap)
{
l.parse(ap);
parameters.parse(ap);
ap(gpu, {"--gpu"}, ap.help("Compile on the gpu"), ap.set_value(true));
ap(gpu, {"--cpu"}, ap.help("Compile on the cpu"), ap.set_value(false));
ap(offload_copy,
@@ -137,20 +161,11 @@ struct compiler
ap.set_value(true));
ap(quantize, {"--fp16"}, ap.help("Quantize for fp16"), ap.set_value(q_fp16));
ap(quantize, {"--int8"}, ap.help("Quantize for int8"), ap.set_value(q_int8));
ap(fill0, {"--fill0"}, ap.help("Fill parameter with 0s"), ap.append());
ap(fill1, {"--fill1"}, ap.help("Fill parameter with 1s"), ap.append());
}
auto params(const program& p, bool use_gpu = true)
{
bool gpu_flag = use_gpu && gpu && !offload_copy;
program::parameter_map m;
for(auto&& s : fill0)
m[s] = fill_argument(p.get_parameter_shape(s), 0);
for(auto&& s : fill1)
m[s] = fill_argument(p.get_parameter_shape(s), 1);
fill_param_map(m, p, gpu_flag);
return m;
return parameters.generate(p, use_gpu && gpu && !offload_copy);
}
program compile()
@@ -228,6 +243,7 @@ struct params : command<params>
struct verify : command<verify>
{
loader l;
program_params parameters;
double tolerance = 80;
bool per_instruction = false;
bool reduce = false;
@@ -235,6 +251,7 @@ struct verify : command<verify>
void parse(argument_parser& ap)
{
l.parse(ap);
parameters.parse(ap);
ap(offload_copy,
{"--enable-offload-copy"},
ap.help("Enable implicit offload copying"),
@@ -255,17 +272,19 @@ struct verify : command<verify>
compile_options options;
options.offload_copy = offload_copy;
auto m = parameters.generate(p, false);
if(per_instruction)
{
verify_instructions(p, options, tolerance);
}
else if(reduce)
{
verify_reduced_program(p, options, tolerance);
verify_reduced_program(p, options, m, tolerance);
}
else
{
verify_program(l.file, p, options, tolerance);
verify_program(l.file, p, options, m, tolerance);
}
}
};
......
@@ -11,13 +11,19 @@ namespace migraphx {
namespace driver {
inline namespace MIGRAPHX_INLINE_NS {
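// Hash the parameter name into a seed so each input gets distinct, reproducible data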
template <class T>
auto get_hash(const T& x)
{
return std::hash<T>{}(x);
}
program::parameter_map fill_param_map(program::parameter_map& m, const program& p, bool gpu)
{
for(auto&& x : p.get_parameter_shapes())
{
argument& arg = m[x.first];
if(arg.empty())
arg = generate_argument(x.second);
arg = generate_argument(x.second, get_hash(x.first));
#ifdef HAVE_GPU
if(gpu)
arg = gpu::to_gpu(arg);
@@ -35,12 +41,12 @@ program::parameter_map create_param_map(const program& p, bool gpu)
{
#ifdef HAVE_GPU
if(gpu)
m[x.first] = gpu::to_gpu(generate_argument(x.second));
m[x.first] = gpu::to_gpu(generate_argument(x.second, get_hash(x.first)));
else
#else
(void)gpu;
#endif
m[x.first] = generate_argument(x.second);
m[x.first] = generate_argument(x.second, get_hash(x.first));
}
return m;
}
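Seeding `generate_argument` with a hash of the parameter name means two parameters with the same shape no longer receive identical data, while runs stay deterministic. A small sketch of the effect (shape chosen arbitrarily):

    auto s = migraphx::shape{migraphx::shape::float_type, {4}};
    auto a = migraphx::generate_argument(s, std::hash<std::string>{}("x"));
    auto b = migraphx::generate_argument(s, std::hash<std::string>{}("y"));
    // a and b differ (different seeds), but each is stable from run to run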
......
#include "verify.hpp"
#include "perf.hpp"
#include <migraphx/cpu/target.hpp>
#include <migraphx/generate.hpp>
@@ -15,26 +16,16 @@ namespace migraphx {
namespace driver {
inline namespace MIGRAPHX_INLINE_NS {
template <class T>
auto get_hash(const T& x)
{
return std::hash<T>{}(x);
}
std::vector<argument> run_cpu(program p)
std::vector<argument> run_cpu(program p, const program::parameter_map& inputs)
{
p.compile(cpu::target{});
program::parameter_map m;
for(auto&& x : p.get_parameter_shapes())
{
m[x.first] = generate_argument(x.second, get_hash(x.first));
}
auto out = p.eval(m);
auto out = p.eval(inputs);
std::cout << p << std::endl;
return out;
}
std::vector<argument> run_gpu(program p, const compile_options& options)
std::vector<argument>
run_gpu(program p, const compile_options& options, const program::parameter_map& inputs)
{
#ifdef HAVE_GPU
p.compile(gpu::target{}, options);
@@ -42,7 +33,7 @@ std::vector<argument> run_gpu(program p, const compile_options& options)
program::parameter_map m;
for(auto&& x : p.get_parameter_shapes())
{
auto arg = generate_argument(x.second, get_hash(x.first));
auto arg = inputs.count(x.first) == 0 ? generate_argument(x.second) : inputs.at(x.first);
m[x.first] = options.offload_copy ? arg : gpu::to_gpu(arg);
}
auto gpu_out = p.eval(m);
@@ -56,6 +47,7 @@ std::vector<argument> run_gpu(program p, const compile_options& options)
#else
(void)p;
(void)options;
(void)inputs;
MIGRAPHX_THROW("Gpu unsupported!");
#endif
}
@@ -63,10 +55,11 @@ std::vector<argument> run_gpu(program p, const compile_options& options)
void verify_program(const std::string& name,
const program& p,
compile_options options,
const program::parameter_map& inputs,
double tolerance)
{
auto x = run_cpu(p);
auto y = run_gpu(p, options);
auto x = run_cpu(p, inputs);
auto y = run_gpu(p, options, inputs);
std::size_t output_num = x.size();
for(std::size_t i = 0; i < output_num; ++i)
@@ -105,7 +98,7 @@ void verify_instructions(const program& prog, compile_options options, double to
{
std::cout << "Verify: " << ins.name() << std::endl;
std::cout << p << std::endl;
verify_program(ins.name(), p, options, tolerance);
verify_program(ins.name(), p, options, create_param_map(p, false), tolerance);
}
catch(...)
{
@@ -115,21 +108,28 @@ void verify_instructions(const program& prog, compile_options options, double to
}
}
void verify_reduced(program p, int n, compile_options options, double tolerance)
void verify_reduced(program p,
int n,
compile_options options,
const program::parameter_map& inputs,
double tolerance)
{
auto last = std::prev(p.end(), n + 1);
p.remove_instructions(last, p.end());
std::cout << "Verify: " << std::endl;
std::cout << p << std::endl;
verify_program(std::to_string(n), p, options, tolerance);
verify_program(std::to_string(n), p, options, inputs, tolerance);
}
void verify_reduced_program(const program& p, compile_options options, double tolerance)
void verify_reduced_program(const program& p,
compile_options options,
const program::parameter_map& inputs,
double tolerance)
{
auto n = std::distance(p.begin(), p.end());
for(std::size_t i = 0; i < n; i++)
{
verify_reduced(p, i, options, tolerance);
verify_reduced(p, i, options, inputs, tolerance);
}
}
......
@@ -11,14 +11,16 @@ std::vector<argument> run_cpu(program p);
std::vector<argument> run_gpu(program p);
void verify_program(const std::string& name,
const program& p,
compile_options options = compile_options{},
double tolerance = 100);
compile_options options = compile_options{},
const program::parameter_map& inputs = {},
double tolerance = 100);
void verify_instructions(const program& prog,
compile_options options = compile_options{},
double tolerance = 80);
void verify_reduced_program(const program& p,
compile_options options = compile_options{},
double tolerance = 80);
compile_options options = compile_options{},
const program::parameter_map& inputs = {},
double tolerance = 80);
} // namespace MIGRAPHX_INLINE_NS
} // namespace driver
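With the new `inputs` parameter a caller can pin particular arguments while the rest are generated. A usage sketch (the program and parameter names here are made up):

    migraphx::shape s{migraphx::shape::float_type, {1, 4, 24}};
    migraphx::program::parameter_map inputs;
    inputs["mask"] = migraphx::fill_argument(s, 1);
    verify_program("example", p, migraphx::compile_options{}, inputs, 80);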
......
@@ -8,81 +8,10 @@
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
inline bool verify_args(const std::string& name,
const argument& cpu_arg,
const argument& gpu_arg,
double tolerance = 80)
{
bool passed = true;
visit_all(cpu_arg, gpu_arg)([&](auto cpu, auto gpu) {
double error;
passed = verify_range(cpu, gpu, tolerance, &error);
if(not passed)
{
// TODO: Check for nans
std::cout << "FAILED: " << name << std::endl;
std::cout << "error: " << error << std::endl;
if(cpu.size() < 32)
std::cout << "cpu:" << cpu << std::endl;
if(gpu.size() < 32)
std::cout << "gpu:" << gpu << std::endl;
if(range_zero(cpu))
std::cout << "Cpu data is all zeros" << std::endl;
if(range_zero(gpu))
std::cout << "Gpu data is all zeros" << std::endl;
auto mxdiff = max_diff(cpu, gpu);
std::cout << "Max diff: " << mxdiff << std::endl;
auto idx = mismatch_idx(cpu, gpu, float_equal);
if(idx < range_distance(cpu))
{
std::cout << "Mismatch at " << idx << ": " << cpu[idx] << " != " << gpu[idx]
<< std::endl;
}
auto cpu_nan_idx = find_idx(cpu, not_finite);
if(cpu_nan_idx >= 0)
std::cout << "Non finite number found in cpu at " << cpu_nan_idx << ": "
<< cpu[cpu_nan_idx] << std::endl;
auto gpu_nan_idx = find_idx(gpu, not_finite);
if(gpu_nan_idx >= 0)
std::cout << "Non finite number found in gpu at " << gpu_nan_idx << ": "
<< gpu[gpu_nan_idx] << std::endl;
std::cout << std::endl;
}
else
{
if(range_zero(cpu))
std::cout << "Cpu data is all zeros" << std::endl;
if(range_zero(gpu))
std::cout << "Gpu data is all zeros" << std::endl;
// auto mxdiff = max_diff(cpu, gpu);
// std::cout << "Max diff: " << mxdiff << std::endl;
// auto idx = mismatch_idx(cpu, gpu, float_equal);
// if(idx < range_distance(cpu))
// {
// std::cout << "Mismatch at " << idx << ": " << cpu[idx] << " != " << gpu[idx]
// << std::endl;
// }
auto cpu_nan_idx = find_idx(cpu, not_finite);
if(cpu_nan_idx >= 0)
std::cout << "Non finite number found in cpu at " << cpu_nan_idx << ": "
<< cpu[cpu_nan_idx] << std::endl;
auto gpu_nan_idx = find_idx(gpu, not_finite);
if(gpu_nan_idx >= 0)
std::cout << "Non finite number found in gpu at " << gpu_nan_idx << ": "
<< gpu[gpu_nan_idx] << std::endl;
// std::cout << std::endl;
}
});
return passed;
}
bool verify_args(const std::string& name,
const argument& cpu_arg,
const argument& gpu_arg,
double tolerance = 80);
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......
@@ -8,57 +8,132 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
template <class T>
struct vector_type
{
};
template <class T, index_int N>
struct vector_type<vec<T, N>>
{
using type = T;
};
template <class T>
using vector_type_t = typename vector_type<T>::type;
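The trait just recovers the element type of a device vector; illustratively:

    static_assert(std::is_same<vector_type_t<vec<float, 4>>, float>{}, "");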
// m = x - mean(x)
// m / sqrt(mean(m ^ 2) + 1e-12)
void layernorm(hipStream_t stream, const argument& result, const argument& arg1)
template <index_int N>
void layernorm_vec_impl(hipStream_t stream,
const argument& result,
const argument& arg1,
index_int nelements,
index_int relements)
{
auto relements = arg1.get_shape().lens().back();
assert(relements <= 1024);
auto nelements = result.get_shape().elements() / relements;
auto input_shape = arg1.get_shape();
auto output_shape = result.get_shape();
auto reduce_output_lens(output_shape.lens());
reduce_output_lens.back() = 1;
hip_vec_visit_all<N>(result, arg1)([&](auto output, auto input) {
using value_type = typename decltype(input)::value_type;
const auto relements_v = relements / N;
const std::size_t max_block_size = 256;
const std::size_t block_size = compute_block_size(relements_v, max_block_size);
const std::size_t block_size_div = encode_divisor(block_size);
assert(relements_v <= block_size);
gs_launch(stream, nelements * block_size, block_size)([=](auto i, auto idx) __device__ {
const auto out_idx = fast_div(i, block_size_div);
const auto base_idx = out_idx * relements_v;
const auto input_idx = base_idx + idx.local;
const bool in_range = idx.local < relements_v;
auto mean = [&](auto z) {
auto psum = block_reduce<max_block_size>(
idx, sum{}, value_type(0), relements_v, [=](auto) { return z; });
vector_type_t<value_type> sum = 0;
for(index_int k = 0; k < N; k++)
sum += psum[k];
return sum / relements;
};
std::vector<index_int> reduce_lens = get_reduce_lens(input_shape.lens(), reduce_output_lens);
// m = x - mean(x)
value_type x = in_range ? input.data()[input_idx] : 0;
value_type m = x - mean(x);
// mean(m ^ 2) + 1e-12
value_type r = mean(m * m) + value_type(1e-12);
// rsqrt(mean(m ^ 2) + 1e-12)
value_type d = 0;
for(index_int k = 0; k < N; k++)
d[k] = ::rsqrt(r[k]);
// m * rsqrt(mean(m ^ 2) + 1e-12)
if(in_range)
output.data()[input_idx] = m * d;
});
});
}
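Both kernels replace the per-iteration `i / block_size` with `fast_div(i, block_size_div)`, multiplying by a reciprocal precomputed once on the host. A minimal sketch of the idea, not MIGraphX's actual encode_divisor/fast_div:

    #include <cstdint>

    // Precompute ceil(2^32 / divisor); one multiply and a shift then replace the divide.
    constexpr std::uint64_t encode_divisor(std::uint32_t divisor) // divisor > 0
    {
        return (std::uint64_t{1} << 32) / divisor + 1;
    }

    // Exact whenever x * divisor < 2^32; wider intermediates cover the full range.
    constexpr std::uint32_t fast_div(std::uint32_t x, std::uint64_t encoded)
    {
        return static_cast<std::uint32_t>((x * encoded) >> 32);
    }

    static_assert(fast_div(1000, encode_divisor(256)) == 1000 / 256, "");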
void layernorm_impl(hipStream_t stream,
const argument& result,
const argument& arg1,
index_int nelements,
index_int relements)
{
hip_visit_all(result, arg1)([&](auto output, auto input) {
using value_type = typename decltype(input)::value_type;
const std::size_t max_block_size = 256;
const std::size_t block_size = compute_block_size(relements, max_block_size);
const std::size_t block_size_div = encode_divisor(block_size);
assert(relements <= block_size);
gs_launch(stream, nelements * block_size, block_size)([=](auto i, auto idx) __device__ {
const auto out_idx = i / block_size;
const auto base_idx = out_idx * relements;
value_type x_data[4];
auto x = [&](auto j) -> value_type& {
return x_data[fast_div(j - idx.local, block_size_div)];
};
idx.local_stride(relements,
[&](auto j) __device__ { x(j) = input.data()[base_idx + j]; });
const auto out_idx = fast_div(i, block_size_div);
const auto base_idx = out_idx * relements;
const auto input_idx = base_idx + idx.local;
const bool in_range = idx.local < relements;
auto m = block_reduce<max_block_size>(
idx, sum{}, 0, relements, [&](auto j) __device__ { return x(j); }) /
relements;
idx.local_stride(relements, [&](auto j) __device__ { x(j) = x(j) - m; });
auto mean = [&](auto z) {
return block_reduce<max_block_size>(idx,
sum{},
value_type(0),
relements,
[=](auto) { return in_range ? z : 0; }) /
relements;
};
auto r = block_reduce<max_block_size>(
idx, sum{}, 0, relements, [&](auto j) __device__ { return x(j) * x(j); }) /
relements;
// m = x - mean(x)
value_type x = in_range ? input.data()[input_idx] : 0;
value_type m = x - mean(x);
idx.local_stride(relements, [&](auto j) __device__ {
output.data()[base_idx + j] = x(j) * ::rsqrt(r + 1e-12);
});
// mean(m ^ 2) + 1e-12
value_type r = mean(m * m) + 1e-12;
// m * rsqrt(mean(m ^ 2) + 1e-12)
if(in_range)
output.data()[input_idx] = m * ::rsqrt(r);
});
});
}
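Both kernels lean on `block_reduce` for the row-wide sums. A simplified sketch of such a primitive, not MIGraphX's implementation (assumes blockDim.x is a power of two and at most MaxBlock):

    template <unsigned MaxBlock, class T, class Op, class F>
    __device__ T block_reduce_sketch(unsigned local, Op op, T init, unsigned n, F f)
    {
        __shared__ T buf[MaxBlock];
        T v = init;
        for(unsigned j = local; j < n; j += blockDim.x) // strided partial reduction
            v = op(v, f(j));
        buf[local] = v;
        __syncthreads();
        for(unsigned s = blockDim.x / 2; s > 0; s /= 2) // shared-memory tree
        {
            if(local < s)
                buf[local] = op(buf[local], buf[local + s]);
            __syncthreads();
        }
        return buf[0]; // every thread sees the block-wide result
    }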
void layernorm(hipStream_t stream, const argument& result, const argument& arg1)
{
auto relements = arg1.get_shape().lens().back();
auto nelements = result.get_shape().elements() / relements;
auto output_shape = result.get_shape();
auto reduce_output_lens(output_shape.lens());
reduce_output_lens.back() = 1;
if((relements % 4) == 0)
layernorm_vec_impl<4>(stream, result, arg1, nelements, relements);
else if(relements < 256)
layernorm_impl(stream, result, arg1, nelements, relements);
else
MIGRAPHX_THROW("No kernel for layernorm");
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
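Dispatch picks the vectorized kernel whenever the row length is a multiple of 4 (up to 1024 elements, per the assert) and falls back to the scalar kernel below 256. For reference, the math both kernels implement, as a plain host-side sketch over one row of relements values:

    #include <cmath>
    #include <vector>

    // y = (x - mean(x)) * rsqrt(mean((x - mean(x))^2) + 1e-12)
    std::vector<float> layernorm_row(const std::vector<float>& x)
    {
        float mean = 0;
        for(float v : x)
            mean += v;
        mean /= x.size();
        float var = 0; // mean of squared deviations
        for(float v : x)
            var += (v - mean) * (v - mean);
        var /= x.size();
        const float d = 1.0f / std::sqrt(var + 1e-12f);
        std::vector<float> y(x.size());
        for(std::size_t i = 0; i < x.size(); i++)
            y[i] = (x[i] - mean) * d;
        return y;
    }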
......
@@ -233,6 +233,8 @@ MIGRAPHX_REGISTER_OP(hip_add_tanh)
struct hip_layernorm : unary_device<hip_layernorm, &device::layernorm>
{
// Empty finalize to skip dimension reduction
void finalize(context&, const shape&, const std::vector<shape>&) {}
};
MIGRAPHX_REGISTER_OP(hip_layernorm)
@@ -326,6 +328,15 @@ struct find_layernorm
auto x_ins = r.instructions["x"];
auto args = ins->inputs();
// We don't fuse for non-standard layouts
if(not x_ins->get_shape().standard())
return;
auto relements = x_ins->get_shape().lens().back();
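// Stay within the kernel limits: the vectorized path needs relements % 4 == 0
// (up to 1024 elements); the scalar path covers the rest below 256.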
if(relements > 1024 or (relements % 4 != 0 and relements > 256))
return;
p.replace_instruction(ins, hip_layernorm{}, x_ins, args.back());
}
};
......
#include <migraphx/verify_args.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
bool verify_args(const std::string& name,
const argument& cpu_arg,
const argument& gpu_arg,
double tolerance)
{
bool passed = true;
visit_all(cpu_arg, gpu_arg)([&](auto cpu, auto gpu) {
double error;
passed = verify_range(cpu, gpu, tolerance, &error);
if(not passed)
{
// TODO: Check for nans
std::cout << "FAILED: " << name << std::endl;
std::cout << "error: " << error << std::endl;
if(cpu.size() < 32)
std::cout << "cpu:" << cpu << std::endl;
if(gpu.size() < 32)
std::cout << "gpu:" << gpu << std::endl;
if(range_zero(cpu))
std::cout << "Cpu data is all zeros" << std::endl;
if(range_zero(gpu))
std::cout << "Gpu data is all zeros" << std::endl;
auto mxdiff = max_diff(cpu, gpu);
std::cout << "Max diff: " << mxdiff << std::endl;
auto idx = mismatch_idx(cpu, gpu, float_equal);
if(idx < range_distance(cpu))
{
std::cout << "Mismatch at " << idx << ": " << cpu[idx] << " != " << gpu[idx]
<< std::endl;
}
auto cpu_nan_idx = find_idx(cpu, not_finite);
if(cpu_nan_idx >= 0)
std::cout << "Non finite number found in cpu at " << cpu_nan_idx << ": "
<< cpu[cpu_nan_idx] << std::endl;
auto gpu_nan_idx = find_idx(gpu, not_finite);
if(gpu_nan_idx >= 0)
std::cout << "Non finite number found in gpu at " << gpu_nan_idx << ": "
<< gpu[gpu_nan_idx] << std::endl;
std::cout << std::endl;
}
else
{
if(range_zero(cpu))
std::cout << "Cpu data is all zeros" << std::endl;
if(range_zero(gpu))
std::cout << "Gpu data is all zeros" << std::endl;
// auto mxdiff = max_diff(cpu, gpu);
// std::cout << "Max diff: " << mxdiff << std::endl;
// auto idx = mismatch_idx(cpu, gpu, float_equal);
// if(idx < range_distance(cpu))
// {
// std::cout << "Mismatch at " << idx << ": " << cpu[idx] << " != " << gpu[idx]
// << std::endl;
// }
auto cpu_nan_idx = find_idx(cpu, not_finite);
if(cpu_nan_idx >= 0)
std::cout << "Non finite number found in cpu at " << cpu_nan_idx << ": "
<< cpu[cpu_nan_idx] << std::endl;
auto gpu_nan_idx = find_idx(gpu, not_finite);
if(gpu_nan_idx >= 0)
std::cout << "Non finite number found in gpu at " << gpu_nan_idx << ": "
<< gpu[gpu_nan_idx] << std::endl;
// std::cout << std::endl;
}
});
return passed;
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
@@ -1028,33 +1028,50 @@ struct test_triadd_tanh : verify_program<test_triadd_tanh>
}
};
migraphx::instruction_ref add_layernorm(migraphx::program& p, std::vector<size_t> dims)
{
auto x = p.add_parameter("x", migraphx::shape{migraphx::shape::float_type, dims});
auto scale =
p.add_parameter("scale", migraphx::shape{migraphx::shape::float_type, {dims.back()}});
auto bias =
p.add_parameter("bias", migraphx::shape{migraphx::shape::float_type, {dims.back()}});
auto epsilon = p.add_literal(1e-12f);
auto exponent = p.add_literal(2.0f);
auto mean = p.add_instruction(migraphx::op::reduce_mean({2}), x);
auto mean_mbcast = p.add_instruction(migraphx::op::multibroadcast{{dims}}, mean);
auto sub = p.add_instruction(migraphx::op::sub{}, x, mean_mbcast);
auto exponent_mbcast = p.add_instruction(migraphx::op::multibroadcast{{dims}}, exponent);
auto pow = p.add_instruction(migraphx::op::pow{}, sub, exponent_mbcast);
auto var = p.add_instruction(migraphx::op::reduce_mean({2}), pow);
auto epsilon_mbcast =
p.add_instruction(migraphx::op::multibroadcast{{1, dims.at(1), 1}}, epsilon);
auto add_epsilon = p.add_instruction(migraphx::op::add{}, var, epsilon_mbcast);
auto sqrt = p.add_instruction(migraphx::op::sqrt{}, add_epsilon);
auto sqrt_mbcast = p.add_instruction(migraphx::op::multibroadcast{dims}, sqrt);
auto div = p.add_instruction(migraphx::op::div{}, sub, sqrt_mbcast);
auto scale_mbcast = p.add_instruction(migraphx::op::multibroadcast{dims}, scale);
auto mul = p.add_instruction(migraphx::op::mul{}, scale_mbcast, div);
auto bias_mbcast = p.add_instruction(migraphx::op::multibroadcast{dims}, bias);
return p.add_instruction(migraphx::op::add{}, mul, bias_mbcast);
}
struct test_layernorm : verify_program<test_layernorm>
{
migraphx::program create_program() const
{
migraphx::program p;
std::vector<size_t> dims{1, 1, 5};
auto x = p.add_parameter("x", migraphx::shape{migraphx::shape::float_type, dims});
auto scale = p.add_parameter("scale", migraphx::shape{migraphx::shape::float_type, {5}});
auto bias = p.add_parameter("bias", migraphx::shape{migraphx::shape::float_type, {5}});
auto epsilon = p.add_literal(1e-12f);
auto exponent = p.add_literal(migraphx::literal{
migraphx::shape{migraphx::shape::float_type, {1, 1, 5}}, {2, 2, 2, 2, 2}});
auto mean = p.add_instruction(migraphx::op::reduce_mean({2}), x);
auto mean_mbcast = p.add_instruction(migraphx::op::multibroadcast{{dims}}, mean);
auto sub = p.add_instruction(migraphx::op::sub{}, x, mean_mbcast);
auto pow = p.add_instruction(migraphx::op::pow{}, sub, exponent);
auto var = p.add_instruction(migraphx::op::reduce_mean({2}), pow);
auto epsilon_mbcast = p.add_instruction(migraphx::op::multibroadcast{{1, 1, 1}}, epsilon);
auto add_epsilon = p.add_instruction(migraphx::op::add{}, var, epsilon_mbcast);
auto sqrt = p.add_instruction(migraphx::op::sqrt{}, add_epsilon);
auto sqrt_mbcast = p.add_instruction(migraphx::op::multibroadcast{dims}, sqrt);
auto div = p.add_instruction(migraphx::op::div{}, sub, sqrt_mbcast);
auto scale_mbcast = p.add_instruction(migraphx::op::multibroadcast{dims}, scale);
auto mul = p.add_instruction(migraphx::op::mul{}, scale_mbcast, div);
auto bias_mbcast = p.add_instruction(migraphx::op::multibroadcast{dims}, bias);
p.add_instruction(migraphx::op::add{}, mul, bias_mbcast);
add_layernorm(p, {1, 1, 5});
return p;
}
};
struct test_layernorm2 : verify_program<test_layernorm2>
{
migraphx::program create_program() const
{
migraphx::program p;
add_layernorm(p, {1, 4, 24});
return p;
}
};
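A further case one might add on the same pattern (hypothetical, not part of this commit), exercising the widest row the vectorized kernel accepts:

    struct test_layernorm_1024 : verify_program<test_layernorm_1024>
    {
        migraphx::program create_program() const
        {
            migraphx::program p;
            add_layernorm(p, {1, 2, 1024}); // 1024 % 4 == 0 and <= 1024
            return p;
        }
    };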
......