Commit 277ae59c authored by Shucai Xiao
Browse files

Merge branch 'int8_quantize' into op_capture

parents 43d39b4e 4cf9bd01
...@@ -22,8 +22,8 @@ void convert(hipStream_t stream, ...@@ -22,8 +22,8 @@ void convert(hipStream_t stream,
gs_launch(stream, result.get_shape().elements())([=](auto i) { gs_launch(stream, result.get_shape().elements())([=](auto i) {
float res = input_ptr[i] * scale + shift; float res = input_ptr[i] * scale + shift;
int factor = (res > 0) ? 1 : -1; int factor = (res > 0) ? 1 : -1;
output_ptr[i] = output_ptr[i] = static_cast<int8_t>(
std::min<int8_t>(std::max<float>(-128, res + factor * 0.5), 127); std::min<float>(std::max<float>(-128.0f, res + factor * 0.5), 127.0f));
}); });
} }
else else
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment