Commit 25d7fde8 authored by gaoqiong
Browse files

lite

parent 8439d29f
......@@ -440,7 +440,7 @@ Status call_reduce_matrix_rows(hipStream_t stream, const TIn* input, TOut* outpu
HIP_RETURN_IF_ERROR(hipMemsetAsync(output, 0, n * sizeof(TOut), stream));
}
-constexpr int max_num_threads_in_block = 512;
+constexpr int max_num_threads_in_block = 256;
constexpr int max_num_blocks_in_grid = 512;
constexpr int load_count_per_thread = 4;
......
......@@ -61,6 +61,8 @@ class ReduceKernel : public RocmKernel, public ReduceKernelBase<allow_multi_axes
template <typename T, miopenReduceTensorIndices_t ReduceTensorIndices = MIOPEN_REDUCE_TENSOR_NO_INDICES>
Status ComputeImplEx(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const;
template <typename T, typename OutT, miopenReduceTensorIndices_t ReduceTensorIndices>
Status ReduceKernelShared(
const T* X,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment