Commit 992f57ba authored by Shucai Xiao

backup changes

parent 4f07b8f1
@@ -47,25 +47,25 @@ void auto_contiguous::apply(module& p) const
         }
     }
-    // if ops used as output param are alias 0, add a contiguous for the output
-    // so return outputs with standard shape
-    if(last->name() == "@return")
-    {
-        auto inputs = last->inputs();
-        for(auto ins : inputs)
-        {
-            if(ins->name() == "contiguous")
-                continue;
-            auto ins_alias = ins->get_operator().output_alias({});
-            if(ins_alias == 0 and ins->get_shape().element_space() !=
-                                      ins->inputs().front()->get_shape().element_space())
-            {
-                auto cont_ins = p.insert_instruction(last, make_op("contiguous"), ins);
-                p.replace_instruction(ins, cont_ins);
-            }
-        }
-    }
+    // // if ops used as output param are alias 0, add a contiguous for the output
+    // // so return outputs with standard shape
+    // if(last->name() == "@return")
+    // {
+    //     auto inputs = last->inputs();
+    //     for(auto ins : inputs)
+    //     {
+    //         if(ins->name() == "contiguous")
+    //             continue;
+    //         auto ins_alias = ins->get_operator().output_alias({});
+    //         if(ins_alias == 0 and ins->get_shape().element_space() !=
+    //                                   ins->inputs().front()->get_shape().element_space())
+    //         {
+    //             auto cont_ins = p.insert_instruction(last, make_op("contiguous"), ins);
+    //             p.replace_instruction(ins, cont_ins);
+    //         }
+    //     }
+    // }
 }
 } // namespace MIGRAPHX_INLINE_NS
...
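The hunk above disables the pass logic that forced returned values into a standard shape: when an instruction aliases its input buffer (output_alias 0) but the two shapes differ in element_space, a contiguous copy was inserted before @return. The test that motivates such a copy amounts to checking whether a tensor's strides are the packed row-major strides. A minimal stand-alone sketch of that check (hypothetical is_standard helper, not MIGraphX's shape API):

#include <cstdint>
#include <vector>

// Hypothetical helper, not MIGraphX's shape class: a shape is "standard" when
// its strides are exactly the packed row-major strides, meaning the buffer has
// no padding, broadcasting, or transposition.
bool is_standard(const std::vector<int64_t>& lens, const std::vector<int64_t>& strides)
{
    int64_t expected = 1;
    for(int i = static_cast<int>(lens.size()) - 1; i >= 0; --i)
    {
        if(strides[i] != expected)
            return false;
        expected *= lens[i];
    }
    return true;
}

For example, a 2x3 tensor viewed through a transpose has lens {3, 2} and strides {1, 3}; is_standard returns false, which is exactly the situation where a contiguous copy would repack the data before it is returned.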
@@ -66,7 +66,7 @@ struct reduce_op : op_name<Derived>
     {
         value normalize;
         normalize["axes"] = value::array{normalize_attribute::include_min};
-        return {{"normalize_axes", normalize}, {"std_shape", true}};
+        return {{"normalize_axes", normalize}};
     }
     std::vector<int64_t> tune_axes(std::size_t n_dim) const
...
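The reduce_op hunk drops the std_shape attribute but keeps normalize_axes, whose include_min flag admits negative axes down to -n_dim. The canonicalization it requests is the usual wrap-around of negative axes; a minimal sketch of that idea under the stated assumption (hypothetical free function, not the library's normalization code):

#include <cstdint>
#include <stdexcept>
#include <vector>

// Hypothetical sketch of negative-axis normalization: -1 means the last
// dimension, and anything outside [-n_dim, n_dim) is rejected.
std::vector<int64_t> normalize_reduce_axes(std::vector<int64_t> axes, std::size_t n_dim)
{
    for(auto& axis : axes)
    {
        if(axis < 0)
            axis += static_cast<int64_t>(n_dim);
        if(axis < 0 or axis >= static_cast<int64_t>(n_dim))
            throw std::out_of_range("reduce axis out of range");
    }
    return axes;
}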
@@ -120,17 +120,17 @@ struct find_nop_reshapes
     void apply(module& p, const match::matcher_result& mr) const
     {
         auto ins = mr.result;
-        // output of reshape and contiguous is standard, so no need to add another contiguous
-        // if the output is used an a ret value
-        if(ins->name() == "contiguous" and ins->name() != "contiguous" and ins->name() != "reshape")
-        {
-            auto& outputs = ins->outputs();
-            if(std::any_of(
-                   outputs.begin(), outputs.end(), [&](auto o) { return o->name() == "@return"; }))
-            {
-                return;
-            }
-        }
+        // // output of reshape and contiguous is standard, so no need to add another contiguous
+        // // if the output is used an a ret value
+        // if(ins->name() == "contiguous" and ins->name() != "contiguous" and ins->name() != "reshape")
+        // {
+        //     auto& outputs = ins->outputs();
+        //     if(std::any_of(
+        //            outputs.begin(), outputs.end(), [&](auto o) { return o->name() == "@return"; }))
+        //     {
+        //         return;
+        //     }
+        // }
         p.replace_instruction(ins, ins->inputs().front());
     }
 };
...
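The guard commented out above skipped the no-op rewrite whenever the matched instruction fed the module's @return, scanning its consumers with std::any_of. A toy stand-alone version of that scan (hypothetical feeds_return helper over consumer op names):

#include <algorithm>
#include <string>
#include <vector>

// Hypothetical helper mirroring the disabled guard: true when any consumer of
// the matched instruction is the module's @return pseudo-instruction.
bool feeds_return(const std::vector<std::string>& consumer_names)
{
    return std::any_of(consumer_names.begin(), consumer_names.end(),
                       [](const std::string& name) { return name == "@return"; });
}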
@@ -36,7 +36,7 @@ struct half2_max
 // in_data is in shared memory
 template <class Op>
 __device__ __half2
-block_reduce(__half2* buffer, index_int batch_item_num, index_int tid, index_int block_size, Op op)
+block_reduce_half2(__half2* buffer, index_int batch_item_num, index_int tid, index_int block_size, Op op)
 {
     __syncthreads();
     for(index_int s = block_size; s > 0; s >>= 1)
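block_reduce_half2 follows the classic shared-memory tree reduction: each round halves the stride, every active thread folds in its partner's element, and __syncthreads() separates the rounds. A generic float sketch of the pattern (hypothetical reduce_shared, assuming blockDim.x is a power of two and n <= blockDim.x; note the canonical form starts the stride at blockDim.x / 2, as the __half variant further below does):

#include <cuda_runtime.h>

struct max_op { __device__ float operator()(float a, float b) const { return fmaxf(a, b); } };
struct sum_op { __device__ float operator()(float a, float b) const { return a + b; } };

// Tree reduction over the first n slots of buf (shared memory); the result
// lands in buf[0] and is handed back to every thread in the block.
template <class Op>
__device__ float reduce_shared(float* buf, int n, Op op)
{
    __syncthreads(); // make pending writes to buf visible
    for(int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if(threadIdx.x < s and threadIdx.x + s < n)
            buf[threadIdx.x] = op(buf[threadIdx.x], buf[threadIdx.x + s]);
        __syncthreads();
    }
    float result = buf[0];
    __syncthreads(); // every thread reads the result before buf is reused
    return result;
}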
@@ -55,7 +55,7 @@ block_reduce(__half2* buffer, index_int batch_item_num, index_int tid, index_int
 }
 __global__ void
-softmax_kernel(void* data_in, index_int batch_item_num, index_int block_size, void* data_out)
+softmax_kernel_half2(void* data_in, index_int batch_item_num, index_int block_size, void* data_out)
 {
     __half2* input = reinterpret_cast<__half2*>(data_in);
     __half2* output = reinterpret_cast<__half2*>(data_out);
@@ -73,7 +73,7 @@ softmax_kernel(void* data_in, index_int batch_item_num, index_int block_size, vo
     }
     auto batch_max =
-        block_reduce(in_data_reduce, batch_item_num, threadIdx.x, block_size, half2_max{});
+        block_reduce_half2(in_data_reduce, batch_item_num, threadIdx.x, block_size, half2_max{});
     for(int i = threadIdx.x; i < batch_item_num; i += block_size)
     {
@@ -82,7 +82,7 @@ softmax_kernel(void* data_in, index_int batch_item_num, index_int block_size, vo
     }
     auto batch_sum =
-        block_reduce(in_data_reduce, batch_item_num, threadIdx.x, block_size, half2_sum{});
+        block_reduce_half2(in_data_reduce, batch_item_num, threadIdx.x, block_size, half2_sum{});
     for(int i = threadIdx.x; i < batch_item_num; i += block_size)
     {
@@ -93,7 +93,7 @@ softmax_kernel(void* data_in, index_int batch_item_num, index_int block_size, vo
 // in_data is in shared memory
 template <class Op>
 __device__ __half
-block_reduce2(__half* data, index_int batch_item_num, index_int tid, index_int block_size, Op op)
+block_reduce_half(__half* data, index_int batch_item_num, index_int tid, index_int block_size, Op op)
 {
     __syncthreads();
     for(index_int s = block_size / 2; s > 0; s >>= 1)
@@ -109,7 +109,7 @@ block_reduce2(__half* data, index_int batch_item_num, index_int tid, index_int b
 }
 __global__ void
-softmax_kernel2(void* data_in, index_int batch_item_num, index_int block_size, void* data_out)
+softmax_kernel_half(void* data_in, index_int batch_item_num, index_int block_size, void* data_out)
 {
     __half* input = reinterpret_cast<__half*>(data_in);
     __half* output = reinterpret_cast<__half*>(data_out);
@@ -125,14 +125,14 @@ softmax_kernel2(void* data_in, index_int batch_item_num, index_int block_size, v
         in_data_reduce[i] = d;
     }
-    auto batch_max = block_reduce2(in_data_reduce, batch_item_num, threadIdx.x, block_size, max{});
+    auto batch_max = block_reduce_half(in_data_reduce, batch_item_num, threadIdx.x, block_size, max{});
     for(int i = threadIdx.x; i < batch_item_num; i += block_size)
     {
         in_data[i] = __float2half(::exp(__half2float(in_data[i]) - __half2float(batch_max)));
         in_data_reduce[i] = in_data[i];
     }
-    auto batch_sum = block_reduce2(in_data_reduce, batch_item_num, threadIdx.x, block_size, sum{});
+    auto batch_sum = block_reduce_half(in_data_reduce, batch_item_num, threadIdx.x, block_size, sum{});
     for(int i = threadIdx.x; i < batch_item_num; i += block_size)
     {
         output[i + start] = __float2half(__half2float(in_data[i]) / __half2float(batch_sum));
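Taken together, these kernels implement the numerically stable softmax: reduce to the row maximum, subtract it before exponentiating (so exp never overflows in half precision), reduce the exponentials to a sum, then divide. A float sketch of the same pipeline, reusing reduce_shared, max_op, and sum_op from the reduction sketch above, with one block per row and n <= blockDim.x assumed:

// One block per row; dynamic shared memory holds two n-element arrays, the
// row's values and a scratch array consumed by the reductions, mirroring the
// in_data / in_data_reduce split in the kernels above.
__global__ void softmax_row_sketch(const float* in, float* out, int n)
{
    extern __shared__ float smem[];
    float* vals    = smem;     // the row, updated in place
    float* scratch = smem + n; // reduction workspace
    const float* row_in = in + static_cast<size_t>(blockIdx.x) * n;
    float* row_out      = out + static_cast<size_t>(blockIdx.x) * n;
    for(int i = threadIdx.x; i < n; i += blockDim.x)
    {
        vals[i]    = row_in[i];
        scratch[i] = vals[i];
    }
    float row_max = reduce_shared(scratch, n, max_op{});
    for(int i = threadIdx.x; i < n; i += blockDim.x)
    {
        vals[i]    = expf(vals[i] - row_max); // stable: the exponent is <= 0
        scratch[i] = vals[i];
    }
    float row_sum = reduce_shared(scratch, n, sum_op{});
    for(int i = threadIdx.x; i < n; i += blockDim.x)
        row_out[i] = vals[i] / row_sum;
}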
@@ -161,7 +161,7 @@ void softmax(hipStream_t stream, const argument& result, const argument& arg, in
         int block_num = batch_shape.elements();
         int shared_size = batch_item_num * 2 * result.get_shape().type_size();
         half2_block_size = half2_block_size / 4;
-        softmax_kernel<<<block_num, half2_block_size, shared_size, stream>>>(
+        softmax_kernel_half2<<<block_num, half2_block_size, shared_size, stream>>>(
             arg.data(), batch_item_num, half2_block_size, result.data());
     }
     else
...
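The host hunk shows the launch pattern: one block per batch item (block_num = batch_shape.elements()) and dynamic shared memory sized for two copies of the row (batch_item_num * 2 * type_size), matching the value/scratch split inside the kernels. A hedged host-side launch of the float sketch above (hypothetical wrapper; the division of half2_block_size by 4 is specific to the packed-half path and is not modeled here):

#include <cuda_runtime.h>

// Hypothetical launcher for softmax_row_sketch: one block per row, a
// power-of-two block size no smaller than n (as the tree reduction assumes,
// and assuming n <= 1024 so the block fits the hardware limit), and shared
// memory for the two n-float arrays.
void launch_softmax_sketch(const float* d_in, float* d_out, int rows, int n, cudaStream_t stream)
{
    int block_size = 1;
    while(block_size < n)
        block_size <<= 1;
    size_t shared_size = 2u * static_cast<size_t>(n) * sizeof(float);
    softmax_row_sketch<<<rows, block_size, shared_size, stream>>>(d_in, d_out, n);
}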