Commit 1af47f3e authored by Shucai Xiao

Merge branch 'opt_log_softmax' into argmax_min

parents 099e9ce8 5384c7d7
@@ -76,13 +76,13 @@ device_type<T>* device_cast(T* x)
 }
 template <class T>
-T to_hip_type(T x)
+__device__ __host__ T to_hip_type(T x)
 {
     return x;
 }
 // Hip doesn't support __fp16
-inline float to_hip_type(gpu_half x) { return x; }
+inline __device__ __host__ float to_hip_type(gpu_half x) { return x; }
 } // namespace device
 } // namespace gpu
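The change above adds `__device__ __host__` so `to_hip_type` can be called from both host code and HIP kernels. A minimal sketch of that dual-qualification pattern — `HOST_DEVICE` and `half_sketch` are illustrative stand-ins, not MIGraphX names:

```cpp
#include <iostream>

// Illustrative stand-in for HIP's qualifiers: under a HIP/CUDA compiler
// this expands to the real attributes, under a plain host compiler to nothing,
// so the same helper compiles for both targets.
#if defined(__HIPCC__) || defined(__CUDACC__)
#define HOST_DEVICE __device__ __host__
#else
#define HOST_DEVICE
#endif

// Generic pass-through, callable from host and device.
template <class T>
HOST_DEVICE T to_hip_type_sketch(T x)
{
    return x;
}

// Hypothetical half type: widened to float because device math overloads
// (::max, ::exp) are not provided for __fp16.
struct half_sketch
{
    float data;
    operator float() const { return data; }
};

HOST_DEVICE inline float to_hip_type_sketch(half_sketch x) { return x; }

int main()
{
    std::cout << to_hip_type_sketch(3) << " "
              << to_hip_type_sketch(half_sketch{1.5f}) << "\n";
}
```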
@@ -30,41 +30,102 @@ argument logsoftmax(hipStream_t stream,
     hip_tensor_descriptor<n_dim> desc_batch(batch_shape);
     hip_tensor_descriptor<n_dim> desc_data(output_shape);
-    // each thread is for one item in the batch
-    gs_launch(stream, batch_shape.elements())([=](auto i) {
-        auto batch_idx = desc_batch.multi(i);
-        auto data_idx  = batch_idx;
-        // get max
-        auto batch_max = input_ptr[desc_data.linear(batch_idx)];
-        for(std::size_t j = 1; j < num_in_batch; ++j)
-        {
-            data_idx[axis] = j;
-            size_t idx     = desc_data.linear(data_idx);
-            batch_max      = std::max(to_hip_type(batch_max), to_hip_type(input_ptr[idx]));
-        }
-        for(std::size_t j = 0; j < num_in_batch; ++j)
-        {
-            data_idx[axis] = j;
-            size_t idx      = desc_data.linear(data_idx);
-            output_ptr[idx] = input_ptr[idx] - batch_max;
-        }
-        auto batch_sum = ::exp(to_hip_type(output_ptr[desc_data.linear(batch_idx)]));
-        for(std::size_t j = 1; j < num_in_batch; ++j)
-        {
-            data_idx[axis] = j;
-            size_t idx = desc_data.linear(data_idx);
-            batch_sum += ::exp(to_hip_type(output_ptr[idx]));
-        }
-        batch_sum = ::log(to_hip_type(batch_sum));
-        for(std::size_t j = 0; j < num_in_batch; ++j)
-        {
-            data_idx[axis] = j;
-            size_t idx = desc_data.linear(data_idx);
-            output_ptr[idx] -= batch_sum;
-        }
-    });
+    // use one block for items in one batch.
+    // opt 1, load all data to lds then use the same approach as
+    // the current optimization
+    const size_t block_size = 1024;
+    launch(
+        stream, batch_shape.elements() * block_size, block_size)([=](auto idx) __device__ {
+        size_t thr_idx = idx.local;
+        size_t blk_idx = idx.group;
+        using type = device_type<std::remove_cv_t<typename decltype(output)::value_type>>;
+        // all data can be loaded to the lds once, so all operations are
+        // done in lds
+        MIGRAPHX_DEVICE_SHARED type lds_data[block_size + 2];
+        auto batch_idx = desc_batch.multi(blk_idx);
+        auto data_idx  = batch_idx;
+        // load data to lds and compute the batch max
+        size_t item_num      = num_in_batch;
+        lds_data[block_size] = input_ptr[0];
+        for(size_t i = thr_idx; i < num_in_batch; i += block_size)
+        {
+            data_idx[axis] = i;
+            lds_data[i]    = input_ptr[desc_data.linear(data_idx)];
+            __syncthreads();
+            auto size   = (item_num > block_size) ? block_size : item_num;
+            auto stride = (size + 1) / 2;
+            while(true)
+            {
+                if(thr_idx + stride < size)
+                {
+                    lds_data[thr_idx] = ::max(to_hip_type(lds_data[thr_idx]),
+                                              to_hip_type(lds_data[thr_idx + stride]));
+                }
+                __syncthreads();
+                size   = stride;
+                stride = (stride + 1) / 2;
+                if(size == 1)
+                    break;
+            }
+            if(thr_idx == 0)
+            {
+                lds_data[block_size] = (lds_data[0] < lds_data[block_size])
+                                           ? lds_data[block_size]
+                                           : lds_data[0];
+            }
+            __syncthreads();
+            item_num -= block_size;
+        }
+        const size_t block_size1 = block_size + 1;
+        lds_data[block_size1]    = 0;
+        item_num                 = num_in_batch;
+        for(size_t i = thr_idx; i < num_in_batch; i += block_size)
+        {
+            data_idx[axis] = i;
+            lds_data[i]    = input_ptr[desc_data.linear(data_idx)] - lds_data[block_size];
+            lds_data[i]    = ::exp(to_hip_type(lds_data[i]));
+            __syncthreads();
+            auto size   = (item_num > block_size) ? block_size : item_num;
+            auto stride = (size + 1) / 2;
+            while(true)
+            {
+                if(thr_idx + stride < size)
+                {
+                    lds_data[thr_idx] += lds_data[thr_idx + stride];
+                }
+                __syncthreads();
+                size   = stride;
+                stride = (stride + 1) / 2;
+                if(size == 1)
+                    break;
+            }
+            if(thr_idx == 0)
+            {
+                lds_data[block_size1] += lds_data[0];
+            }
+            __syncthreads();
+            item_num -= block_size;
+        }
+        auto log_batch_sum =
+            ::log(to_hip_type(lds_data[block_size1])) + lds_data[block_size];
+        item_num = num_in_batch;
+        for(size_t i = thr_idx; i < num_in_batch; i += block_size)
+        {
+            data_idx[axis]    = i;
+            size_t index      = desc_data.linear(data_idx);
+            output_ptr[index] = input_ptr[index] - log_batch_sum;
+        }
+    });
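Per batch, the rewritten kernel computes the numerically stable log-softmax: subtract the batch max, exponentiate and sum in LDS, then fold the max back into the log term (the kernel's `log_batch_sum` is `::log(sum) + max`). A host-side reference sketch of the same math, with hypothetical names:

```cpp
#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

// Host-side reference for one batch:
// out[i] = x[i] - (max + log(sum_j exp(x[j] - max))),
// matching the kernel's log_batch_sum = log(exp-sum) + batch-max.
std::vector<float> log_softmax_ref(const std::vector<float>& x)
{
    float batch_max = *std::max_element(x.begin(), x.end());
    float batch_sum = 0.0f;
    for(float v : x)
        batch_sum += std::exp(v - batch_max); // shifted to avoid overflow
    float log_batch_sum = std::log(batch_sum) + batch_max;
    std::vector<float> out;
    out.reserve(x.size());
    for(float v : x)
        out.push_back(v - log_batch_sum);
    return out;
}

int main()
{
    for(float v : log_softmax_ref({1.0f, 2.0f, 3.0f}))
        std::cout << v << " "; // ~ -2.4076 -1.4076 -0.4076
    std::cout << "\n";
}
```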
@@ -30,45 +30,98 @@ argument softmax(hipStream_t stream,
     hip_tensor_descriptor<n_dim> desc_batch(batch_shape);
     hip_tensor_descriptor<n_dim> desc_data(output_shape);
-    // each thread is for one item in the batch
-    gs_launch(stream, batch_shape.elements())([=](auto i) {
-        auto batch_idx = desc_batch.multi(i);
-        auto data_idx  = batch_idx;
-        // get max
-        auto batch_max = input_ptr[desc_data.linear(batch_idx)];
-        for(std::size_t j = 1; j < n_dims; ++j)
-        {
-            data_idx[axis] = j;
-            batch_max = std::max(to_hip_type(batch_max),
-                                 to_hip_type(input_ptr[desc_data.linear(data_idx)]));
-        }
-        for(std::size_t j = 0; j < n_dims; ++j)
-        {
-            data_idx[axis] = j;
-            auto idx        = desc_data.linear(data_idx);
-            output_ptr[idx] = input_ptr[idx] - batch_max;
-        }
-        for(std::size_t j = 0; j < n_dims; ++j)
-        {
-            data_idx[axis] = j;
-            auto idx        = desc_data.linear(data_idx);
-            output_ptr[idx] = exp(to_hip_type(output_ptr[idx]));
-        }
-        auto batch_sum = output_ptr[desc_data.linear(batch_idx)];
-        for(std::size_t j = 1; j < n_dims; ++j)
-        {
-            data_idx[axis] = j;
-            batch_sum += output_ptr[desc_data.linear(data_idx)];
-        }
-        for(std::size_t j = 0; j < n_dims; ++j)
-        {
-            data_idx[axis] = j;
-            auto idx        = desc_data.linear(data_idx);
-            output_ptr[idx] = output_ptr[idx] / batch_sum;
-        }
-    });
+    // use one block for items in one batch.
+    const size_t block_size = 1024;
+    launch(
+        stream, batch_shape.elements() * block_size, block_size)([=](auto idx) __device__ {
+        size_t thr_idx = idx.local;
+        size_t blk_idx = idx.group;
+        using type = device_type<std::remove_cv_t<typename decltype(output)::value_type>>;
+        // all data can be loaded to the lds once, so all operations are
+        // done in lds
+        MIGRAPHX_DEVICE_SHARED type lds_data[block_size + 2];
+        auto batch_idx = desc_batch.multi(blk_idx);
+        auto data_idx  = batch_idx;
+        // load data to lds and compute the batch max
+        size_t item_num      = n_dims;
+        lds_data[block_size] = input_ptr[0];
+        for(size_t i = thr_idx; i < n_dims; i += block_size)
+        {
+            data_idx[axis] = i;
+            lds_data[i]    = input_ptr[desc_data.linear(data_idx)];
+            __syncthreads();
+            auto size   = (item_num > block_size) ? block_size : item_num;
+            auto stride = (size + 1) / 2;
+            while(true)
+            {
+                if(thr_idx + stride < size)
+                {
+                    lds_data[thr_idx] = ::max(to_hip_type(lds_data[thr_idx]),
+                                              to_hip_type(lds_data[thr_idx + stride]));
+                }
+                __syncthreads();
+                size   = stride;
+                stride = (stride + 1) / 2;
+                if(size == 1)
+                    break;
+            }
+            if(thr_idx == 0)
+            {
+                lds_data[block_size] = (lds_data[0] < lds_data[block_size])
+                                           ? lds_data[block_size]
+                                           : lds_data[0];
+            }
+            __syncthreads();
+            item_num -= block_size;
+        }
+        const size_t block_size1 = block_size + 1;
+        lds_data[block_size1]    = 0;
+        item_num                 = n_dims;
+        for(size_t i = thr_idx; i < n_dims; i += block_size)
+        {
+            data_idx[axis] = i;
+            lds_data[i]    = input_ptr[desc_data.linear(data_idx)] - lds_data[block_size];
+            lds_data[i]    = ::exp(to_hip_type(lds_data[i]));
+            __syncthreads();
+            auto size   = (item_num > block_size) ? block_size : item_num;
+            auto stride = (size + 1) / 2;
+            while(true)
+            {
+                if(thr_idx + stride < size)
+                {
+                    lds_data[thr_idx] += lds_data[thr_idx + stride];
+                }
+                __syncthreads();
+                size   = stride;
+                stride = (stride + 1) / 2;
+                if(size == 1)
+                    break;
+            }
+            if(thr_idx == 0)
+            {
+                lds_data[block_size1] += lds_data[0];
+            }
+            __syncthreads();
+            item_num -= block_size;
+        }
+        for(size_t i = thr_idx; i < n_dims; i += block_size)
+        {
+            data_idx[axis]    = i;
+            size_t index      = desc_data.linear(data_idx);
+            auto val          = input_ptr[index] - lds_data[block_size];
+            output_ptr[index] = ::exp(to_hip_type(val)) / lds_data[block_size1];
+        }
+    });
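Both kernels reduce a tile of up to `block_size` values with the same stride-halving tree reduction in LDS, where `stride = (size + 1) / 2` rounds up so odd-sized ranges still fold completely. A serial sketch of that reduction — the inner loop over `thr` stands in for the block's threads between `__syncthreads()` barriers; names are hypothetical:

```cpp
#include <algorithm>
#include <iostream>
#include <vector>

// Serial emulation of the kernels' in-LDS tree reduction. Each round folds
// the upper half of the live range onto the lower half; the loop over thr
// plays the role of the thread block between barriers.
float block_reduce_max_sketch(std::vector<float> lds)
{
    size_t size = lds.size();
    while(size > 1)
    {
        size_t stride = (size + 1) / 2; // round up to cover odd sizes
        for(size_t thr = 0; thr + stride < size; ++thr)
            lds[thr] = std::max(lds[thr], lds[thr + stride]);
        size = stride;
    }
    return lds[0]; // thread 0's slot holds the result
}

int main()
{
    std::cout << block_reduce_max_sketch({3.f, 9.f, 1.f, 7.f, 5.f}) << "\n"; // 9
}
```

The sum pass is identical with `+=` in place of `std::max`, which is why both kernels share the loop structure.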