Unverified Commit cf50dc7c authored by Deyu Fu's avatar Deyu Fu Committed by GitHub
Browse files

Changes to make xentropysoftmax load/store vectorized when possible: (#725)

* Changes to make xentropysoftmax load/store vectorized when possible:
Increase the default ILP so that each thread handles 16 bytes of data in one step.
Make each thread load/store the longest vector possible.
Make the unroll case handle adjacent data instead of strided data, so the element order matches the vectorized case.

* Add a shift for the misaligned case. Remove aligned accesses of less than 16 bytes.
parent 17ee854e
...@@ -70,7 +70,6 @@ ...@@ -70,7 +70,6 @@
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE. * POSSIBILITY OF SUCH DAMAGE.
*/ */
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
...@@ -84,6 +83,8 @@ ...@@ -84,6 +83,8 @@
#include "type_shim.h" #include "type_shim.h"
#include "compat.h" #include "compat.h"
#define ALIGN_BYTES 16
using Tensor = at::Tensor; using Tensor = at::Tensor;
using TensorList = at::TensorList; using TensorList = at::TensorList;
using ScalarType = at::ScalarType; using ScalarType = at::ScalarType;
...@@ -123,7 +124,7 @@ const int max_threads = 1024; ...@@ -123,7 +124,7 @@ const int max_threads = 1024;
inline dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) { inline dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) {
uint64_t block_size = 1; uint64_t block_size = 1;
uint64_t max_block_size = std::min(dim_size / ILP, static_cast<uint64_t>(max_threads)); uint64_t max_block_size = std::min(dim_size / ILP, static_cast<uint64_t>(max_threads));
while (block_size < max_block_size) block_size *= 2; while (block_size < (max_block_size/2)) block_size *= 2;
// Launch at least a single warp - the kernel assumes that. // Launch at least a single warp - the kernel assumes that.
block_size = std::max(block_size, static_cast<uint64_t>(32)); block_size = std::max(block_size, static_cast<uint64_t>(32));
return dim3(block_size); return dim3(block_size);
...@@ -287,29 +288,40 @@ blockReduce(AccumT* smem, ...@@ -287,29 +288,40 @@ blockReduce(AccumT* smem,
template <template<typename, typename> class Reduction, int ILP, typename T, typename AccumT> template <template<typename, typename> class Reduction, int ILP, typename T, typename AccumT>
__device__ __forceinline__ AccumT __device__ __forceinline__ AccumT
ilpReduce(T* data, ilpReduce(int shift,
T* data,
int size, int size,
const Reduction<T, AccumT>& r, const Reduction<T, AccumT>& r,
AccumT defaultVal) AccumT defaultVal)
{ {
typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LoadT;
AccumT threadVal = defaultVal; AccumT threadVal = defaultVal;
int offset = threadIdx.x; int offset = threadIdx.x;
// shift and do 1
if(shift > 0){
data -= shift;
size += shift;
if(threadIdx.x >= shift){
threadVal = r(threadVal, data[offset]);
}
size -= blockDim.x;
data += blockDim.x;
}
int last = size % (ILP * blockDim.x); int last = size % (ILP * blockDim.x);
// Body (unroll by ILP times) T v[ILP];
for (; offset < size - last; offset += blockDim.x * ILP) { LoadT* value = reinterpret_cast<LoadT*>(&v);
T tmp[ILP];
#pragma unroll for (; offset * ILP < (size - last); offset += blockDim.x) {
for (int j = 0; j < ILP; ++j) *value = reinterpret_cast<LoadT*>(data)[offset];
tmp[j] = data[offset + j * blockDim.x];
#pragma unroll for (int j = 0; j < ILP; ++j) {
for (int j = 0; j < ILP; ++j) threadVal = r(threadVal, v[j]);
threadVal = r(threadVal, tmp[j]); }
} }
offset = size - last + threadIdx.x;
// Epilogue // Epilogue
for (; offset < size; offset += blockDim.x) for (; offset < size; offset += blockDim.x)
threadVal = r(threadVal, data[offset]); threadVal = r(threadVal, data[offset]);
...@@ -319,7 +331,8 @@ ilpReduce(T* data, ...@@ -319,7 +331,8 @@ ilpReduce(T* data,
template <template<typename, typename> class Reduction1, template<typename, typename> class Reduction2, int ILP, typename T, typename AccumT> template <template<typename, typename> class Reduction1, template<typename, typename> class Reduction2, int ILP, typename T, typename AccumT>
__device__ __forceinline__ void __device__ __forceinline__ void
ilpReduce(T* data, ilpReduce(int shift,
T* data,
int size, int size,
AccumT* reducVal1, AccumT* reducVal1,
const Reduction1<T, AccumT>& r1, const Reduction1<T, AccumT>& r1,
...@@ -328,27 +341,38 @@ ilpReduce(T* data, ...@@ -328,27 +341,38 @@ ilpReduce(T* data,
const Reduction2<T, AccumT>& r2, const Reduction2<T, AccumT>& r2,
AccumT defaultVal2) AccumT defaultVal2)
{ {
typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LoadT;
AccumT threadVal1 = defaultVal1; AccumT threadVal1 = defaultVal1;
AccumT threadVal2 = defaultVal2; AccumT threadVal2 = defaultVal2;
int offset = threadIdx.x; int offset = threadIdx.x;
// shift and do 1
if(shift > 0){
data -= shift;
size += shift;
if(threadIdx.x >= shift){
threadVal1 = r1(threadVal1, data[offset]);
threadVal2 = r2(threadVal2, data[offset]);
}
size -= blockDim.x;
data += blockDim.x;
}
int last = size % (ILP * blockDim.x); int last = size % (ILP * blockDim.x);
// Body (unroll by ILP times) T v[ILP];
for (; offset < size - last; offset += blockDim.x * ILP) { LoadT* value = reinterpret_cast<LoadT*>(&v);
T tmp[ILP];
#pragma unroll for (; offset * ILP < (size - last); offset += blockDim.x) {
for (int j = 0; j < ILP; ++j) *value = reinterpret_cast<LoadT*>(data)[offset];
tmp[j] = data[offset + j * blockDim.x];
#pragma unroll
for (int j = 0; j < ILP; ++j) { for (int j = 0; j < ILP; ++j) {
threadVal1 = r1(threadVal1, tmp[j]); threadVal1 = r1(threadVal1, v[j]);
threadVal2 = r2(threadVal2, tmp[j]); threadVal2 = r2(threadVal2, v[j]);
} }
} }
offset = size - last + threadIdx.x;
// Epilogue // Epilogue
for (; offset < size; offset += blockDim.x) { for (; offset < size; offset += blockDim.x) {
threadVal1 = r1(threadVal1, data[offset]); threadVal1 = r1(threadVal1, data[offset]);
...@@ -375,17 +399,19 @@ cunn_SoftMaxXEntropyForward( ...@@ -375,17 +399,19 @@ cunn_SoftMaxXEntropyForward(
// each block handles a sample in the mini-batch // each block handles a sample in the mini-batch
input += blockIdx.x * classes; input += blockIdx.x * classes;
//output += blockIdx.x * classes; //output += blockIdx.x * classes;
const int shift = ((uint64_t)input) % ALIGN_BYTES / sizeof(scalar_t);
int64_t label = labels[blockIdx.x]; int64_t label = labels[blockIdx.x];
// find the max and sum // find the max and sum
accscalar_t threadMax, threadSum, max_k, sum_k; accscalar_t threadMax, threadSum, max_k, sum_k;
ilpReduce<MaxFloat, AddFloat, ILP, scalar_t, accscalar_t>( ilpReduce<MaxFloat, AddFloat, ILP, scalar_t, accscalar_t>(
input, classes, shift, input, classes,
&threadMax, MaxFloat<scalar_t, accscalar_t>(), &threadMax, MaxFloat<scalar_t, accscalar_t>(),
-at::numeric_limits<accscalar_t>::max(), -at::numeric_limits<accscalar_t>::max(),
&threadSum, AddFloat<scalar_t, accscalar_t>(), &threadSum, AddFloat<scalar_t, accscalar_t>(),
static_cast<accscalar_t>(0)); static_cast<accscalar_t>(0));
blockReduce<Max, Add, accscalar_t>( blockReduce<Max, Add, accscalar_t>(
sdata, sdata,
&max_k, threadMax, Max<accscalar_t>(), &max_k, threadMax, Max<accscalar_t>(),
...@@ -393,9 +419,7 @@ cunn_SoftMaxXEntropyForward( ...@@ -393,9 +419,7 @@ cunn_SoftMaxXEntropyForward(
&sum_k, threadSum, Add<accscalar_t>(), &sum_k, threadSum, Add<accscalar_t>(),
static_cast<accscalar_t>(0)); static_cast<accscalar_t>(0));
// reduce all values accscalar_t threadExp = ilpReduce<SumExpFloat, ILP, scalar_t, accscalar_t>(shift, input, classes, SumExpFloat<scalar_t, accscalar_t>(max_k), static_cast<accscalar_t>(0));
accscalar_t threadExp = ilpReduce<SumExpFloat, ILP, scalar_t, accscalar_t>(
input, classes, SumExpFloat<scalar_t, accscalar_t>(max_k), static_cast<accscalar_t>(0));
accscalar_t sumAll = blockReduce<Add, accscalar_t>( accscalar_t sumAll = blockReduce<Add, accscalar_t>(
sdata, threadExp, Add<accscalar_t>(), static_cast<accscalar_t>(0)); sdata, threadExp, Add<accscalar_t>(), static_cast<accscalar_t>(0));
...@@ -411,10 +435,9 @@ cunn_SoftMaxXEntropyForward( ...@@ -411,10 +435,9 @@ cunn_SoftMaxXEntropyForward(
} }
} }
template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t, template<typename, typename, typename> class Epilogue> template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t>
__global__ void __device__ __forceinline__ void
cunn_SoftMaxXEntropyBackward( apply(scalar_t *gradInput,
scalar_t *gradInput,
scalar_t *logits, scalar_t *logits,
outscalar_t *max_log_sum_exp, outscalar_t *max_log_sum_exp,
outscalar_t *gradOutput, outscalar_t *gradOutput,
...@@ -422,9 +445,6 @@ cunn_SoftMaxXEntropyBackward( ...@@ -422,9 +445,6 @@ cunn_SoftMaxXEntropyBackward(
const float smoothing, const float smoothing,
int classes) int classes)
{ {
gradInput += blockIdx.x * classes;
logits += blockIdx.x * classes;
accscalar_t smooth_positives = 1.0 - smoothing; accscalar_t smooth_positives = 1.0 - smoothing;
accscalar_t smooth_negatives = smoothing / classes; accscalar_t smooth_negatives = smoothing / classes;
accscalar_t tmpGradOutput = gradOutput[blockIdx.x]; accscalar_t tmpGradOutput = gradOutput[blockIdx.x];
...@@ -433,6 +453,7 @@ cunn_SoftMaxXEntropyBackward( ...@@ -433,6 +453,7 @@ cunn_SoftMaxXEntropyBackward(
int offset = threadIdx.x; int offset = threadIdx.x;
int last = classes % (ILP * blockDim.x); int last = classes % (ILP * blockDim.x);
for (; offset < classes - last; offset += blockDim.x * ILP) { for (; offset < classes - last; offset += blockDim.x * ILP) {
accscalar_t tmpLogits[ILP]; accscalar_t tmpLogits[ILP];
...@@ -457,9 +478,99 @@ cunn_SoftMaxXEntropyBackward( ...@@ -457,9 +478,99 @@ cunn_SoftMaxXEntropyBackward(
} }
template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t>
__device__ __forceinline__ void
aligned_apply(int shift,
    scalar_t *gradInput,
    scalar_t *logits,
    outscalar_t *max_log_sum_exp,
    outscalar_t *gradOutput,
    int64_t *labels,
    const float smoothing,
    int classes)
{
  // Backward pass of label-smoothed softmax cross-entropy over one row of
  // `classes` elements, using ILP-wide vectorized loads/stores.
  //
  // shift: misalignment of the row base, in elements, from a vector-width
  //        boundary. Precondition (checked by the caller): logits and
  //        gradInput must have the SAME shift, otherwise the vector accesses
  //        below would be misaligned for one of the two pointers.
  // gradInput/logits: presumably already offset to this block's row by the
  //        caller (only gradOutput/labels/max_log_sum_exp are indexed by
  //        blockIdx.x here) -- confirm against the launching kernel.
  accscalar_t smooth_positives = 1.0 - smoothing;
  accscalar_t smooth_negatives = smoothing / classes;
  accscalar_t tmpGradOutput = gradOutput[blockIdx.x];
  int64_t label = labels[blockIdx.x];
  accscalar_t coeff = max_log_sum_exp[blockIdx.x];

  int offset = threadIdx.x;

  // Misaligned head, handled scalar-wise: rewind both pointers to the
  // previous aligned boundary, let threads [shift, blockDim.x) cover the
  // first (blockDim.x - shift) real elements, then advance one full block.
  if(shift > 0){
    logits -= shift;
    gradInput -= shift;
    classes += shift;
    if(threadIdx.x >= shift){
      // (offset - shift) maps the rewound index back to the original element
      // index for the label comparison.
      gradInput[offset] = tmpGradOutput * (std::exp(
        static_cast<accscalar_t>(logits[offset]) - coeff) -
        static_cast<accscalar_t>(((offset - shift) == label) ? 1 : 0) *
        smooth_positives - smooth_negatives);
    }
    // NOTE(review): no upper-bound guard on `offset` above -- this assumes
    // the row has at least (blockDim.x - shift) elements; confirm the
    // block-size selection guarantees classes >= blockDim.x.
    classes -= blockDim.x;
    gradInput += blockDim.x;
    logits += blockDim.x;
    // shift goes negative here; the (... - shift) index corrections below
    // then add back the (blockDim.x - original shift) elements consumed
    // by this head section.
    shift -= blockDim.x;
  }

  // Tail elements that do not fill a whole ILP*blockDim.x tile.
  int last = classes % (ILP * blockDim.x);

  // Aligned staging type so a whole group of ILP adjacent elements moves
  // with a single vector load/store.
  typedef typename std::aligned_storage<ILP*sizeof(scalar_t), ILP*alignof(scalar_t)>::type LoadT;
  // input staging buffer
  scalar_t v[ILP];
  LoadT* value = reinterpret_cast<LoadT*>(&v);
  // output staging buffer
  scalar_t r[ILP];
  LoadT* result = reinterpret_cast<LoadT*>(&r);

  // Main body: each thread consumes ILP adjacent elements per iteration via
  // one vector load and produces them via one vector store.
  for (; offset * ILP < (classes - last); offset += blockDim.x) {
    *value = reinterpret_cast<LoadT*>(logits)[offset];

    #pragma unroll
    for (int j = 0; j < ILP; ++j) {
      // grad = dL/dy * (exp(logit - coeff) - target); exp(logit - coeff) is
      // the predicted probability given coeff = max + log(sum(exp)), and the
      // smoothed target is smooth_positives at `label`, smooth_negatives
      // elsewhere. (ILP * offset + j - shift) recovers the original element
      // index (shift is <= 0 at this point).
      r[j] = tmpGradOutput * (std::exp(
        static_cast<accscalar_t>(v[j]) - coeff) -
        static_cast<accscalar_t>(((ILP * offset + j - shift) == label) ? 1 : 0) *
        smooth_positives - smooth_negatives);
    }
    reinterpret_cast<LoadT*>(gradInput)[offset] = *result;
  }

  // Epilogue: remaining tail elements, handled scalar-wise.
  offset = classes - last + threadIdx.x;
  for (; offset < classes; offset += blockDim.x)
    gradInput[offset] = tmpGradOutput * (std::exp(
      static_cast<accscalar_t>(logits[offset]) - coeff) -
      static_cast<accscalar_t>(((offset - shift) == label) ? 1 : 0) *
      smooth_positives - smooth_negatives);
}
template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t, template<typename, typename, typename> class Epilogue>
__global__ void
cunn_SoftMaxXEntropyBackward(
    scalar_t *gradInput,
    scalar_t *logits,
    outscalar_t *max_log_sum_exp,
    outscalar_t *gradOutput,
    int64_t *labels,
    const float smoothing,
    int classes)
{
  // Each block handles one sample: advance both row pointers to this
  // block's row of `classes` elements.
  gradInput += blockIdx.x * classes;
  logits += blockIdx.x * classes;

  // Element-wise misalignment of each row base from an ALIGN_BYTES boundary.
  // Vectorized load/store is only safe when input and output rows share the
  // same misalignment; otherwise fall back to the strided scalar path.
  const int logits_shift = ((uint64_t)logits) % ALIGN_BYTES / sizeof(scalar_t);
  const int grad_shift = ((uint64_t)gradInput) % ALIGN_BYTES / sizeof(scalar_t);
  if (logits_shift != grad_shift) {
    apply<ILP, scalar_t, accscalar_t, outscalar_t>(
        gradInput, logits, max_log_sum_exp, gradOutput, labels, smoothing, classes);
  }
  else {
    aligned_apply<ILP, scalar_t, accscalar_t, outscalar_t>(
        logits_shift, gradInput, logits, max_log_sum_exp, gradOutput, labels, smoothing, classes);
  }
}
template<template<typename, typename, typename> class Epilogue> template<template<typename, typename, typename> class Epilogue>
std::vector<Tensor> host_softmax_xentropy( std::vector<Tensor> host_softmax_xentropy(
...@@ -495,13 +606,13 @@ std::vector<Tensor> host_softmax_xentropy( ...@@ -495,13 +606,13 @@ std::vector<Tensor> host_softmax_xentropy(
// XXX: it assumes that inner_size == 1 // XXX: it assumes that inner_size == 1
TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported"); TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");
const int ILP = 2;
dim3 grid(outer_size); dim3 grid(outer_size);
dim3 block = SoftMax_getBlockSize(ILP, dim_size);
using namespace at; using namespace at;
DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "host_softmax_xentropy", DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "host_softmax_xentropy",
using accscalar_t = at::acc_type<scalar_t_0, true>; using accscalar_t = at::acc_type<scalar_t_0, true>;
const int ILP = sizeof(float4)/sizeof(scalar_t_0);
dim3 block = SoftMax_getBlockSize(ILP, dim_size);
if (!half_to_float) { if (!half_to_float) {
cunn_SoftMaxXEntropyForward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue> cunn_SoftMaxXEntropyForward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
<<<grid, block, 2 * block.x * sizeof(accscalar_t), stream>>>( <<<grid, block, 2 * block.x * sizeof(accscalar_t), stream>>>(
...@@ -564,12 +675,12 @@ Tensor host_softmax_xentropy_backward( ...@@ -564,12 +675,12 @@ Tensor host_softmax_xentropy_backward(
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported"); TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");
const int ILP = 2;
dim3 grid(outer_size); dim3 grid(outer_size);
dim3 block = SoftMax_getBlockSize(ILP, dim_size);
DISPATCH_FLOAT_AND_HALF(gI.scalar_type(), 0, "host_softmax_xentropy_backward", DISPATCH_FLOAT_AND_HALF(gI.scalar_type(), 0, "host_softmax_xentropy_backward",
using accscalar_t = acc_type<scalar_t_0, true>; using accscalar_t = acc_type<scalar_t_0, true>;
const int ILP = sizeof(float4)/sizeof(scalar_t_0);
dim3 block = SoftMax_getBlockSize(ILP, dim_size);
if (!half_to_float) { if (!half_to_float) {
cunn_SoftMaxXEntropyBackward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue> cunn_SoftMaxXEntropyBackward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
<<<grid, block, block.x * sizeof(accscalar_t), stream>>>( <<<grid, block, block.x * sizeof(accscalar_t), stream>>>(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment