Commit 0c74571f authored by Wil Kong, committed by mcarilli

Add softmax cross entropy loss with label smoothing support. (#295)

* Add softmax cross entropy loss with label smoothing support.

* Fix deprecation of AT_DISPATCH_XXX and several minor issues.

* Address issues raised by reviewers.

* Add FB license.

* Remove code generation constraints.

* Add a simple unittest for label smoothing.
parent 3e2883dd
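
For reference, the per-sample loss computed by the fused forward kernel (cunn_SoftMaxXEntropyForward below) can be sketched as follows, assuming the usual label-smoothing convention with smoothing factor ε, C classes, logits x, and target class y:

$$\ell \;=\; (1-\varepsilon)\Bigl(\log\sum_{j=1}^{C} e^{x_j} - x_y\Bigr) \;+\; \varepsilon\Bigl(\log\sum_{j=1}^{C} e^{x_j} - \frac{1}{C}\sum_{j=1}^{C} x_j\Bigr)$$

The forward pass also saves max_j x_j + log Σ_j exp(x_j − max_j x_j) per sample (max_log_sum_exp), so the backward kernel can reconstruct the softmax probabilities without a second reduction over the class dimension.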
#include <torch/extension.h>
// CUDA forward declarations
std::vector<at::Tensor> softmax_xentropy_cuda(
const at::Tensor &input,
const at::Tensor &labels,
const float smoothing,
const bool half_to_float);
at::Tensor softmax_xentropy_backward_cuda(
const at::Tensor &grad_loss,
const at::Tensor &logits,
const at::Tensor &max_log_sum_exp,
const at::Tensor &labels,
const float smoothing);
// C++ interface
#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
std::vector<at::Tensor> softmax_xentropy_forward(
const at::Tensor &input,
const at::Tensor &labels,
const float smoothing,
const bool half_to_float) {
CHECK_CUDA(input);
CHECK_INPUT(labels);
return softmax_xentropy_cuda(input, labels, smoothing, half_to_float);
}
at::Tensor softmax_xentropy_backward(
const at::Tensor &grad_loss,
const at::Tensor &logits,
const at::Tensor &max_log_sum_exp,
const at::Tensor &labels,
const float smoothing) {
CHECK_CUDA(grad_loss);
CHECK_CUDA(logits);
CHECK_INPUT(max_log_sum_exp);
CHECK_INPUT(labels);
return softmax_xentropy_backward_cuda(grad_loss, logits, max_log_sum_exp, labels, smoothing);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &softmax_xentropy_forward, "Softmax cross entropy loss with label smoothing forward (CUDA)");
m.def("backward", &softmax_xentropy_backward, "Softmax cross entropy loss with label smoothing backward (CUDA)");
}
/**
* From PyTorch:
*
* Copyright (c) 2016- Facebook, Inc (Adam Paszke)
* Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
* Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
* Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
* Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
* Copyright (c) 2011-2013 NYU (Clement Farabet)
* Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
* Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
* Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
*
* From Caffe2:
*
* Copyright (c) 2016-present, Facebook Inc. All rights reserved.
*
* All contributions by Facebook:
* Copyright (c) 2016 Facebook Inc.
*
* All contributions by Google:
* Copyright (c) 2015 Google Inc.
* All rights reserved.
*
* All contributions by Yangqing Jia:
* Copyright (c) 2015 Yangqing Jia
* All rights reserved.
*
* All contributions from Caffe:
* Copyright(c) 2013, 2014, 2015, the respective contributors
* All rights reserved.
*
* All other contributions:
* Copyright(c) 2015, 2016 the respective contributors
* All rights reserved.
*
* Caffe2 uses a copyright model similar to Caffe: each contributor holds
* copyright over their contributions to Caffe2. The project versioning records
* all such contribution and copyright details. If a contributor wants to further
* mark their specific copyright on a particular contribution, they should
* indicate their copyright solely in the commit message of the change when it is
* committed.
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
* and IDIAP Research Institute nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/NumericLimits.cuh>
#include <THC/THC.h>
#include <THC/THCGeneral.h>
#include <THC/THCThrustAllocator.cuh>
#include "type_shim.h"
using Tensor = at::Tensor;
using TensorList = at::TensorList;
using ScalarType = at::ScalarType;
using at::acc_type;
template<typename T, typename AccumT, typename OutT>
struct LogSoftMaxForwardEpilogue {
__device__ __forceinline__ LogSoftMaxForwardEpilogue(AccumT max_input, AccumT sum)
: logsum(max_input + std::log(sum)) {}
__device__ __forceinline__ LogSoftMaxForwardEpilogue(AccumT max_log_sum_exp)
: logsum(max_log_sum_exp) {}
__device__ __forceinline__ OutT operator()(T input) const {
return static_cast<OutT>(input - logsum);
}
const AccumT logsum;
};
template<typename T, typename AccumT, typename OutT>
struct LogSoftMaxBackwardEpilogue {
__device__ __forceinline__ LogSoftMaxBackwardEpilogue(AccumT sum)
: sum(sum) {}
__device__ __forceinline__ T operator()(OutT gradOutput, OutT output) const {
return static_cast<T>(gradOutput - std::exp(static_cast<AccumT>(output)) * sum);
}
const AccumT sum;
};
const int max_threads = 1024;
inline dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) {
uint64_t block_size = 1;
uint64_t max_block_size = std::min(dim_size / ILP, static_cast<uint64_t>(max_threads));
while (block_size < max_block_size) block_size *= 2;
// Launch at least a single warp - the kernel assumes that.
block_size = std::max(block_size, static_cast<uint64_t>(32));
return dim3(block_size);
}
template<typename T>
struct Add {
__device__ __forceinline__ T operator()(T a, T b) const {
return a + b;
}
};
template<typename T>
struct Max {
__device__ __forceinline__ T operator()(T a, T b) const {
return a < b ? b : a;
}
};
////////////////////////////////////////////////////////////////////////////////
// Regular kernel (fast when dim_size is large; requires inner_size == 1)
////////////////////////////////////////////////////////////////////////////////
template <typename T, typename AccumT>
struct MaxFloat
{
__device__ __forceinline__ AccumT operator()(AccumT max, T v) const {
return ::max(max, (AccumT)v);
}
};
template<typename T, typename AccumT>
struct AddFloat
{
__device__ __forceinline__ AccumT operator()(AccumT sum, T v) const {
return sum + v;
}
};
template<typename T, typename AccumT>
struct SumExpFloat
{
__device__ __forceinline__ SumExpFloat(AccumT v)
: max_k(v) {}
__device__ __forceinline__ AccumT operator()(AccumT sum, T v) const {
return sum + std::exp(v - max_k);
}
const AccumT max_k;
};
template <template<typename> class Reduction, typename AccumT>
__device__ __forceinline__ AccumT
blockReduce(AccumT* smem, AccumT val,
const Reduction<AccumT>& r,
AccumT defaultVal)
{
// To avoid RaW races from chaining blockReduce calls together, we need a sync here
__syncthreads();
smem[threadIdx.x] = val;
__syncthreads();
AccumT warpVal = defaultVal;
// First warp will perform per-warp reductions for the remaining warps
uint32_t mask = (((uint64_t)1) << (blockDim.x / 32)) - 1;
if (threadIdx.x < 32) {
int lane = threadIdx.x % 32;
if (lane < blockDim.x / 32) {
#pragma unroll
for (int i = 0; i < 32; ++i) {
warpVal = r(warpVal, smem[lane * 32 + i]);
}
__syncwarp(mask);
smem[lane] = warpVal;
}
}
__syncthreads();
// First thread will perform a reduction of the above per-warp reductions
AccumT blockVal = defaultVal;
if (threadIdx.x == 0) {
for (int i = 0; i < blockDim.x / 32; ++i) {
blockVal = r(blockVal, smem[i]);
}
smem[0] = blockVal;
}
// Sync and broadcast
__syncthreads();
return smem[0];
}
template <template<typename> class Reduction1, template<typename> class Reduction2, typename AccumT>
__device__ __forceinline__ void
blockReduce(AccumT* smem,
AccumT* reducVal1,
AccumT val1,
const Reduction1<AccumT>& r1,
AccumT defaultVal1,
AccumT* reducVal2,
AccumT val2,
const Reduction2<AccumT>& r2,
AccumT defaultVal2)
{
// To avoid RaW races from chaining blockReduce calls together, we need a sync here
__syncthreads();
smem[threadIdx.x] = val1;
smem[blockDim.x + threadIdx.x] = val2;
__syncthreads();
AccumT warpVal1 = defaultVal1;
AccumT warpVal2 = defaultVal2;
// First warp will perform per-warp reductions for the remaining warps
uint32_t mask = (((uint64_t)1) << (blockDim.x / 32)) - 1;
if (threadIdx.x < 32) {
int lane = threadIdx.x % 32;
if (lane < blockDim.x / 32) {
#pragma unroll
for (int i = 0; i < 32; ++i) {
warpVal1 = r1(warpVal1, smem[lane * 32 + i]);
warpVal2 = r2(warpVal2, smem[lane * 32 + i + blockDim.x]);
}
__syncwarp(mask);
smem[lane] = warpVal1;
smem[lane + blockDim.x] = warpVal2;
}
}
__syncthreads();
// First thread will perform a reduction of the above per-warp reductions
AccumT blockVal1 = defaultVal1;
AccumT blockVal2 = defaultVal2;
if (threadIdx.x == 0) {
for (int i = 0; i < blockDim.x / 32; ++i) {
blockVal1 = r1(blockVal1, smem[i]);
blockVal2 = r2(blockVal2, smem[i + blockDim.x]);
}
smem[0] = blockVal1;
smem[blockDim.x] = blockVal2;
}
// Sync and broadcast
__syncthreads();
*reducVal1 = smem[0];
*reducVal2 = smem[blockDim.x];
__syncthreads();
}
template <template<typename, typename> class Reduction, int ILP, typename T, typename AccumT>
__device__ __forceinline__ AccumT
ilpReduce(T* data,
int size,
const Reduction<T, AccumT>& r,
AccumT defaultVal)
{
AccumT threadVal = defaultVal;
int offset = threadIdx.x;
int last = size % (ILP * blockDim.x);
// Body (unroll by ILP times)
for (; offset < size - last; offset += blockDim.x * ILP) {
T tmp[ILP];
#pragma unroll
for (int j = 0; j < ILP; ++j)
tmp[j] = data[offset + j * blockDim.x];
#pragma unroll
for (int j = 0; j < ILP; ++j)
threadVal = r(threadVal, tmp[j]);
}
// Epilogue
for (; offset < size; offset += blockDim.x)
threadVal = r(threadVal, data[offset]);
return threadVal;
}
template <template<typename, typename> class Reduction1, template<typename, typename> class Reduction2, int ILP, typename T, typename AccumT>
__device__ __forceinline__ void
ilpReduce(T* data,
int size,
AccumT* reducVal1,
const Reduction1<T, AccumT>& r1,
AccumT defaultVal1,
AccumT* reducVal2,
const Reduction2<T, AccumT>& r2,
AccumT defaultVal2)
{
AccumT threadVal1 = defaultVal1;
AccumT threadVal2 = defaultVal2;
int offset = threadIdx.x;
int last = size % (ILP * blockDim.x);
// Body (unroll by ILP times)
for (; offset < size - last; offset += blockDim.x * ILP) {
T tmp[ILP];
#pragma unroll
for (int j = 0; j < ILP; ++j)
tmp[j] = data[offset + j * blockDim.x];
#pragma unroll
for (int j = 0; j < ILP; ++j) {
threadVal1 = r1(threadVal1, tmp[j]);
threadVal2 = r2(threadVal2, tmp[j]);
}
}
// Epilogue
for (; offset < size; offset += blockDim.x) {
threadVal1 = r1(threadVal1, data[offset]);
threadVal2 = r2(threadVal2, data[offset]);
}
*reducVal1 = threadVal1;
*reducVal2 = threadVal2;
}
template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t, template <typename, typename, typename> class Epilogue>
__global__ void
cunn_SoftMaxXEntropyForward(
accscalar_t *losses,
outscalar_t *max_log_sum_exp,
scalar_t *input,
int64_t *labels,
int64_t classes,
const float smoothing)
{
extern __shared__ unsigned char smem[];
auto sdata = reinterpret_cast<accscalar_t*>(smem);
// forward pointers to batch[blockIdx.x]
// each block handles a sample in the mini-batch
input += blockIdx.x * classes;
//output += blockIdx.x * classes;
int64_t label = labels[blockIdx.x];
// find the max and sum
accscalar_t threadMax, threadSum, max_k, sum_k;
ilpReduce<MaxFloat, AddFloat, ILP, scalar_t, accscalar_t>(
input, classes,
&threadMax, MaxFloat<scalar_t, accscalar_t>(),
-at::numeric_limits<accscalar_t>::max(),
&threadSum, AddFloat<scalar_t, accscalar_t>(),
static_cast<accscalar_t>(0));
blockReduce<Max, Add, accscalar_t>(
sdata,
&max_k, threadMax, Max<accscalar_t>(),
-at::numeric_limits<accscalar_t>::max(),
&sum_k, threadSum, Add<accscalar_t>(),
static_cast<accscalar_t>(0));
// reduce all values
accscalar_t threadExp = ilpReduce<SumExpFloat, ILP, scalar_t, accscalar_t>(
input, classes, SumExpFloat<scalar_t, accscalar_t>(max_k), static_cast<accscalar_t>(0));
accscalar_t sumAll = blockReduce<Add, accscalar_t>(
sdata, threadExp, Add<accscalar_t>(), static_cast<accscalar_t>(0));
Epilogue<scalar_t, accscalar_t, outscalar_t> epilogue(max_k, sumAll);
// calculate per element loss with label smoothing
// reserve max + log_sum_exp for bprop
if (threadIdx.x == 0) {
accscalar_t log_prob = epilogue(static_cast<accscalar_t>(input[label]));
losses[blockIdx.x] = (max_k + std::log(sumAll) - sum_k / classes) \
* smoothing - log_prob * (1 - smoothing);
max_log_sum_exp[blockIdx.x] = max_k + std::log(sumAll);
}
}
template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t, template<typename, typename, typename> class Epilogue>
__global__ void
cunn_SoftMaxXEntropyBackward(
scalar_t *gradInput,
scalar_t *logits,
outscalar_t *max_log_sum_exp,
outscalar_t *gradOutput,
int64_t *labels,
const float smoothing,
int classes)
{
gradInput += blockIdx.x * classes;
logits += blockIdx.x * classes;
accscalar_t smooth_positives = 1.0 - smoothing;
accscalar_t smooth_negatives = smoothing / classes;
accscalar_t tmpGradOutput = gradOutput[blockIdx.x];
int64_t label = labels[blockIdx.x];
accscalar_t coeff = max_log_sum_exp[blockIdx.x];
int offset = threadIdx.x;
int last = classes % (ILP * blockDim.x);
for (; offset < classes - last; offset += blockDim.x * ILP) {
accscalar_t tmpLogits[ILP];
#pragma unroll
for (int j = 0; j < ILP; ++j) {
tmpLogits[j] = static_cast<accscalar_t>(logits[offset + j * blockDim.x]);
}
#pragma unroll
for (int j = 0; j < ILP; ++j)
gradInput[offset + j * blockDim.x] = tmpGradOutput * (
std::exp(tmpLogits[j] - coeff) - static_cast<accscalar_t>(
(offset + j * blockDim.x == label) ? 1 : 0) *
smooth_positives - smooth_negatives);
}
for (; offset < classes; offset += blockDim.x)
gradInput[offset] = tmpGradOutput * (std::exp(
static_cast<accscalar_t>(logits[offset]) - coeff) -
static_cast<accscalar_t>((offset == label) ? 1 : 0) *
smooth_positives - smooth_negatives);
}
template<template<typename, typename, typename> class Epilogue>
std::vector<Tensor> host_softmax_xentropy(
const Tensor & input_,
const Tensor & labels_,
const float smoothing,
const bool half_to_float){
if (half_to_float) AT_ASSERTM(input_.type().scalarType() == ScalarType::Half,"conversion is supported for Half type only");
AT_ASSERTM(labels_.type().scalarType() == ScalarType::Long,"Label type should be CUDA Long");
auto input = input_.contiguous();
Tensor max_log_sum_exp = at::empty_like(labels_, half_to_float ? input.options().dtype(ScalarType::Float) : input.options());
Tensor losses = at::empty_like(labels_, input_.options().dtype(ScalarType::Float));
static_assert(std::is_same<acc_type<at::Half, true>, float>::value ||
std::is_same<acc_type<at::Half, true>, double>::value,
"accscalar_t for half should be float or double");
AT_ASSERTM(input.dim() == 2, "Currently only 2 dim input supported");
AT_ASSERTM(labels_.dim() == 1, "Labels should be 1 dimensional");
AT_ASSERTM(input.size(0) == labels_.size(0), "Input and label should have same number of examples");
AT_ASSERTM(input.numel() > 0, "Number of classes in input should not be 0");
const int64_t dim = 1;
int64_t outer_size = 1;
int64_t dim_size = input.size(dim);
int64_t inner_size = 1;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
for (int64_t i = 0; i < dim; ++i)
outer_size *= input.size(i);
for (int64_t i = dim + 1; i < input.dim(); ++i)
inner_size *= input.size(i);
// This kernel spawns a block per each element in the batch.
// XXX: it assumes that inner_size == 1
AT_CHECK(inner_size == 1, "Currently only inner size 1 supported");
const int ILP = 2;
dim3 grid(outer_size);
dim3 block = SoftMax_getBlockSize(ILP, dim_size);
using namespace at;
DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "host_softmax_xentropy",
using accscalar_t = at::acc_type<scalar_t_0, true>;
if (!half_to_float) {
cunn_SoftMaxXEntropyForward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
<<<grid, block, 2 * block.x * sizeof(accscalar_t), stream>>>(
losses.data<accscalar_t>(), max_log_sum_exp.data<scalar_t_0>(),
input.data<scalar_t_0>(), labels_.data<int64_t>(),
dim_size, smoothing
);
} else {
cunn_SoftMaxXEntropyForward<ILP, scalar_t_0, accscalar_t, accscalar_t, Epilogue>
<<<grid, block, 2 * block.x * sizeof(accscalar_t), stream>>>(
losses.data<accscalar_t>(), max_log_sum_exp.data<accscalar_t>(),
input.data<scalar_t_0>(), labels_.data<int64_t>(),
dim_size, smoothing
);
}
);
THCudaCheck(cudaGetLastError());
std::vector<at::Tensor> ret = {losses, max_log_sum_exp};
return ret;
}
template<template<typename, typename, typename> class Epilogue>
Tensor host_softmax_xentropy_backward(
const at::Tensor &grad_loss,
const at::Tensor &logits_,
const at::Tensor &max_log_sum_exp,
const at::Tensor &labels,
const float smoothing,
bool half_to_float) {
const int64_t dim = 1;
Tensor gI = at::empty_like(logits_);
if (grad_loss.numel() == 0) {
return gI;
}
auto grad = grad_loss.contiguous();
auto logits = logits_.contiguous();
static_assert(std::is_same<acc_type<at::Half, true>, float>::value ||
std::is_same<acc_type<at::Half, true>, double>::value,
"accscalar_t for half should be float or double");
if (grad.dim() == 0) grad = grad.view(1);
AT_ASSERTM(logits_.dim() == 2, "Currently only 2 dim input supported");
AT_ASSERTM(labels.dim() == 1, "Labels should be 1 dimensional");
AT_ASSERTM(logits_.numel() > 0, "Number of classes in input should not be 0");
AT_ASSERTM(logits_.size(0) == labels.size(0), "Input and label should have same number of examples");
AT_ASSERTM(labels.size(0) == grad.size(0), "Label and loss should have same number of examples");
int64_t outer_size = 1;
int64_t dim_size = logits.size(dim);
int64_t inner_size = 1;
for (int64_t i = 0; i < dim; ++i)
outer_size *= logits.size(i);
for (int64_t i = dim + 1; i < logits.dim(); ++i)
inner_size *= logits.size(i);
// See descriptions of kernels above.
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_CHECK(inner_size == 1, "Currently only inner size 1 supported");
const int ILP = 2;
dim3 grid(outer_size);
dim3 block = SoftMax_getBlockSize(ILP, dim_size);
DISPATCH_FLOAT_AND_HALF(gI.scalar_type(), 0, "host_softmax_xentropy_backward",
using accscalar_t = acc_type<scalar_t_0, true>;
if (!half_to_float) {
cunn_SoftMaxXEntropyBackward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
<<<grid, block, block.x * sizeof(accscalar_t), stream>>>(
gI.data<scalar_t_0>(), logits.data<scalar_t_0>(),
max_log_sum_exp.data<scalar_t_0>(),
grad.data<scalar_t_0>(), labels.data<int64_t>(),
smoothing, dim_size
);
} else {
cunn_SoftMaxXEntropyBackward<ILP, scalar_t_0, accscalar_t, accscalar_t, Epilogue>
<<<grid, block, block.x * sizeof(accscalar_t), stream>>>(
gI.data<scalar_t_0>(), logits.data<scalar_t_0>(),
max_log_sum_exp.data<accscalar_t>(),
grad.data<accscalar_t>(), labels.data<int64_t>(),
smoothing, dim_size
);
}
);
THCudaCheck(cudaGetLastError());
return gI;
}
std::vector<Tensor> softmax_xentropy_cuda(const Tensor &input, const Tensor &labels, const float smoothing, const bool half_to_float){
return host_softmax_xentropy<LogSoftMaxForwardEpilogue>(input, labels, smoothing, half_to_float);
}
at::Tensor softmax_xentropy_backward_cuda(
const at::Tensor &grad_loss,
const at::Tensor &logits,
const at::Tensor &max_log_sum_exp,
const at::Tensor &labels,
const float smoothing) {
bool half_to_float = grad_loss.type().scalarType() != logits.type().scalarType();
if (half_to_float) {
AT_ASSERTM((grad_loss.type().scalarType() == ScalarType::Float && logits.type().scalarType() == ScalarType::Half), "expected input and grad types to match, or input to be at::Half and grad to be at::Float");
}
return host_softmax_xentropy_backward<LogSoftMaxBackwardEpilogue>(grad_loss, logits, max_log_sum_exp, labels, smoothing, half_to_float);
}
import torch
from apex.contrib import xentropy as label_smoothing
import unittest
import warnings
import random
import numpy as np
import time

def label_smoothing_raw(x, target, padding_idx, smoothing):
    logprobs = torch.nn.functional.log_softmax(x, dim=-1, dtype=torch.float32)

    non_pad_mask = (target != padding_idx)
    nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
    nll_loss = nll_loss.squeeze(1)[non_pad_mask]
    smooth_loss = -logprobs.mean(dim=-1)[non_pad_mask]
    loss = (1.0 - smoothing) * nll_loss + smoothing * smooth_loss
    return loss

def label_smoothing_opt_1(x, target, padding_idx, smoothing):
    logprobs = torch.nn.functional.log_softmax(x, dim=-1, dtype=torch.float32)

    pad_mask = (target == padding_idx)
    ll_loss = logprobs.gather(dim=-1, index=target.unsqueeze(1)).squeeze(1)
    smooth_loss = logprobs.mean(dim=-1)
    loss = (smoothing - 1.0) * ll_loss - smoothing * smooth_loss
    loss.masked_fill_(pad_mask, 0)
    return loss

class LabelSmoothingTest(unittest.TestCase):
    def setUp(self, seed=1234):
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        # Set pytorch print precision
        torch.set_printoptions(precision=10)

    def gen_test_inputs(self, N, T, H, smoothing, padding_idx):
        logits = torch.randn((N*T, H), dtype=torch.half, device='cuda',
                             requires_grad=True)
        labels = torch.randint(0, H, [N*T], device='cuda')
        for i in random.sample(range(N*T), N*T//6):
            labels[i] = padding_idx
        half_to_float = (logits.dtype == torch.half)
        return logits, labels, half_to_float

    def print_max_diff_elem(self, ref, tst):
        ref, tst = ref.flatten(), tst.flatten()
        diff = (ref - tst).abs().max()
        idx = (ref - tst).abs().argmax()
        print("Max atol idx: {}, diff: {:.6f}, ref: {:.6f}, tst: {:.6f}".format(
            idx, diff, ref[idx], tst[idx]))

    def test_label_smoothing_function(self):
        # Set label smoothing configuration
        smoothing, padding_idx = 0.1, 0
        N, T, H = 128, 74, 32320
        iters = 10
        loss_func = label_smoothing.SoftmaxCrossEntropyLoss.apply

        for i in range(iters):
            logits, labels, half_to_float = self.gen_test_inputs(
                N, T, H, smoothing, padding_idx)

            # Run original softmax cross entropy with label smoothing
            logits.grad = None
            losses = label_smoothing_raw(logits, labels, padding_idx, smoothing)
            loss = losses.sum()
            loss.backward()

            ref_loss = loss.clone().detach()
            ref_grad = logits.grad.clone().detach()

            # Run optimized softmax cross entropy with label smoothing
            logits.grad = None
            losses = loss_func(logits, labels, smoothing, padding_idx, half_to_float)
            loss = losses.sum()
            loss.backward()

            val_loss = loss.clone().detach()
            val_grad = logits.grad.clone().detach()

            # Validate
            self.print_max_diff_elem(ref_grad, val_grad)
            self.assertTrue(torch.allclose(ref_loss, val_loss, atol=1e-5, rtol=1e-5))
            self.assertTrue(torch.allclose(ref_grad, val_grad, atol=1e-5, rtol=1e-5))

    def test_label_smoothing_perf(self):
        # Set label smoothing configuration
        smoothing, padding_idx = 0.1, 0
        N, T, H = 128, 74, 32320
        iters = 1000
        loss_func = label_smoothing.SoftmaxCrossEntropyLoss.apply
        print()

        logits, labels, half_to_float = self.gen_test_inputs(
            N, T, H, smoothing, padding_idx)

        # Run original softmax cross entropy with label smoothing
        torch.cuda.synchronize()
        ts = time.time()
        for i in range(iters):
            logits.grad = None
            losses = label_smoothing_raw(logits, labels, padding_idx, smoothing)
            loss = losses.sum() / N
            loss.backward()
        torch.cuda.synchronize()
        print("Raw time {:.2f} s elapsed for {} iterations, norm {:.4f}".format(
            time.time() - ts, iters, logits.grad.norm()))

        # Run optimized softmax cross entropy with label smoothing
        torch.cuda.synchronize()
        ts = time.time()
        for i in range(iters):
            logits.grad = None
            losses = loss_func(logits, labels, smoothing, padding_idx, half_to_float)
            loss = losses.sum() / N
            loss.backward()
        torch.cuda.synchronize()
        print("Opt time {:.2f} s elapsed for {} iterations, norm {:.4f}".format(
            time.time() - ts, iters, logits.grad.norm()))

if __name__ == '__main__':
    unittest.main()

try:
    import torch
    import xentropy_cuda
    from .softmax_xentropy import SoftmaxCrossEntropyLoss
    del torch
    del xentropy_cuda
    del softmax_xentropy
except ImportError as err:
    print("apex was installed without --xentropy flag, contrib.xentropy is not available")

import torch
import xentropy_cuda

class SoftmaxCrossEntropyLoss(torch.autograd.Function):
    @staticmethod
    def forward(ctx, logits, labels, smoothing=0.0, padding_idx=0, half_to_float=False):
        losses, max_log_sum_exp = xentropy_cuda.forward(
            logits, labels, smoothing, half_to_float)
        losses.masked_fill_(labels==padding_idx, 0)

        ctx.save_for_backward(logits, max_log_sum_exp, labels,
                              torch.FloatTensor([smoothing]),
                              torch.LongTensor([padding_idx]))

        return losses

    @staticmethod
    def backward(ctx, grad_loss):
        logits, max_log_sum_exp, labels, smoothing, padding_idx = ctx.saved_tensors

        if not grad_loss.is_contiguous():
            grad_loss = grad_loss.contiguous()
        grad_loss.masked_fill_(labels==padding_idx.item(), 0)
        grad_logits = xentropy_cuda.backward(
            grad_loss.contiguous(), logits, max_log_sum_exp,
            labels, smoothing.item())

        return grad_logits, None, None, None, None
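
A minimal usage sketch of the autograd wrapper above; the tensor sizes, smoothing value, and padding index below are illustrative assumptions (the unit test in this commit exercises the same call pattern at much larger sizes):

import torch
from apex.contrib.xentropy import SoftmaxCrossEntropyLoss

# Illustrative sizes (assumed): 8 tokens over a 16-class vocabulary, fp16 logits.
logits = torch.randn(8, 16, dtype=torch.half, device='cuda', requires_grad=True)
labels = torch.randint(0, 16, (8,), device='cuda')

# Arguments: logits, labels, smoothing, padding_idx, half_to_float.
# half_to_float=True keeps the per-token losses in fp32 even though logits are fp16.
losses = SoftmaxCrossEntropyLoss.apply(logits, labels, 0.1, 0, True)
losses.sum().backward()  # gradients w.r.t. logits come from xentropy_cuda.backward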

--- a/setup.py
+++ b/setup.py
@@ -107,7 +107,7 @@ if "--bnp" in sys.argv:
     cmdclass['build_ext'] = BuildExtension
 
     if torch.utils.cpp_extension.CUDA_HOME is None:
-        raise RuntimeError("--cuda_ext was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
+        raise RuntimeError("--bnp was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
     else:
         # Set up macros for forward/backward compatibility hack around
         # https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
@@ -128,6 +128,29 @@ if "--bnp" in sys.argv:
                                               '-gencode',
                                               'arch=compute_70,code=sm_70'] + version_ge_1_1}))
 
+if "--xentropy" in sys.argv:
+    from torch.utils.cpp_extension import CUDAExtension
+    sys.argv.remove("--xentropy")
+
+    from torch.utils.cpp_extension import BuildExtension
+    cmdclass['build_ext'] = BuildExtension
+
+    if torch.utils.cpp_extension.CUDA_HOME is None:
+        raise RuntimeError("--xentropy was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
+    else:
+        # Set up macros for forward/backward compatibility hack around
+        # https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
+        version_ge_1_1 = []
+        if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
+            version_ge_1_1 = ['-DVERSION_GE_1_1']
+        ext_modules.append(
+            CUDAExtension(name='xentropy_cuda',
+                          sources=['apex/contrib/csrc/xentropy/interface.cpp',
+                                   'apex/contrib/csrc/xentropy/xentropy_kernel.cu'],
+                          include_dirs=['csrc'],
+                          extra_compile_args={'cxx': ['-O3'] + version_ge_1_1,
+                                              'nvcc':['-O3'] + version_ge_1_1}))
+
 setup(
     name='apex',
     version='0.1',
...