Commit 0bfb8300 authored by Thor Johnsen

Merge

parents 2619f1cb cf50dc7c
@@ -146,7 +146,7 @@ def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs
     from .amp import init as amp_init
     optimizers_was_list = False
-    if isinstance(optimizers, torch.optim.Optimizer) or ('LARC' in sys.modules and isinstance(optimizers, LARC)):
+    if isinstance(optimizers, torch.optim.Optimizer) or ('LARC' in globals() and isinstance(optimizers, LARC)):
         optimizers = [optimizers]
     elif optimizers is None:
         optimizers = []
...
@@ -87,7 +87,7 @@ def scale_loss(loss,
         yield loss
         return
-    if isinstance(optimizers, torch.optim.Optimizer) or ('LARC' in sys.modules and isinstance(optimizers, LARC)):
+    if isinstance(optimizers, torch.optim.Optimizer) or ('LARC' in globals() and isinstance(optimizers, LARC)):
         optimizers = [optimizers]
     loss_scaler = _amp_state.loss_scalers[loss_id]
...
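Note: `'LARC' in sys.modules` checks whether a top-level module literally named LARC has been imported, which is not what either call site needs; `'LARC' in globals()` checks whether the LARC name is bound in the current module. A standalone illustration of the difference (not apex code; the binding below is hypothetical):

import sys

print('LARC' in sys.modules)   # False unless some module named exactly "LARC" was imported
LARC = object                  # e.g. bound by "from apex.parallel.LARC import LARC"
print('LARC' in globals())     # True: this is the condition the guarded isinstance() check needs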
 #include <ATen/ATen.h>
+#ifdef OLD_GENERATOR
 #include <ATen/CUDAGenerator.h>
+#else
+#include <ATen/CUDAGeneratorImpl.h>
+#endif
 #include <ATen/cuda/CUDAContext.h>
 #include <curand_kernel.h>
@@ -206,8 +212,13 @@ void apex_fused_dropout_cuda(scalar_t const *inputs,
   std::pair<uint64_t, uint64_t> rng_engine_inputs;
   {
     // See Note [Acquire lock when using random generators]
+#ifdef OLD_GENERATOR
     std::lock_guard<std::mutex> lock(gen->mutex_);
     rng_engine_inputs = gen->philox_engine_inputs(counter_offset);
+#else
+    std::lock_guard<std::mutex> lock(gen.mutex());
+    rng_engine_inputs = at::check_generator<at::CUDAGeneratorImpl>(gen)->philox_engine_inputs(counter_offset);
+#endif
   }
   apex_fused_dropout_kernel<scalar_t, accscalar_t, IndexType><<<grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(inputs, outputs, mask, totalElements, p, rng_engine_inputs);
@@ -239,8 +250,13 @@ void apex_dropout_add_cuda(scalar_t const *inputs,
   std::pair<uint64_t, uint64_t> rng_engine_inputs;
   {
     // See Note [Acquire lock when using random generators]
+#ifdef OLD_GENERATOR
     std::lock_guard<std::mutex> lock(gen->mutex_);
     rng_engine_inputs = gen->philox_engine_inputs(counter_offset);
+#else
+    std::lock_guard<std::mutex> lock(gen.mutex());
+    rng_engine_inputs = at::check_generator<at::CUDAGeneratorImpl>(gen)->philox_engine_inputs(counter_offset);
+#endif
   }
   apex_dropout_add_kernel<scalar_t, accscalar_t, IndexType><<<grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(inputs, add_inputs, outputs, mask, totalElements, p, rng_engine_inputs);
...
@@ -1033,4 +1033,3 @@ void fused_maybe_adam_undo_cuda(
   }
   THCudaCheck(cudaGetLastError());
 }
#include <torch/extension.h>
void multi_tensor_lamb_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
const float lr,
const float beta1,
const float beta2,
const float epsilon,
const int step,
const int bias_correction,
const float weight_decay,
const int grad_averaging,
const int mode,
const float global_grad_norm,
const float max_grad_norm);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("lamb", &multi_tensor_lamb_cuda, "Computes and apply update for LAMB optimizer");
}
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>
#include <assert.h>
#include "type_shim.h"
#include "multi_tensor_apply.cuh"
#define BLOCK_SIZE 512
#define ILP 4
typedef enum{
MOMENT_MODE_0 =0, // L2 regularization mode
MOMENT_MODE_1 =1 // Decoupled weight decay mode
} adamMode_t;
std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
at::optional<bool> per_tensor_python);
using MATH_T = float;
template<typename T>
struct LAMBStage1Functor
{
__device__ __forceinline__ void operator()(
int chunk_size,
volatile int* noop_gmem,
TensorListMetadata<4>& tl,
const float beta1,
const float beta2,
const float beta3,
const float beta1_correction,
const float beta2_correction,
const float epsilon,
adamMode_t mode,
const float decay,
const float global_grad_norm,
const float max_global_grad_norm)
{
// I'd like this kernel to propagate infs/nans.
// if(*noop_gmem == 1)
// return;
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
float clipped_global_grad_norm = global_grad_norm > max_global_grad_norm ? global_grad_norm / max_global_grad_norm : 1.0f;
T* g = (T*)tl.addresses[0][tensor_loc];
g += chunk_idx*chunk_size;
T* p = (T*)tl.addresses[1][tensor_loc];
p += chunk_idx*chunk_size;
T* m = (T*)tl.addresses[2][tensor_loc];
m += chunk_idx*chunk_size;
T* v = (T*)tl.addresses[3][tensor_loc];
v += chunk_idx*chunk_size;
n -= chunk_idx*chunk_size;
// see note in multi_tensor_scale_kernel.cu
for(int i_start = 0;
i_start < n && i_start < chunk_size;
i_start += blockDim.x*ILP)
{
MATH_T r_g[ILP];
MATH_T r_p[ILP];
MATH_T r_m[ILP];
MATH_T r_v[ILP];
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
r_g[ii] = g[i];
// special ?optimization? for lamb stage 1
if (decay == 0) {
r_p[ii] = MATH_T(0);
}
else {
r_p[ii] = p[i];
}
r_m[ii] = m[i];
r_v[ii] = v[i];
} else {
r_g[ii] = MATH_T(0);
r_p[ii] = MATH_T(0);
r_m[ii] = MATH_T(0);
r_v[ii] = MATH_T(0);
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
if (mode == MOMENT_MODE_0) {
MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
// L2 on scaled grad
scaled_grad = scaled_grad + decay*r_p[ii];
r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
r_p[ii] = next_m_unbiased / denom;
}
else {
MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
r_p[ii] = (next_m_unbiased/denom) + (decay*r_p[ii]);
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
g[i] = r_p[ii];
m[i] = r_m[ii];
v[i] = r_v[ii];
}
}
}
}
};
// Step 2 reads in 'update' value and per-tensor param_norm and update_norm.
// It computes new parameter value.
template<typename T>
struct LAMBStage2Functor
{
__device__ __forceinline__ void operator()(
int chunk_size,
volatile int* noop_gmem,
TensorListMetadata<2>& tl,
const float* per_tensor_param_norm,
const float* per_tensor_update_norm,
const float learning_rate,
const float decay)
{
// I'd like this kernel to propagate infs/nans.
// if(*noop_gmem == 1)
// return;
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int tensor_num = tl.start_tensor_this_launch + tensor_loc;
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
MATH_T ratio = learning_rate;
// apply adaptive learning rate to parameters with non-zero weight decay
if (decay != 0.0)
{
float param_norm = per_tensor_param_norm[tensor_num];
float update_norm = per_tensor_update_norm[tensor_num];
ratio = (update_norm != 0.0f && param_norm != 0.0f) ? learning_rate * (param_norm / update_norm) : learning_rate;
}
T* update = (T*)tl.addresses[0][tensor_loc];
update += chunk_idx*chunk_size;
T* p = (T*)tl.addresses[1][tensor_loc];
p += chunk_idx*chunk_size;
n -= chunk_idx*chunk_size;
for(int i_start = 0;
i_start < n && i_start < chunk_size;
i_start += blockDim.x*ILP)
{
MATH_T r_p[ILP];
MATH_T r_update[ILP];
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
r_p[ii] = p[i];
r_update[ii] = update[i];
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
r_p[ii] = r_p[ii] - (ratio * r_update[ii]);
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
p[i] = r_p[ii];
}
}
}
}
};
void multi_tensor_lamb_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
const float lr,
const float beta1,
const float beta2,
const float epsilon,
const int step,
const int bias_correction,
const float weight_decay,
const int grad_averaging,
const int mode,
const float global_grad_norm,
const float max_grad_norm)
{
using namespace at;
// Master weights and 32-bit momentum (which may change) are not handled by this kernel,
// so we assume all tensors are of the same type.
// Handle bias correction mode
float bias_correction1 = 1.0f, bias_correction2 = 1.0f;
if (bias_correction == 1) {
bias_correction1 = 1 - std::pow(beta1, step);
bias_correction2 = 1 - std::pow(beta2, step);
}
// Handle grad averaging mode
float beta3 = 1.0f;
if (grad_averaging == 1) beta3 = 1 - beta1;
std::vector<std::vector<at::Tensor>> grad_list(tensor_lists.begin(), tensor_lists.begin()+1);
std::vector<std::vector<at::Tensor>> param_list(tensor_lists.begin()+1, tensor_lists.begin()+2);
// Compute per tensor param norm
auto param_norm_tuple = multi_tensor_l2norm_cuda(chunk_size, noop_flag, param_list, true);
// We modify grad in-place to store the update before computing its norm.
// Generally this is not an issue, since people modify grad in step() all the time.
// We could also grab a list of empty tensors to avoid this, but I'd like to save the space/CPU code.
DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "lamb_stage_1",
multi_tensor_apply<4>(
BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
LAMBStage1Functor<scalar_t_0>(),
beta1,
beta2,
beta3, // 1-beta1 or 1 depends on averaging mode
bias_correction1,
bias_correction2,
epsilon,
(adamMode_t) mode,
weight_decay,
global_grad_norm,
max_grad_norm); )
// Compute update norms
auto update_norm_tuple = multi_tensor_l2norm_cuda(chunk_size, noop_flag, grad_list, true);
std::vector<std::vector<at::Tensor>> grad_param_list(tensor_lists.begin(), tensor_lists.begin()+2);
DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "lamb_stage_2",
multi_tensor_apply<2>(
BLOCK_SIZE,
chunk_size,
noop_flag,
grad_param_list,
LAMBStage2Functor<scalar_t_0>(),
std::get<1>(param_norm_tuple).DATA_PTR<float>(),
std::get<1>(update_norm_tuple).DATA_PTR<float>(),
lr,
weight_decay); )
AT_CUDA_CHECK(cudaGetLastError());
}
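Note: the two functors above implement the standard LAMB update. A scalar Python sketch of the math for one parameter (my own names; it ignores the chunked multi-tensor bookkeeping, assumes bias correction is enabled, and uses abs() where the real stage-2 kernel uses per-tensor L2 norms):

import math

def lamb_reference(p, g, m, v, lr, beta1, beta2, eps, step, weight_decay,
                   grad_averaging=True, adam_w_mode=True,
                   global_grad_norm=1.0, max_grad_norm=1.0):
    # Stage 1: Adam-style moments on the (optionally clipped) gradient.
    clip = global_grad_norm / max_grad_norm if global_grad_norm > max_grad_norm else 1.0
    g = g / clip
    if not adam_w_mode:              # MOMENT_MODE_0: L2 penalty folded into the gradient
        g = g + weight_decay * p
    beta3 = 1.0 - beta1 if grad_averaging else 1.0
    m = beta1 * m + beta3 * g
    v = beta2 * v + (1.0 - beta2) * g * g
    m_hat = m / (1.0 - beta1 ** step)
    v_hat = v / (1.0 - beta2 ** step)
    update = m_hat / (math.sqrt(v_hat) + eps)
    if adam_w_mode:                  # MOMENT_MODE_1: decoupled weight decay added to the update
        update = update + weight_decay * p
    # Stage 2: trust ratio rescales the learning rate per tensor.
    param_norm, update_norm = abs(p), abs(update)
    ratio = lr * (param_norm / update_norm) if (weight_decay != 0 and param_norm != 0 and update_norm != 0) else lr
    return p - ratio * update, m, v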
/**
 * From PyTorch:
 *
 * Copyright (c) 2016- Facebook, Inc (Adam Paszke)
 * Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
 * Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
@@ -10,54 +10,54 @@
 * Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
 * Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
 * Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
 *
 * From Caffe2:
 *
 * Copyright (c) 2016-present, Facebook Inc. All rights reserved.
 *
 * All contributions by Facebook:
 * Copyright (c) 2016 Facebook Inc.
 *
 * All contributions by Google:
 * Copyright (c) 2015 Google Inc.
 * All rights reserved.
 *
 * All contributions by Yangqing Jia:
 * Copyright (c) 2015 Yangqing Jia
 * All rights reserved.
 *
 * All contributions from Caffe:
 * Copyright(c) 2013, 2014, 2015, the respective contributors
 * All rights reserved.
 *
 * All other contributions:
 * Copyright(c) 2015, 2016 the respective contributors
 * All rights reserved.
 *
 * Caffe2 uses a copyright model similar to Caffe: each contributor holds
 * copyright over their contributions to Caffe2. The project versioning records
 * all such contribution and copyright details. If a contributor wants to further
 * mark their specific copyright on a particular contribution, they should
 * indicate their copyright solely in the commit message of the change when it is
 * committed.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
 *    and IDIAP Research Institute nor the names of its contributors may be
 *    used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@@ -70,7 +70,6 @@
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
@@ -84,6 +83,8 @@
 #include "type_shim.h"
 #include "compat.h"
+#define ALIGN_BYTES 16
 using Tensor = at::Tensor;
 using TensorList = at::TensorList;
 using ScalarType = at::ScalarType;
@@ -123,7 +124,7 @@ const int max_threads = 1024;
 inline dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) {
   uint64_t block_size = 1;
   uint64_t max_block_size = std::min(dim_size / ILP, static_cast<uint64_t>(max_threads));
-  while (block_size < max_block_size) block_size *= 2;
+  while (block_size < (max_block_size/2)) block_size *= 2;
   // Launch at least a single warp - the kernel assumes that.
   block_size = std::max(block_size, static_cast<uint64_t>(32));
   return dim3(block_size);
@@ -287,29 +288,40 @@ blockReduce(AccumT* smem,
 template <template<typename, typename> class Reduction, int ILP, typename T, typename AccumT>
 __device__ __forceinline__ AccumT
-ilpReduce(T* data,
+ilpReduce(int shift,
+          T* data,
           int size,
           const Reduction<T, AccumT>& r,
           AccumT defaultVal)
 {
+  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LoadT;
   AccumT threadVal = defaultVal;
   int offset = threadIdx.x;
+  // shift and do 1
+  if(shift > 0){
+    data -= shift;
+    size += shift;
+    if(threadIdx.x >= shift){
+      threadVal = r(threadVal, data[offset]);
+    }
+    size -= blockDim.x;
+    data += blockDim.x;
+  }
   int last = size % (ILP * blockDim.x);
-  // Body (unroll by ILP times)
-  for (; offset < size - last; offset += blockDim.x * ILP) {
-    T tmp[ILP];
-#pragma unroll
-    for (int j = 0; j < ILP; ++j)
-      tmp[j] = data[offset + j * blockDim.x];
-#pragma unroll
-    for (int j = 0; j < ILP; ++j)
-      threadVal = r(threadVal, tmp[j]);
+  T v[ILP];
+  LoadT* value = reinterpret_cast<LoadT*>(&v);
+  for (; offset * ILP < (size - last); offset += blockDim.x) {
+    *value = reinterpret_cast<LoadT*>(data)[offset];
+    for (int j = 0; j < ILP; ++j) {
+      threadVal = r(threadVal, v[j]);
+    }
   }
+  offset = size - last + threadIdx.x;
   // Epilogue
   for (; offset < size; offset += blockDim.x)
     threadVal = r(threadVal, data[offset]);
 template <template<typename, typename> class Reduction1, template<typename, typename> class Reduction2, int ILP, typename T, typename AccumT>
 __device__ __forceinline__ void
-ilpReduce(T* data,
+ilpReduce(int shift,
+          T* data,
           int size,
           AccumT* reducVal1,
           const Reduction1<T, AccumT>& r1,
@@ -328,27 +341,38 @@ ilpReduce(T* data,
           const Reduction2<T, AccumT>& r2,
           AccumT defaultVal2)
 {
+  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LoadT;
   AccumT threadVal1 = defaultVal1;
   AccumT threadVal2 = defaultVal2;
   int offset = threadIdx.x;
+  // shift and do 1
+  if(shift > 0){
+    data -= shift;
+    size += shift;
+    if(threadIdx.x >= shift){
+      threadVal1 = r1(threadVal1, data[offset]);
+      threadVal2 = r2(threadVal2, data[offset]);
+    }
+    size -= blockDim.x;
+    data += blockDim.x;
+  }
   int last = size % (ILP * blockDim.x);
-  // Body (unroll by ILP times)
-  for (; offset < size - last; offset += blockDim.x * ILP) {
-    T tmp[ILP];
-#pragma unroll
-    for (int j = 0; j < ILP; ++j)
-      tmp[j] = data[offset + j * blockDim.x];
-#pragma unroll
+  T v[ILP];
+  LoadT* value = reinterpret_cast<LoadT*>(&v);
+  for (; offset * ILP < (size - last); offset += blockDim.x) {
+    *value = reinterpret_cast<LoadT*>(data)[offset];
     for (int j = 0; j < ILP; ++j) {
-      threadVal1 = r1(threadVal1, tmp[j]);
-      threadVal2 = r2(threadVal2, tmp[j]);
+      threadVal1 = r1(threadVal1, v[j]);
+      threadVal2 = r2(threadVal2, v[j]);
     }
   }
+  offset = size - last + threadIdx.x;
   // Epilogue
   for (; offset < size; offset += blockDim.x) {
     threadVal1 = r1(threadVal1, data[offset]);
@@ -375,17 +399,19 @@ cunn_SoftMaxXEntropyForward(
   // each block handles a sample in the mini-batch
   input += blockIdx.x * classes;
   //output += blockIdx.x * classes;
+  const int shift = ((uint64_t)input) % ALIGN_BYTES / sizeof(scalar_t);
   int64_t label = labels[blockIdx.x];
   // find the max and sum
   accscalar_t threadMax, threadSum, max_k, sum_k;
   ilpReduce<MaxFloat, AddFloat, ILP, scalar_t, accscalar_t>(
-      input, classes,
+      shift, input, classes,
       &threadMax, MaxFloat<scalar_t, accscalar_t>(),
       -at::numeric_limits<accscalar_t>::max(),
       &threadSum, AddFloat<scalar_t, accscalar_t>(),
       static_cast<accscalar_t>(0));
   blockReduce<Max, Add, accscalar_t>(
       sdata,
       &max_k, threadMax, Max<accscalar_t>(),
@@ -393,9 +419,7 @@ cunn_SoftMaxXEntropyForward(
       &sum_k, threadSum, Add<accscalar_t>(),
       static_cast<accscalar_t>(0));
-  // reduce all values
-  accscalar_t threadExp = ilpReduce<SumExpFloat, ILP, scalar_t, accscalar_t>(
-      input, classes, SumExpFloat<scalar_t, accscalar_t>(max_k), static_cast<accscalar_t>(0));
+  accscalar_t threadExp = ilpReduce<SumExpFloat, ILP, scalar_t, accscalar_t>(shift, input, classes, SumExpFloat<scalar_t, accscalar_t>(max_k), static_cast<accscalar_t>(0));
   accscalar_t sumAll = blockReduce<Add, accscalar_t>(
       sdata, threadExp, Add<accscalar_t>(), static_cast<accscalar_t>(0));
@@ -411,20 +435,16 @@ cunn_SoftMaxXEntropyForward(
   }
 }
-template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t, template<typename, typename, typename> class Epilogue>
-__global__ void
-cunn_SoftMaxXEntropyBackward(
-    scalar_t *gradInput,
-    scalar_t *logits,
-    outscalar_t *max_log_sum_exp,
-    outscalar_t *gradOutput,
-    int64_t *labels,
-    const float smoothing,
-    int classes)
+template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t>
+__device__ __forceinline__ void
+apply(scalar_t *gradInput,
+      scalar_t *logits,
+      outscalar_t *max_log_sum_exp,
+      outscalar_t *gradOutput,
+      int64_t *labels,
+      const float smoothing,
+      int classes)
 {
-  gradInput += blockIdx.x * classes;
-  logits += blockIdx.x * classes;
   accscalar_t smooth_positives = 1.0 - smoothing;
   accscalar_t smooth_negatives = smoothing / classes;
   accscalar_t tmpGradOutput = gradOutput[blockIdx.x];
@@ -433,6 +453,7 @@ cunn_SoftMaxXEntropyBackward(
   int offset = threadIdx.x;
   int last = classes % (ILP * blockDim.x);
   for (; offset < classes - last; offset += blockDim.x * ILP) {
     accscalar_t tmpLogits[ILP];
@@ -444,22 +465,112 @@ cunn_SoftMaxXEntropyBackward(
 #pragma unroll
     for (int j = 0; j < ILP; ++j)
       gradInput[offset + j * blockDim.x] = tmpGradOutput * (
          std::exp(tmpLogits[j] - coeff) - static_cast<accscalar_t>(
          (offset + j * blockDim.x == label) ? 1 : 0) *
          smooth_positives - smooth_negatives);
   }
   for (; offset < classes; offset += blockDim.x)
     gradInput[offset] = tmpGradOutput * (std::exp(
         static_cast<accscalar_t>(logits[offset]) - coeff) -
         static_cast<accscalar_t>((offset == label) ? 1 : 0) *
         smooth_positives - smooth_negatives);
 }
template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t>
__device__ __forceinline__ void
aligned_apply(int shift,
scalar_t *gradInput,
scalar_t *logits,
outscalar_t *max_log_sum_exp,
outscalar_t *gradOutput,
int64_t *labels,
const float smoothing,
int classes)
{
accscalar_t smooth_positives = 1.0 - smoothing;
accscalar_t smooth_negatives = smoothing / classes;
accscalar_t tmpGradOutput = gradOutput[blockIdx.x];
int64_t label = labels[blockIdx.x];
accscalar_t coeff = max_log_sum_exp[blockIdx.x];
int offset = threadIdx.x;
// shift and do 1
if(shift > 0){
logits -= shift;
gradInput -= shift;
classes += shift;
if(threadIdx.x >= shift){
gradInput[offset] = tmpGradOutput * (std::exp(
static_cast<accscalar_t>(logits[offset]) - coeff) -
static_cast<accscalar_t>(((offset - shift) == label) ? 1 : 0) *
smooth_positives - smooth_negatives);
}
classes -= blockDim.x;
gradInput += blockDim.x;
logits += blockDim.x;
shift -= blockDim.x;
}
int last = classes % (ILP * blockDim.x);
typedef typename std::aligned_storage<ILP*sizeof(scalar_t), ILP*alignof(scalar_t)>::type LoadT;
// input
scalar_t v[ILP];
LoadT* value = reinterpret_cast<LoadT*>(&v);
// output
scalar_t r[ILP];
LoadT* result = reinterpret_cast<LoadT*>(&r);
for (; offset * ILP < (classes - last); offset += blockDim.x) {
*value = reinterpret_cast<LoadT*>(logits)[offset];
#pragma unroll
for (int j = 0; j < ILP; ++j) {
r[j] = tmpGradOutput * (std::exp(
static_cast<accscalar_t>(v[j]) - coeff) -
static_cast<accscalar_t>(((ILP * offset + j - shift) == label) ? 1 : 0) *
smooth_positives - smooth_negatives);
}
reinterpret_cast<LoadT*>(gradInput)[offset] = *result;
}
offset = classes - last + threadIdx.x;
for (; offset < classes; offset += blockDim.x)
gradInput[offset] = tmpGradOutput * (std::exp(
static_cast<accscalar_t>(logits[offset]) - coeff) -
static_cast<accscalar_t>(((offset - shift) == label) ? 1 : 0) *
smooth_positives - smooth_negatives);
}
template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t, template<typename, typename, typename> class Epilogue>
__global__ void
cunn_SoftMaxXEntropyBackward(
scalar_t *gradInput,
scalar_t *logits,
outscalar_t *max_log_sum_exp,
outscalar_t *gradOutput,
int64_t *labels,
const float smoothing,
int classes)
{
gradInput += blockIdx.x * classes;
logits += blockIdx.x * classes;
// Do vectorized load/store when input/output have same alignment
const int shift = ((uint64_t)logits) % ALIGN_BYTES / sizeof(scalar_t);
const int shift_ = ((uint64_t)gradInput) % ALIGN_BYTES / sizeof(scalar_t);
if (shift == shift_){
aligned_apply<ILP, scalar_t, accscalar_t, outscalar_t>(shift, gradInput, logits, max_log_sum_exp, gradOutput, labels, smoothing, classes);
}
else {
apply<ILP, scalar_t, accscalar_t, outscalar_t>(gradInput, logits, max_log_sum_exp, gradOutput, labels, smoothing, classes);
}
}
 template<template<typename, typename, typename> class Epilogue>
 std::vector<Tensor> host_softmax_xentropy(
@@ -495,13 +606,13 @@ std::vector<Tensor> host_softmax_xentropy(
   // XXX: it assumes that inner_size == 1
   TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");
-  const int ILP = 2;
   dim3 grid(outer_size);
-  dim3 block = SoftMax_getBlockSize(ILP, dim_size);
   using namespace at;
   DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "host_softmax_xentropy",
     using accscalar_t = at::acc_type<scalar_t_0, true>;
+    const int ILP = sizeof(float4)/sizeof(scalar_t_0);
+    dim3 block = SoftMax_getBlockSize(ILP, dim_size);
     if (!half_to_float) {
       cunn_SoftMaxXEntropyForward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
         <<<grid, block, 2 * block.x * sizeof(accscalar_t), stream>>>(
@@ -564,12 +675,12 @@ Tensor host_softmax_xentropy_backward(
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");
-  const int ILP = 2;
   dim3 grid(outer_size);
-  dim3 block = SoftMax_getBlockSize(ILP, dim_size);
   DISPATCH_FLOAT_AND_HALF(gI.scalar_type(), 0, "host_softmax_xentropy_backward",
     using accscalar_t = acc_type<scalar_t_0, true>;
+    const int ILP = sizeof(float4)/sizeof(scalar_t_0);
+    dim3 block = SoftMax_getBlockSize(ILP, dim_size);
     if (!half_to_float) {
       cunn_SoftMaxXEntropyBackward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
         <<<grid, block, block.x * sizeof(accscalar_t), stream>>>(
...
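Note: since sizeof(float4) is 16 bytes, the ILP chosen inside the dispatch works out to 4 elements per thread for fp32 and 8 for fp16, i.e. one 16-byte (ALIGN_BYTES) vector load per thread per iteration. A quick check of that arithmetic:

# Bytes per 16-byte vector load divided by the element size gives the per-thread ILP.
sizeof_float4 = 16
for name, sizeof_scalar in {"float": 4, "half": 2}.items():
    print(name, sizeof_float4 // sizeof_scalar)   # float -> 4, half -> 8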
@@ -183,7 +183,7 @@ class SelfAttnFunc(torch.autograd.Function):
         values_grads = torch.bmm(dropout_results.transpose(1,2), output_lin_grads, out=values_grads.transpose(0,1))
         # Mask and Scaling for Dropout (not a publically documented op)
-        dropout_grads = torch._masked_scale(matmul2_dgrad1, dropout_mask, dropout_prob_t[0])
+        dropout_grads = torch._masked_scale(matmul2_dgrad1, dropout_mask, 1.0/(1.0-dropout_prob_t[0]))
         # Softmax Grad (not a publically documented op)
         softmax_grads = torch._softmax_backward_data(dropout_grads, softmax_results, -1, softmax_results)
...
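Note: `torch._masked_scale(x, mask, s)` multiplies by both the mask and the scale, so passing `1.0/(1.0-p)` gives the usual inverted-dropout rescaling that the old call was missing. A rough plain-PyTorch equivalent, assuming a 0/1 `mask` and dropout probability `p` (illustrative only, not the fused op itself):

import torch

def masked_scale_reference(x: torch.Tensor, mask: torch.Tensor, p: float) -> torch.Tensor:
    # Dropped elements are zeroed by the mask; survivors are scaled by 1/(1-p)
    # so the expected value matches the no-dropout forward pass.
    return x * mask * (1.0 / (1.0 - p))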
 from .fp16_optimizer import FP16_Optimizer
 from .fused_adam import FusedAdam
+from .fused_lamb import FusedLAMB
@@ -239,4 +239,5 @@ class FP16_Optimizer(object):
         # constructed in the same way as the one whose state_dict we are loading, the same master params
         # are guaranteed to exist, so we can just copy_() from the saved master params.
         for current, saved in zip(self.fp32_groups, state_dict['fp32_groups']):
-            current.data.copy_(saved.data)
+            for _current, _saved in zip(current, saved):
+                _current.data.copy_(_saved.data)
import torch
import importlib
import math
from apex.multi_tensor_apply import multi_tensor_applier
class FusedLAMB(torch.optim.Optimizer):
"""Implements LAMB algorithm.
Currently GPU-only. Requires Apex to be installed via
``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_lamb" ./``.
This version of fused LAMB implements 2 fusions.
* Fusion of the LAMB update's elementwise operations
* A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.
:class:`apex.contrib.optimizers.FusedLAMB`'s usage is identical to any ordinary Pytorch optimizer::
opt = apex.contrib.optimizers.FusedLAMB(model.parameters(), lr = ....)
...
opt.step()
:class:`apex.optimizers.FusedLAMB` may be used with or without Amp. If you wish to use :class:`FusedLAMB` with Amp,
you may choose any ``opt_level``::
opt = apex.optimizers.FusedLAMB(model.parameters(), lr = ....)
model, opt = amp.initialize(model, opt, opt_level="O0" or "O1" or "O2")
...
opt.step()
In general, ``opt_level="O1"`` is recommended.
LAMB was proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_.
Arguments:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups.
lr (float, optional): learning rate. (default: 1e-3)
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its norm. (default: (0.9, 0.999))
eps (float, optional): term added to the denominator to improve
numerical stability. (default: 1e-8)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
NOT SUPPORTED now! (default: False)
adam_w_mode (boolean, optional): Apply L2 regularization or weight decay
True for decoupled weight decay(also known as AdamW) (default: True)
grad_averaging (bool, optional): whether to apply (1-beta1) to the gradient when
calculating running averages of the gradient. (default: True)
set_grad_none (bool, optional): whether set grad to None when zero_grad()
method is called. (default: True)
max_grad_norm (float, optional): value used to clip global grad norm
(default: 1.0)
.. _Large Batch Optimization for Deep Learning\: Training BERT in 76 minutes:
https://arxiv.org/abs/1904.00962
.. _On the Convergence of Adam and Beyond:
https://openreview.net/forum?id=ryQu7f-RZ
"""
def __init__(self, params, lr=1e-3, bias_correction=True,
betas=(0.9, 0.999), eps=1e-6, weight_decay=0.01,
amsgrad=False, adam_w_mode=True,
grad_averaging=True, set_grad_none=True,
max_grad_norm=1.0):
if amsgrad:
raise RuntimeError('FusedLAMB does not support the AMSGrad variant.')
defaults = dict(lr=lr, bias_correction=bias_correction,
betas=betas, eps=eps, weight_decay=weight_decay,
grad_averaging=grad_averaging,
max_grad_norm=max_grad_norm)
super(FusedLAMB, self).__init__(params, defaults)
if multi_tensor_applier.available:
import amp_C
self.multi_tensor_l2norm=amp_C.multi_tensor_l2norm
self._dummy_overflow_buf = torch.cuda.IntTensor([0])
fused_lamb_cuda = importlib.import_module("fused_lamb_cuda")
self.multi_tensor_lamb = fused_lamb_cuda.lamb
else:
raise RuntimeError('apex.contrib.optimizers.FusedLAMB requires cuda extensions')
self.adam_w_mode = 1 if adam_w_mode else 0
self.set_grad_none = set_grad_none
def zero_grad(self):
if self.set_grad_none:
for group in self.param_groups:
for p in group['params']:
p.grad = None
else:
super(FusedLAMB, self).zero_grad()
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
# create separate grad lists for fp32 and fp16 params
g_all_32, g_all_16 = [], []
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
if p.dtype == torch.float32:
g_all_32.append(p.grad.data)
elif p.dtype == torch.float16:
g_all_16.append(p.grad.data)
else:
raise RuntimeError('FusedLAMB only supports fp16 and fp32.')
g_norm_32, g_norm_16 = 0.0, 0.0
# compute grad norm for two lists
if len(g_all_32) > 0:
g_norm_32 = multi_tensor_applier(self.multi_tensor_l2norm,
self._dummy_overflow_buf,
[g_all_32], False)[0].item()
if len(g_all_16) > 0:
g_norm_16 = multi_tensor_applier(self.multi_tensor_l2norm,
self._dummy_overflow_buf,
[g_all_16], False)[0].item()
# blend two grad norms to get global grad norm
global_grad_norm = math.sqrt(g_norm_32 * g_norm_32 + g_norm_16 * g_norm_16)
max_grad_norm = self.defaults['max_grad_norm']
for group in self.param_groups:
bias_correction = 1 if group['bias_correction'] else 0
beta1, beta2 = group['betas']
grad_averaging = 1 if group['grad_averaging'] else 0
# assume same step across group now to simplify things
# a per-parameter step could easily be supported by making it a tensor, or by passing a list into the kernel
if 'step' in group:
group['step'] += 1
else:
group['step'] = 1
# create lists for multi-tensor apply
g_16, p_16, m_16, v_16 = [], [], [], []
g_32, p_32, m_32, v_32 = [], [], [], []
for p in group['params']:
if p.grad is None:
continue
if p.grad.data.is_sparse:
raise RuntimeError('FusedLAMB does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p]
# State initialization
if len(state) == 0:
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p.data)
if p.dtype == torch.float16:
g_16.append(p.grad.data)
p_16.append(p.data)
m_16.append(state['exp_avg'])
v_16.append(state['exp_avg_sq'])
elif p.dtype == torch.float32:
g_32.append(p.grad.data)
p_32.append(p.data)
m_32.append(state['exp_avg'])
v_32.append(state['exp_avg_sq'])
else:
raise RuntimeError('FusedLAMB only supports fp16 and fp32.')
if(len(g_16) > 0):
multi_tensor_applier(self.multi_tensor_lamb,
self._dummy_overflow_buf,
[g_16, p_16, m_16, v_16],
group['lr'],
beta1,
beta2,
group['eps'],
group['step'],
bias_correction,
group['weight_decay'],
grad_averaging,
self.adam_w_mode,
global_grad_norm,
max_grad_norm)
if(len(g_32) > 0):
multi_tensor_applier(self.multi_tensor_lamb,
self._dummy_overflow_buf,
[g_32, p_32, m_32, v_32],
group['lr'],
beta1,
beta2,
group['eps'],
group['step'],
bias_correction,
group['weight_decay'],
grad_averaging,
self.adam_w_mode,
global_grad_norm,
max_grad_norm)
return loss
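Note: a minimal usage sketch for the optimizer above, assuming apex was built with the contrib extensions (amp_C and fused_lamb_cuda); the model, sizes, and hyperparameters are placeholders:

import torch
from apex.contrib.optimizers import FusedLAMB

model = torch.nn.Linear(1024, 1024).cuda()
optimizer = FusedLAMB(model.parameters(), lr=1e-3, weight_decay=0.01, max_grad_norm=1.0)

loss = model(torch.randn(32, 1024, device='cuda')).pow(2).mean()
loss.backward()
optimizer.step()       # computes the global grad norm, then launches the fused LAMB kernels
optimizer.zero_grad()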
from .mlp import *
from copy import copy
import math
import torch
from torch import nn
import mlp_cuda
from .. import amp
class MlpFunction(torch.autograd.Function):
@staticmethod
def forward(ctx, bias, activation, *args):
output = mlp_cuda.forward(bias, activation, args)
ctx.save_for_backward(*args)
ctx.outputs = output
ctx.bias = bias
ctx.activation = activation
return output[0]
@staticmethod
def backward(ctx, grad_o):
grads = mlp_cuda.backward(ctx.bias, ctx.activation, grad_o, ctx.outputs, ctx.saved_tensors)
del ctx.outputs
return (None, None, *grads)
mlp_function = amp.half_function(MlpFunction.apply)
class MLP(torch.nn.Module):
"""Launch MLP in C++
Args:
mlp_sizes (list of int): MLP sizes. Example: [1024,1024,1024] will create 2 MLP layers with shape 1024x1024
bias (bool): Default True
activation (str): 'none', 'relu', or 'sigmoid'. Default 'relu'
"""
def __init__(self, mlp_sizes, bias=True, activation='relu'):
super(MLP, self).__init__()
self.num_layers = len(mlp_sizes) - 1
self.mlp_sizes = copy(mlp_sizes)
self.bias = 1 if bias else 0
if activation == 'none':
self.activation = 0
elif activation == 'relu':
self.activation = 1
elif activation == 'sigmoid':
self.activation = 2
else:
raise TypeError("activation must be 'none', 'relu', or 'sigmoid'.")
self.weights = []
self.biases = []
for i in range(self.num_layers):
w = torch.nn.Parameter(torch.empty(mlp_sizes[i+1], mlp_sizes[i]))
self.weights.append(w)
name = 'weight_{}'.format(i)
setattr(self, name, w)
if self.bias:
b = torch.nn.Parameter(torch.empty(mlp_sizes[i+1]))
self.biases.append(b)
name = 'bias_{}'.format(i)
setattr(self, name, b)
self.reset_parameters()
def reset_parameters(self):
for weight in self.weights:
dimsum = weight.size(0) + weight.size(1)
std = math.sqrt(2. / float(dimsum))
nn.init.normal_(weight, 0., std)
if self.bias:
for bias in self.biases:
std = math.sqrt(1. / float(bias.size(0)))
nn.init.normal_(bias, 0., std)
def forward(self, input):
return mlp_function(self.bias, self.activation, input, *self.weights, *self.biases)
def extra_repr(self):
s = F"MLP sizes: {self.mlp_sizes}, Bias={self.bias}, activation={self.activation}"
return s
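Note: a small usage sketch for the MLP module above, assuming the mlp_cuda extension is built and a CUDA device is available (the layer sizes are arbitrary):

import torch
from apex.mlp import MLP

mlp = MLP([480, 1024, 1024, 512], bias=True, activation='relu').cuda()
x = torch.randn(128, 480, device='cuda', requires_grad=True)
y = mlp(x)              # one fused C++ call instead of a chain of addmm/activation ops
y.sum().backward()      # input, weight, and bias grads come back from mlp_cuda.backward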
@@ -37,7 +37,6 @@ class LARC(object):
     """
     def __init__(self, optimizer, trust_coefficient=0.02, clip=True, eps=1e-8):
-        self.param_groups = optimizer.param_groups
         self.optim = optimizer
         self.trust_coefficient = trust_coefficient
         self.eps = eps
@@ -49,9 +48,21 @@ class LARC(object):
     def __setstate__(self, state):
         self.optim.__setstate__(state)

+    @property
+    def state(self):
+        return self.optim.state

     def __repr__(self):
         return self.optim.__repr__()

+    @property
+    def param_groups(self):
+        return self.optim.param_groups

+    @param_groups.setter
+    def param_groups(self, value):
+        self.optim.param_groups = value

     def state_dict(self):
         return self.optim.state_dict()
...
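Note: turning param_groups into a property (instead of caching it in __init__) means the LARC wrapper always reflects, and forwards writes to, the wrapped optimizer's groups. A small illustration under that assumption (standalone sketch, not part of this commit):

import torch
from apex.parallel.LARC import LARC

base_opt = torch.optim.SGD(torch.nn.Linear(10, 10).parameters(), lr=0.1)
opt = LARC(base_opt)

opt.param_groups[0]['lr'] = 0.05            # same underlying list as base_opt.param_groups,
assert base_opt.param_groups[0]['lr'] == 0.05   # so LR changes made through the wrapper are visible to base_opt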
 import warnings
-from . import nvtx
+from . import nvtx, prof
@@ -9,10 +9,10 @@ class Utility(object):
     @staticmethod
     def typeToBytes(t):
-        if (t in ["uint8", "int8", "byte", "char"]):
+        if (t in ["uint8", "int8", "byte", "char", "bool"]):
             return 1
         elif (t in ["float16", "half", "int16", "short"]):
             return 2
         elif (t in ["float32", "float", "int32", "int"]):
             return 4
         elif (t in ["int64", "long", "float64", "double"]):
@@ -21,7 +21,7 @@ class Utility(object):
     @staticmethod
     def typeToString(t):
-        if (t in ["uint8", "byte", "char"]):
+        if (t in ["uint8", "byte", "char",]):
             return "uint8"
         elif (t in ["int8",]):
             return "int8"
@@ -37,6 +37,8 @@ class Utility(object):
             return "int64"
         elif (t in ["float64", "double",]):
             return "fp64"
+        elif (t in ["bool",]):
+            return "bool"
         assert False

     @staticmethod
...
#include <torch/extension.h>
#include <torch/torch.h>
#include <vector>
#include <stdio.h>
size_t get_mlp_reserved_space(int batch_size, int num_layers, const int* output_features);
template <typename T>
size_t get_mlp_bp_workspace_in_bytes(int batch_size, int num_layers, const int* output_features);
template <typename T>
int mlp_fp(
T* X,
int input_features,
int batch_size,
T** WPtr,
int num_layers,
int* output_features,
T** BPtr,
T* Y,
T* reserved_space,
int use_bias,
int activation);
template <typename T>
int mlp_bp(
T* X,
T* Y,
int input_features,
int batch_size,
T** WPtr,
int num_layers,
int* output_features,
T* dY,
T* reserved_space,
T* work_space,
T* dX,
T** dwPtr,
T** dbPtr,
bool requires_grad,
int use_bias,
int activation);
std::vector<at::Tensor> mlp_forward(int use_bias, int activation, std::vector<at::Tensor> inputs) {
auto num_layers = inputs.size() - 1;
if (use_bias) {
// inputs contains (input, weights, biases)
num_layers /= 2;
}
auto batch_size = inputs[0].size(0);
auto input_features = inputs[0].size(1);
std::vector<int> output_features;
for (int i = 0; i < num_layers; i++) {
output_features.push_back(inputs[i + 1].size(0));
}
auto reserved_size = get_mlp_reserved_space(batch_size, num_layers, output_features.data());
// create output/workspace tensor
// TODO(deyuf): just get buffer?
auto out = at::empty({batch_size, output_features.back()}, inputs[0].type());
auto reserved_space = at::empty({reserved_size}, inputs[0].type());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(inputs[0].type(), "mlp_forward", [&] {
std::vector<scalar_t*> w_ptr;
std::vector<scalar_t*> b_ptr;
for (int i = 0; i < num_layers; i++) {
w_ptr.push_back(inputs[i + 1].data_ptr<scalar_t>());
if (use_bias) {
b_ptr.push_back(inputs[i + 1 + num_layers].data_ptr<scalar_t>());
}
}
auto result = mlp_fp<scalar_t>(
inputs[0].data_ptr<scalar_t>(),
input_features,
batch_size,
w_ptr.data(),
num_layers,
output_features.data(),
b_ptr.data(),
out.data_ptr<scalar_t>(),
reserved_space.data_ptr<scalar_t>(),
use_bias,
activation);
});
return {out, reserved_space};
}
std::vector<at::Tensor> mlp_backward(
int use_bias,
int activation,
at::Tensor grad_o,
std::vector<at::Tensor> fprop_outputs,
std::vector<at::Tensor> inputs) {
auto num_layers = inputs.size() - 1;
if (use_bias) {
// inputs contains (input, weights, biases)
num_layers /= 2;
}
auto batch_size = inputs[0].size(0);
auto input_features = inputs[0].size(1);
// TODO: not creating empty tensor for it?
bool requires_grad = inputs[0].requires_grad();
std::vector<int> output_features;
for (int i = 0; i < num_layers; i++) {
output_features.push_back(inputs[i + 1].size(0));
}
// create outputs, length of inputs
// TODO: not create bias if not needed
std::vector<at::Tensor> outputs;
for (int i = 0; i < inputs.size(); i++) {
outputs.push_back(at::empty(inputs[i].sizes(), inputs[i].type())); // clone for testing now
}
AT_DISPATCH_FLOATING_TYPES_AND_HALF(inputs[0].type(), "mlp_backward", [&] {
std::vector<scalar_t*> w_ptr;
for (int i = 0; i < num_layers; i++) {
w_ptr.push_back(inputs[i + 1].data_ptr<scalar_t>());
}
std::vector<scalar_t*> outputs_ptr;
for (int i = 0; i < inputs.size(); i++) {
outputs_ptr.push_back(outputs[i].data_ptr<scalar_t>());
}
auto work_size =
get_mlp_bp_workspace_in_bytes<scalar_t>(batch_size, num_layers, output_features.data());
// auto work_space = at::empty({work_size*4}, at::kByte);
auto work_space = at::empty({work_size / sizeof(scalar_t)}, inputs[0].type());
auto result = mlp_bp<scalar_t>(
inputs[0].data_ptr<scalar_t>(),
fprop_outputs[0].data_ptr<scalar_t>(),
input_features,
batch_size,
w_ptr.data(),
num_layers,
output_features.data(),
grad_o.contiguous().data_ptr<scalar_t>(),
fprop_outputs[1].data_ptr<scalar_t>(),
work_space.data_ptr<scalar_t>(),
outputs_ptr[0],
outputs_ptr.data() + 1,
outputs_ptr.data() + 1 + num_layers,
requires_grad,
use_bias,
activation);
});
return outputs;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &mlp_forward, "MLP forward");
m.def("backward", &mlp_backward, "MLP backward");
}
This diff is collapsed.
@@ -13,6 +13,17 @@
 #define BLOCK_SIZE 512
 #define ILP 4
+template<typename T>
+__device__ __forceinline__ bool is_aligned(T* p){
+  return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
+}
+template<typename T>
+__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
+  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
+  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
+}
 template<typename x_t, typename y_t, typename out_t>
 struct AxpbyFunctor
 {
@@ -43,46 +54,74 @@ struct AxpbyFunctor
       n -= chunk_idx*chunk_size;
-      // Non-divergent exit condition for __syncthreads, not necessary here
-      float xs[ILP];
-      float ys[ILP];
-      for(int i_start = 0;
-          i_start < n && i_start < chunk_size;
-          i_start += blockDim.x*ILP)
-      {
-#pragma unroll
-        for(int ii = 0; ii < ILP; ii++)
-        {
-          xs[ii] = 0;
-          ys[ii] = 0;
-          int i = i_start + threadIdx.x + ii*blockDim.x;
-          if(i < n && i < chunk_size)
-          {
-            xs[ii] = static_cast<float>(x[i]);
-            ys[ii] = static_cast<float>(y[i]);
-          }
-        }
-        // see note in multi_tensor_scale_kernel.cu
-#pragma unroll
-        for(int ii = 0; ii < ILP; ii++)
-        {
-          int i = i_start + threadIdx.x + ii*blockDim.x;
-          if(i < n && i < chunk_size)
-          {
-            out[i] = static_cast<out_t>(a*xs[ii] + b*ys[ii]);
-            bool finite = true;
-            if(arg_to_check == -1)
-              finite = (isfinite(xs[ii]) && isfinite(ys[ii]));
-            if(arg_to_check == 0)
-              finite = isfinite(xs[ii]);
-            if(arg_to_check == 1)
-              finite = isfinite(ys[ii]);
-            if(!finite)
-              *noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
-          }
-        }
-      }
+      bool finite = true;
+      x_t r_x[ILP];
+      y_t r_y[ILP];
+      out_t r_out[ILP];
+      // to make things simple, we put aligned case in a different code path
+      if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x) && is_aligned(y) && is_aligned(out))
+      {
+        for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
+        {
+          // load
+          load_store(r_x, x, 0 , i_start);
+          load_store(r_y, y, 0 , i_start);
+#pragma unroll
+          for(int ii = 0; ii < ILP; ii++)
+          {
+            r_out[ii] = a*static_cast<float>(r_x[ii]) + b*static_cast<float>(r_y[ii]);
+            if(arg_to_check == -1)
+              finite = finite && (isfinite(r_x[ii]) && isfinite(r_y[ii]));
+            if(arg_to_check == 0)
+              finite = finite && isfinite(r_x[ii]);
+            if(arg_to_check == 1)
+              finite = finite && isfinite(r_y[ii]);
+          }
+          // store
+          load_store(out, r_out, i_start , 0);
+        }
+      }
+      else
+      {
+        // Non-divergent exit condition for __syncthreads, not necessary here
+        for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
+        {
+#pragma unroll
+          for(int ii = 0; ii < ILP; ii++)
+          {
+            r_x[ii] = 0;
+            r_y[ii] = 0;
+            int i = i_start + threadIdx.x + ii*blockDim.x;
+            if(i < n && i < chunk_size)
+            {
+              r_x[ii] = x[i];
+              r_y[ii] = y[i];
+            }
+          }
+#pragma unroll
+          for(int ii = 0; ii < ILP; ii++)
+          {
+            r_out[ii] = a*static_cast<float>(r_x[ii]) + b*static_cast<float>(r_y[ii]);
+            if(arg_to_check == -1)
+              finite = finite && (isfinite(r_x[ii]) && isfinite(r_y[ii]));
+            if(arg_to_check == 0)
+              finite = finite && isfinite(r_x[ii]);
+            if(arg_to_check == 1)
+              finite = finite && isfinite(r_y[ii]);
+          }
+          // see note in multi_tensor_scale_kernel.cu
+#pragma unroll
+          for(int ii = 0; ii < ILP; ii++)
+          {
+            int i = i_start + threadIdx.x + ii*blockDim.x;
+            if(i < n && i < chunk_size)
+              out[i] = r_out[ii];
+          }
+        }
+      }
+      if(!finite)
+        *noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
     }
 };
...