"git@developer.sourcefind.cn:OpenDAS/ollama.git" did not exist on "2a2289fb6b7e01c862a89c2c97dd856f1407c2b5"
Commit 843cdbe0 authored by Michael Carilli

Merging in master

parents 724672d7 28097c99
@@ -10,6 +10,8 @@
#include "ATen/AccumulateType.h"
#include <THC/THCGeneral.h>
+#include "type_shim.h"
typedef enum{
ADAM_MODE_0 =0, // eps under square root
ADAM_MODE_1 =1 // eps outside square root
@@ -29,8 +31,8 @@ __global__ void adam_cuda_kernel(
const float step_size,
const size_t tsize,
adamMode_t mode,
-const float decay) {
+const float decay)
+{
//Assuming 2D grids and 2D blocks
const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
const int threadsPerBlock = blockDim.x * blockDim.y;
@@ -67,7 +69,9 @@ void fused_adam_cuda(
int step,
int mode,
int bias_correction,
-float decay) {
+float decay)
+{
+// using namespace at;
//Get tensor size
int tsize = p.numel();
@@ -91,6 +95,7 @@ void fused_adam_cuda(
//all other values should be fp32 for half gradients
AT_ASSERTM(p.type().scalarType() == at::ScalarType::Float, "expected parameter to be of float type");
//dispatch is done on the gradient type
+using namespace at; // prevents "toString is undefined" errors
AT_DISPATCH_FLOATING_TYPES_AND_HALF(g.type(), "adam_cuda_kernel", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
adam_cuda_kernel<accscalar_t, scalar_t><<<blocks,threadsPerBlock, 0, stream>>>(
@@ -109,6 +114,7 @@ void fused_adam_cuda(
decay);
}));
} else {
+using namespace at;
AT_DISPATCH_FLOATING_TYPES(g.type(), "adam_cuda_kernel", ([&] {
adam_cuda_kernel<scalar_t, scalar_t><<<blocks,threadsPerBlock, 0, stream>>>(
p.data<scalar_t>(),
......
@@ -5,7 +5,11 @@
namespace {
void compute_n1_n2(
at::Tensor input,
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
int& n1,
int& n2)
{
@@ -22,7 +26,11 @@ void compute_n1_n2(
}
void check_args(
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
at::Tensor gamma,
at::Tensor beta
)
@@ -33,7 +41,11 @@ void check_args(
void check_args(
at::Tensor input,
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
int& n1,
int& n2
)
@@ -69,7 +81,11 @@ void check_args(
void check_args(
at::Tensor input,
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
at::Tensor gamma,
at::Tensor beta,
int& n1,
@@ -88,7 +104,11 @@ void cuda_layer_norm(
at::Tensor* input,
int n1,
int n2,
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
at::Tensor* gamma,
at::Tensor* beta,
double epsilon);
@@ -99,7 +119,11 @@ void cuda_layer_norm(
std::vector<at::Tensor> layer_norm(
at::Tensor input,
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
double epsilon) {
CHECK_INPUT(input);
int n1,n2;
@@ -113,7 +137,11 @@ std::vector<at::Tensor> layer_norm(
}
std::vector<at::Tensor> layer_norm_affine(
at::Tensor input,
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
at::Tensor gamma,
at::Tensor beta,
double epsilon) {
@@ -137,7 +165,11 @@ void cuda_layer_norm_gradient(
at::Tensor* input,
int n1,
int n2,
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
at::Tensor* gamma,
at::Tensor* beta,
double epsilon,
@@ -151,7 +183,11 @@ at::Tensor layer_norm_gradient(
at::Tensor mean,
at::Tensor invvar,
at::Tensor input,
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
double epsilon) {
CHECK_INPUT(dout);
CHECK_INPUT(mean);
@@ -170,7 +206,11 @@ std::vector<at::Tensor> layer_norm_gradient_affine(
at::Tensor mean,
at::Tensor invvar,
at::Tensor input,
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
at::Tensor gamma,
at::Tensor beta,
double epsilon) {
......
@@ -6,6 +6,8 @@
#include <cuda.h>
#include <cuda_runtime.h>
+#include "type_shim.h"
template<typename U> __device__
void cuWelfordOnlineSum(
const U curr,
@@ -238,17 +240,20 @@ template<> double rsqrt(double v) {
namespace {
// This is the un-specialized struct. Note that we prevent instantiation of this
// struct by putting an undefined symbol in the function body so it won't compile.
+// template <typename T>
+// struct SharedMemory
+// {
+// // Ensure that we won't compile any un-specialized types
+// __device__ T *getPointer()
+// {
+// extern __device__ void error(void);
+// error();
+// return NULL;
+// }
+// };
+// https://github.com/NVIDIA/apex/issues/246
template <typename T>
-struct SharedMemory
-{
-// Ensure that we won't compile any un-specialized types
-__device__ T *getPointer()
-{
-extern __device__ void error(void);
-error();
-return NULL;
-}
-};
+struct SharedMemory;
template <>
struct SharedMemory <float>
@@ -670,11 +675,16 @@ void cuda_layer_norm(
at::Tensor* input,
int n1,
int n2,
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
at::Tensor* gamma,
at::Tensor* beta,
double epsilon)
{
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input->type(), "layer_norm_cuda_kernel", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
HostApplyLayerNorm(
@@ -764,7 +774,11 @@ void cuda_layer_norm_gradient(
at::Tensor* input,
int n1,
int n2,
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
at::Tensor* gamma,
at::Tensor* beta,
double epsilon,
@@ -772,6 +786,7 @@ void cuda_layer_norm_gradient(
at::Tensor* grad_gamma,
at::Tensor* grad_beta)
{
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input->type(), "cuComputeGradInput", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
HostLayerNormGradient(
......
-#include <torch/extension.h>
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <assert.h>
-#include <cuda_runtime.h>
// #include <iostream>
@@ -16,7 +14,7 @@
constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
-template<int n> struct TensorList
+template<int n> struct TensorListMetadata
{
void* addresses[n][depth_to_max_tensors[n-1]];
int sizes[depth_to_max_tensors[n-1]];
@@ -64,7 +62,7 @@ void multi_tensor_apply(
int ntensors = tensor_lists[0].size();
-TensorList<depth> tl;
+TensorListMetadata<depth> tl;
auto stream = at::cuda::getCurrentCUDAStream();
......
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>
#include <assert.h>
#include "type_shim.h"
#include "multi_tensor_apply.cuh"
#define BLOCK_SIZE 512
#define ILP 4
template<typename x_t, typename y_t, typename out_t>
struct AxpbyFunctor
{
__device__ __forceinline__ void operator()(
int chunk_size,
volatile int* noop_gmem,
TensorListMetadata<3>& tl,
float a,
float b,
int arg_to_check)
{
// I'd like this kernel to propagate infs/nans.
// if(*noop_gmem == 1)
// return;
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
x_t* x = (x_t*)tl.addresses[0][tensor_loc];
x += chunk_idx*chunk_size;
y_t* y = (y_t*)tl.addresses[1][tensor_loc];
y += chunk_idx*chunk_size;
out_t* out = (out_t*)tl.addresses[2][tensor_loc];
out += chunk_idx*chunk_size;
n -= chunk_idx*chunk_size;
// Non-divergent exit condition for __syncthreads, not necessary here
float xs[ILP];
float ys[ILP];
for(int i_start = 0;
i_start < n && i_start < chunk_size;
i_start += blockDim.x*ILP)
{
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
xs[ii] = 0;
ys[ii] = 0;
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
xs[ii] = static_cast<float>(x[i]);
ys[ii] = static_cast<float>(y[i]);
}
}
// see note in multi_tensor_scale_kernel.cu
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
out[i] = static_cast<out_t>(a*xs[ii] + b*ys[ii]);
bool finite = true;
if(arg_to_check == -1)
finite = (isfinite(xs[ii]) && isfinite(ys[ii]));
if(arg_to_check == 0)
finite = isfinite(xs[ii]);
if(arg_to_check == 1)
finite = isfinite(ys[ii]);
if(!finite)
*noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
}
}
}
}
};
void multi_tensor_axpby_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
float a,
float b,
int arg_to_check)
{
using namespace at;
// The output (downscaled) type is always float.
// If build times suffer, think about where to put this dispatch,
// and what logic should be moved out of multi_tensor_apply.
DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "multi_tensor_axpby_cuda",
DISPATCH_FLOAT_AND_HALF(tensor_lists[1][0].scalar_type(), 1, "multi_tensor_axpby_cuda",
DISPATCH_FLOAT_AND_HALF(tensor_lists[2][0].scalar_type(), 2, "multi_tensor_axpby_cuda",
multi_tensor_apply<3>(
BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
AxpbyFunctor<scalar_t_0, scalar_t_1, scalar_t_2>(),
a,
b,
arg_to_check); )))
AT_CUDA_CHECK(cudaGetLastError());
// AT_CUDA_CHECK(cudaDeviceSynchronize());
}
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>
#include <assert.h>
#include "type_shim.h"
#include "multi_tensor_apply.cuh"
#define BLOCK_SIZE 512
#define ILP 4
template<typename x_t>
struct L2NormFunctor
{
__device__ __forceinline__ void operator()(
int chunk_size,
volatile int* noop_gmem,
TensorListMetadata<1>& tl,
float* output)
{
// I'd like this kernel to propagate infs/nans.
// if(*noop_gmem == 1)
// return;
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
x_t* x = (x_t*)tl.addresses[0][tensor_loc];
x += chunk_idx*chunk_size;
n -= chunk_idx*chunk_size;
__shared__ float vals[512];
// Non-divergent exit condition for __syncthreads, not necessary here
float val = 0;
for(int i = threadIdx.x; i < n && i < chunk_size; i += blockDim.x)
{
float next = static_cast<float>(x[i]);
val += next*next;
}
float final = reduce_block_into_lanes(vals, val);
if(threadIdx.x == 0)
{
if(!isfinite(final))
*noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
output[blockIdx.x] += final;
}
}
};
__global__ void cleanup(float* x, float* ret)
{
__shared__ float vals[512];
float val = 0;
if(threadIdx.x < 320)
val = x[threadIdx.x];
float final = reduce_block_into_lanes(vals, val);
if(threadIdx.x == 0)
*ret = sqrt(final);
}
at::Tensor multi_tensor_l2norm_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists)
{
auto output = at::zeros({320}, tensor_lists[0][0].options().dtype(at::ScalarType::Float));
DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "multi_tensor_l2norm_cuda",
multi_tensor_apply<1>(
BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
L2NormFunctor<scalar_t_0>(),
output.data<float>());)
AT_CUDA_CHECK(cudaGetLastError());
// AT_CUDA_CHECK(cudaDeviceSynchronize());
// This involves one more small kernel launch, but it will be negligible end to end.
// I could get rid of these by hacking the functor + multi tensor harness with persistence
// logic, but keeping it simple for now
auto ret = at::empty({1}, output.options());
auto stream = at::cuda::getCurrentCUDAStream();
cleanup<<<1, 512, 0, stream>>>(output.data<float>(), ret.data<float>());
return ret;
}
@@ -2,10 +2,15 @@
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
-#include "multi_tensor_apply.cuh"
+// Another possibility:
+// #include <torch/all.h>
#include <assert.h>
-#include <cuda_runtime.h>
+// Stringstream is a big hammer, but I want to rely on operator<< for dtype.
+#include <sstream>
+#include "type_shim.h"
+#include "multi_tensor_apply.cuh"
#define BLOCK_SIZE 512
#define ILP 4
@@ -16,16 +21,12 @@ struct ScaleFunctor
__device__ __forceinline__ void operator()(
int chunk_size,
volatile int* noop_gmem,
-TensorList<2>& tl,
+TensorListMetadata<2>& tl,
float scale)
{
-__shared__ int noop_smem;
-if(threadIdx.x == 0)
-noop_smem = *noop_gmem;
-__syncthreads();
-if(noop_smem == 1)
-return;
+// I'd like this kernel to propagate infs/nans.
+// if(*noop_gmem == 1)
+// return;
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int chunk_idx = tl.block_to_chunk[blockIdx.x];
@@ -39,7 +40,7 @@ struct ScaleFunctor
n -= chunk_idx*chunk_size;
-// Non-divergent exit condition for the __syncthreads
+// Non-divergent exit condition for __syncthreads, not necessary here
float incoming_vals[ILP];
for(int i_start = 0;
i_start < n && i_start < chunk_size;
@@ -64,20 +65,12 @@ struct ScaleFunctor
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
-if(isfinite(incoming_vals[ii]))
+{
out[i] = static_cast<out_t>(incoming_vals[ii]*scale);
-else
+if(!isfinite(incoming_vals[ii]))
*noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
+}
}
-// *noop_gmem = 1 is NOT guaranteed to be seen immediately by thread 0. I wonder if
-// we can rig block-wide and grid-wide short-circuiting with only one syncthreads.
-// It's possible we can just lean on the cache (no smem or syncs) and still be fast.
-if(threadIdx.x == 0)
-noop_smem = *noop_gmem;
-__syncthreads();
-if(noop_smem == 1)
-break;
}
}
};
@@ -88,15 +81,17 @@ void multi_tensor_scale_cuda(
std::vector<std::vector<at::Tensor>> tensor_lists,
float scale)
{
+using namespace at;
// The output (downscaled) type is always float.
// If build times suffer, think about where to put this dispatch,
// and what logic should be moved out of multi_tensor_apply.
AT_DISPATCH_FLOATING_TYPES_AND_HALF(tensor_lists[0][0].type(),
"multi_tensor_scale_cuda",
[&]
{
// using accscalar_t = acc_type<scalar_t, true>;
-switch(tensor_lists[1][0].type().scalarType())
+switch(tensor_lists[1][0].scalar_type())
{
case at::ScalarType::Half:
multi_tensor_apply<2>(
@@ -117,8 +112,10 @@ void multi_tensor_scale_cuda(
scale);
break;
default:
-AT_ERROR("multi_tensor_scale_cuda not implemented for output type = ",
-tensor_lists[1][0].type().toString());
+std::stringstream ss;
+ss << "multi_tensor_scale_cuda not implemented for output type = "
+<< tensor_lists[1][0].dtype();
+AT_ERROR(ss.str().c_str());
}
});
......
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <assert.h>
#include <cuda_runtime.h>
#define BLOCK_SIZE 256
#define NBLOCKS 160*4
#define ILP 4
// It makes sense to lock the output type to fp32 because the downscaled
// grads should be master grads (and in the case of Amp, the params and their
// gradients should always be fp32).
template<typename in_t>
__global__ void scale_reduce_overflow(in_t* in,
float* out,
int n,
float scale,
volatile int* overflow_global)
{
__shared__ int overflow;
float incoming_vals[4];
// Non-divergent exit condition for the __syncthreads
for(int chunk_start = blockIdx.x*blockDim.x*ILP;
chunk_start < n;
chunk_start += gridDim.x*blockDim.x*ILP)
{
if(threadIdx.x == 0)
overflow = *overflow_global;
__syncthreads();
if(overflow == 1)
break;
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
incoming_vals[ii] = 0;
int i = chunk_start + threadIdx.x + ii*blockDim.x;
if(i < n)
incoming_vals[ii] = static_cast<float>(in[i]);
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = chunk_start + threadIdx.x + ii*blockDim.x;
if(i < n)
if(isfinite(incoming_vals[ii]))
out[i] = incoming_vals[ii]*scale;
else
*overflow_global = 1; // Blindly fire off a write. These will race but that's ok.
} // This is NOT guaranteed to be seen immediately by thread 0 on the next iteration.
} // I wonder if there's a way we can rig the short-circuiting with only one syncthreads.
} // It's possible we can just lean on the cache (no smem or syncs) and still be fast.
void scale_check_overflow_cuda
(const at::Tensor& grads,
float scale,
const at::Tensor& overflow_buf,
const at::Tensor& downscaled_grads)
{
using namespace at;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int n = grads.numel();
// Lock the output (downscaled) type to float.
AT_DISPATCH_FLOATING_TYPES_AND_HALF(grads.type(),
"scale_check_overflow_cuda",
[&]
{
// using accscalar_t = acc_type<scalar_t, true>;
scale_reduce_overflow<<<NBLOCKS, BLOCK_SIZE, 0, stream>>>
(grads.data<scalar_t>(),
downscaled_grads.data<float>(),
n,
scale,
overflow_buf.data<int>());
});
AT_CUDA_CHECK(cudaGetLastError());
}
#include <ATen/ATen.h>
// Forward/backward compatibility hack around
// https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288
// pending more future-proof guidance from upstream.
struct TypeShim
{
const at::Type& payload;
TypeShim(const at::Type& type) : payload(type) {}
// Enable trivial conversion to a const at::Type& for pre-3aeb78
operator const at::Type&(){ return payload; };
// Enable dispatch switch statements to take *this directly for post-3aeb78
operator at::ScalarType(){ return payload.scalarType(); };
};
#define DISPATCH_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) \
switch(TYPE) \
{ \
case at::ScalarType::Float: \
{ \
using scalar_t_##LEVEL = float; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Half: \
{ \
using scalar_t_##LEVEL = at::Half; \
__VA_ARGS__; \
break; \
} \
default: \
AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
}
template<typename T>
__device__ __forceinline__ T reduce_block_into_lanes
(T *x,
T val,
int lanes=1,
bool share_result=false) // lanes is intended to be <= 32.
{
int tid = threadIdx.x + threadIdx.y*blockDim.x;
int blockSize = blockDim.x*blockDim.y; // blockSize is intended to be a multiple of 32.
if(blockSize >= 64)
{
x[tid] = val;
__syncthreads();
}
#pragma unroll
for(int i = (blockSize >> 1); i >= 64; i >>= 1)
{
if(tid < i)
x[tid] = x[tid] + x[tid+i];
__syncthreads();
}
T final;
if(tid < 32)
{
if(blockSize >= 64)
final = x[tid] + x[tid+32];
else
final = val;
// __SYNCWARP();
#pragma unroll
for(int i = 16; i >= lanes; i >>= 1)
final = final + __shfl_down_sync(0xffffffff, final, i);
}
if(share_result)
{
if(tid < lanes)
x[tid] = final; // EpilogueOp
// Make sure the smem result is visible to all warps.
__syncthreads();
}
return final;
}
@@ -3,13 +3,13 @@
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
+#include "type_shim.h"
__device__ __forceinline__ int lastpow2(int n)
{
@@ -844,16 +844,19 @@ std::vector<at::Tensor> welford_mean_var_CUDA(const at::Tensor input) {
auto stream = at::cuda::getCurrentCUDAStream();
-AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "welford_mean_var_kernel", ([&] {
-using accscalar_t = at::acc_type<scalar_t, true>;
-welford_kernel<scalar_t, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
-input.data<scalar_t>(),
-out_mean.data<accscalar_t>(),
-out_var_biased.data<accscalar_t>(),
-batch_size,
-feature_size,
-space_size);
-}));
+{
+using namespace at;
+AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "welford_mean_var_kernel", ([&] {
+using accscalar_t = at::acc_type<scalar_t, true>;
+welford_kernel<scalar_t, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
+input.data<scalar_t>(),
+out_mean.data<accscalar_t>(),
+out_var_biased.data<accscalar_t>(),
+batch_size,
+feature_size,
+space_size);
+}));
+}
return {out_mean, out_var_biased};
}
@@ -881,6 +884,7 @@ at::Tensor batchnorm_forward_CUDA(
if (input.type().scalarType() == at::ScalarType::Half
&& weight.has_value() &&
weight.value().type().scalarType() == at::ScalarType::Float) {
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_forward", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
batchnorm_forward_kernel<scalar_t, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
@@ -898,6 +902,7 @@ at::Tensor batchnorm_forward_CUDA(
AT_CHECK(input.type().scalarType() == weight.value().type().scalarType(),
"input.type().scalarType() is not supported with weight.type().scalarType()");
}
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_forward", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
batchnorm_forward_kernel<scalar_t, accscalar_t, scalar_t><<<grid, block, 0, stream>>>(
@@ -950,6 +955,7 @@ std::vector<at::Tensor> reduce_bn_CUDA(
if (input.type().scalarType() == at::ScalarType::Half
&& weight.has_value() &&
weight.value().type().scalarType() == at::ScalarType::Float) {
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_backward_reduce", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
reduce_bn_kernel<scalar_t, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
@@ -970,6 +976,7 @@ std::vector<at::Tensor> reduce_bn_CUDA(
AT_CHECK(input.type().scalarType() == weight.value().type().scalarType(),
"input.type().scalarType() is not supported with weight.type().scalarType()");
}
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_backward_reduce", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
reduce_bn_kernel<scalar_t, accscalar_t, scalar_t><<<grid, block, 0, stream>>>(
@@ -1017,6 +1024,7 @@ at::Tensor batchnorm_backward_CUDA(
if (input.type().scalarType() == at::ScalarType::Half
&& weight.has_value() &&
weight.value().type().scalarType() == at::ScalarType::Float) {
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_backward", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
batchnorm_backward_kernel<scalar_t, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
@@ -1036,6 +1044,7 @@ at::Tensor batchnorm_backward_CUDA(
AT_CHECK(input.type().scalarType() == weight.value().type().scalarType(),
"input.type().scalarType() is not supported with weight.type().scalarType()");
}
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_backward", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
batchnorm_backward_kernel<scalar_t, accscalar_t, scalar_t><<<grid, block, 0, stream>>>(
@@ -1072,18 +1081,21 @@ std::vector<at::Tensor> welford_parallel_CUDA(const at::Tensor mean_feature_nodes,
auto stream = at::cuda::getCurrentCUDAStream();
-AT_DISPATCH_FLOATING_TYPES_AND_HALF(mean_feature_nodes.type(), "welford_parallel_kernel", ([&] {
-welford_kernel_parallel<scalar_t><<<grid, block, 0, stream>>>(
-mean_feature_nodes.data<scalar_t>(),
-var_biased.data<scalar_t>(),
-out_mean.data<scalar_t>(),
-out_var.data<scalar_t>(),
-inv_std.data<scalar_t>(),
-world_size,
-feature_size,
-eps,
-numel);
-}));
+{
+using namespace at;
+AT_DISPATCH_FLOATING_TYPES_AND_HALF(mean_feature_nodes.type(), "welford_parallel_kernel", ([&] {
+welford_kernel_parallel<scalar_t><<<grid, block, 0, stream>>>(
+mean_feature_nodes.data<scalar_t>(),
+var_biased.data<scalar_t>(),
+out_mean.data<scalar_t>(),
+out_var.data<scalar_t>(),
+inv_std.data<scalar_t>(),
+world_size,
+feature_size,
+eps,
+numel);
+}));
+}
return {out_mean, out_var, inv_std};
}
@@ -1111,21 +1123,23 @@ std::vector<at::Tensor> welford_mean_var_c_last_CUDA(const at::Tensor input) {
auto stream = at::cuda::getCurrentCUDAStream();
-AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "welford_mean_var_c_last", ([&] {
-using accscalar_t = at::acc_type<scalar_t, true>;
-accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.data<accscalar_t>() : nullptr;
-int* semaphores_ptr = grid.y > 1 ? semaphores.data<int>() : nullptr;
-welford_kernel_c_last<scalar_t, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
-<<<grid, block, 0, stream>>>(
-input.data<scalar_t>(),
-out_mean.data<accscalar_t>(),
-out_var_biased.data<accscalar_t>(),
-staging_data_ptr,
-semaphores_ptr,
-reduction_size,
-stride);
-}));
+{
+using namespace at;
+AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "welford_mean_var_c_last", ([&] {
+using accscalar_t = at::acc_type<scalar_t, true>;
+accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.data<accscalar_t>() : nullptr;
+int* semaphores_ptr = grid.y > 1 ? semaphores.data<int>() : nullptr;
+welford_kernel_c_last<scalar_t, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
+<<<grid, block, 0, stream>>>(
+input.data<scalar_t>(),
+out_mean.data<accscalar_t>(),
+out_var_biased.data<accscalar_t>(),
+staging_data_ptr,
+semaphores_ptr,
+reduction_size,
+stride);
+}));
+}
return {out_mean, out_var_biased};
}
@@ -1149,6 +1163,7 @@ at::Tensor batchnorm_forward_c_last_CUDA(
if (input.type().scalarType() == at::ScalarType::Half
&& weight.has_value() && weight.value().type().scalarType() == at::ScalarType::Float) {
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_forward", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
batchnorm_forward_c_last_kernel<scalar_t, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
@@ -1167,6 +1182,7 @@ at::Tensor batchnorm_forward_c_last_CUDA(
AT_CHECK(input.type().scalarType() == weight.value().type().scalarType(),
"input.type().scalarType() is not supported with weight.type().scalarType()");
}
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_forward", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
batchnorm_forward_c_last_kernel<scalar_t, accscalar_t, scalar_t, ELEMENTS_PER_ITER>
@@ -1222,6 +1238,7 @@ std::vector<at::Tensor> reduce_bn_c_last_CUDA(
if (input.type().scalarType() == at::ScalarType::Half
&& weight.has_value()
&& weight.value().type().scalarType() == at::ScalarType::Float) {
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_backward_reduce", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.data<accscalar_t>() : nullptr;
@@ -1246,6 +1263,7 @@ std::vector<at::Tensor> reduce_bn_c_last_CUDA(
AT_CHECK(input.type().scalarType() == weight.value().type().scalarType(),
"input.type().scalarType() is not supported with weight.type().scalarType()");
}
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_backward_reduce", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.data<accscalar_t>() : nullptr;
@@ -1291,6 +1309,7 @@ at::Tensor batchnorm_backward_c_last_CUDA(
if (input.type().scalarType() == at::ScalarType::Half
&& weight.has_value() && weight.value().type().scalarType() == at::ScalarType::Float) {
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_forward", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
batchnorm_backward_c_last_kernel<scalar_t, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
@@ -1311,6 +1330,7 @@ at::Tensor batchnorm_backward_c_last_CUDA(
AT_CHECK(input.type().scalarType() == weight.value().type().scalarType(),
"input.type().scalarType() is not supported with weight.type().scalarType()");
}
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_forward", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
batchnorm_backward_c_last_kernel<scalar_t, accscalar_t, scalar_t, ELEMENTS_PER_ITER>
......
@@ -15,31 +15,43 @@ is under construction.
Gradient clipping
-----------------
-If Amp uses master params distinct from the model params,
-then the params ``step()``\ ed by the optimizer are the master params,
-and it is the master gradients (rather than the model gradients) that must be clipped.
-If Amp is not using master params distinct from the model params, then the optimizer
-directly steps the model params, and the model grads must be clipped.
-In both cases, correct practice is to clip the gradients of the params that are about to be stepped **by the optimizer** (which may be distinct from ``model.parameters()``).
-Also, if Amp uses loss scaling, gradients must be clipped after they have been unscaled.
-The following pattern accounts for all possibilities, and should be correct for
-any ``opt_level``::
+Amp calls the params owned directly by the optimizer's ``param_groups`` the "master params."
+These master params may be fully or partially distinct from ``model.parameters()``.
+For example, with `opt_level="O2"`_, ``amp.initialize`` casts most model params to FP16,
+creates an FP32 master param outside the model for each newly-FP16 model param,
+and updates the optimizer's ``param_groups`` to point to these FP32 params.
+The master params owned by the optimizer's ``param_groups`` may also fully coincide with the
+model params, which is typically true for ``opt_level``\s ``O0``, ``O1``, and ``O3``.
+In all cases, correct practice is to clip the gradients of the params that are guaranteed to be
+owned **by the optimizer's** ``param_groups``, instead of those retrieved via ``model.parameters()``.
+Also, if Amp uses loss scaling, gradients must be clipped after they have been unscaled
+(which occurs during exit from the ``amp.scale_loss`` context manager).
+The following pattern should be correct for any ``opt_level``::
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
# Gradients are unscaled during context manager exit.
-# Now it's safe to clip:
+# Now it's safe to clip. Replace
+# torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
+# with
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm)
# or
torch.nn.utils.clip_grad_value_(amp.master_params(optimizer), max_)
Note the use of the utility function ``amp.master_params(optimizer)``,
which returns a generator-expression that iterates over the
-params that the optimizer steps (master params if enabled, otherwise model params).
+params in the optimizer's ``param_groups``.
+Also note that ``clip_grad_norm_(amp.master_params(optimizer), max_norm)`` is invoked
+*instead of*, not *in addition to*, ``clip_grad_norm_(model.parameters(), max_norm)``.
+.. _`opt_level="O2"`:
+https://nvidia.github.io/apex/amp.html#o2-fast-mixed-precision
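As a quick illustration of why the distinction matters, a minimal sketch (assuming an ``O2`` run; the dtype comments state what the explanation above implies, not output copied from a real run)::

    model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
    # Params owned by the optimizer's param_groups are the FP32 master copies:
    print(optimizer.param_groups[0]['params'][0].dtype)  # expect torch.float32
    # ...while most model params have been cast to FP16:
    print(next(model.parameters()).dtype)                # expect torch.float16
    # Hence clipping should target amp.master_params(optimizer), not model.parameters().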
Custom/user-defined autograd functions
--------------------------------------
@@ -56,8 +68,11 @@ Forcing particular layers/functions to a desired type
I'm still working on a generalizable exposure for this that won't require user-side code divergence
across different ``opt-level``\ s.
-Multiple models/optimizers
---------------------------
+Multiple models/optimizers/losses
+---------------------------------
+Initialization with multiple models/optimizers
+**********************************************
``amp.initialize``'s optimizer argument may be a single optimizer or a list of optimizers,
as long as the output you accept has the same type.
@@ -65,35 +80,88 @@ Similarly, the ``model`` argument may be a single model or a list of models, as
output matches. The following calls are all legal::
model, optim = amp.initialize(model, optim,...)
-model, [optim1, optim2] = amp.initialize(model, [optim1, optim2],...)
-[model1, model2], optim = amp.initialize([model1, model2], optim,...)
-[model1, model2], [optim1, optim2] = amp.initialize([model1, model2], [optim1, optim2],...)
+model, [optim0, optim1] = amp.initialize(model, [optim0, optim1],...)
+[model0, model1], optim = amp.initialize([model0, model1], optim,...)
+[model0, model1], [optim0, optim1] = amp.initialize([model0, model1], [optim0, optim1],...)
-Whenever you invoke a backward pass, the optimizer you should pass to ``amp.scaled_loss`` is whatever
-optimizer owns the parameters for which this particular backward pass is creating gradients.
-Multiple backward passes per iteration
---------------------------------------
-If you want to accumulate gradients from multiple losses for the params owned by a given optimizer,
-you must invoke ``with amp.scale_loss(..., delay_unscale=True)`` for all backward passes except
-the last::
+Backward passes with multiple optimizers
+****************************************
+Whenever you invoke a backward pass, the ``amp.scale_loss`` context manager must receive
+**all the optimizers that own any params for which the current backward pass is creating gradients.**
+This is true even if each optimizer owns only some, but not all, of the params that are about to
+receive gradients.
+If, for a given backward pass, there's only one optimizer whose params are about to receive gradients,
+you may pass that optimizer directly to ``amp.scale_loss``. Otherwise, you must pass the
+list of optimizers whose params are about to receive gradients::
-# delay_unscale=True for the first two losses
-with amp.scale_loss(loss1, optimizer, delay_unscale=True) as scaled_loss:
+# loss0 accumulates gradients only into params owned by optim0:
+with amp.scale_loss(loss0, optim0) as scaled_loss:
scaled_loss.backward()
-with amp.scale_loss(loss2, optimizer, delay_unscale=True) as scaled_loss:
+# loss1 accumulates gradients only into params owned by optim1:
+with amp.scale_loss(loss1, optim1) as scaled_loss:
scaled_loss.backward()
-# Don't delay_unscale for the final loss
-with amp.scale_loss(loss3, optimizer) as scaled_loss:
+# loss2 accumulates gradients into some params owned by optim0
+# and some params owned by optim1
+with amp.scale_loss(loss2, [optim0, optim1]) as scaled_loss:
scaled_loss.backward()
-optimizer.step()
+Optionally have Amp use a different loss scaler per-loss
+********************************************************
+By default, Amp maintains a single global loss scaler that will be used for all backward passes
+(all invocations of ``with amp.scale_loss(...)``). No additional arguments to ``amp.initialize``
+or ``amp.scale_loss`` are required to use the global loss scaler. The code snippets above with
+multiple optimizers/backward passes use the single global loss scaler under the hood,
+and they should "just work."
+However, you can optionally tell Amp to maintain a loss scaler per-loss, which gives Amp increased
+numerical flexibility. This is accomplished by supplying the ``num_losses`` argument to
+``amp.initialize`` (which tells Amp how many backward passes you plan to invoke, and therefore
+how many loss scalers Amp should create), then supplying the ``loss_id`` argument to each of your
+backward passes (which tells Amp the loss scaler to use for this particular backward pass)::
+model, [optim0, optim1] = amp.initialize(model, [optim0, optim1], ..., num_losses=3)
+with amp.scale_loss(loss0, optim0, loss_id=0) as scaled_loss:
+scaled_loss.backward()
+with amp.scale_loss(loss1, optim1, loss_id=1) as scaled_loss:
+scaled_loss.backward()
+with amp.scale_loss(loss2, [optim0, optim1], loss_id=2) as scaled_loss:
+scaled_loss.backward()
+``num_losses`` and ``loss_id``\ s should be specified purely based on the set of
+losses/backward passes. The use of multiple optimizers, or association of single or
+multiple optimizers with each backward pass, is unrelated.
Gradient accumulation across iterations
---------------------------------------
-Pass ``delay_unscale=True`` to ``amp.scale_loss`` until you're ready to ``step()``::
+The following should "just work," and properly accommodate multiple models/optimizers/losses, as well as
+gradient clipping via the `instructions above`_::
+if iter%iters_to_accumulate == 0:
+# Every iters_to_accumulate iterations, unscale and step
+with amp.scale_loss(loss, optimizer) as scaled_loss:
+scaled_loss.backward()
+# Gradient clipping if desired:
+# torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm)
+optimizer.step()
+optimizer.zero_grad()
+else:
+# Otherwise, accumulate gradients, don't unscale or step.
+with amp.scale_loss(loss, optimizer) as scaled_loss:
+scaled_loss.backward()
+As a minor performance optimization, you can pass ``delay_unscale=True``
+to ``amp.scale_loss`` until you're ready to ``step()``. You should only attempt ``delay_unscale=True``
+if you're sure you know what you're doing, because the interaction with gradient clipping and
+multiple models/optimizers/losses can become tricky.::
if iter%iters_to_accumulate == 0:
# Every iters_to_accumulate iterations, unscale and step
@@ -102,6 +170,48 @@ Pass ``delay_unscale=True`` to ``amp.scale_loss`` until you're ready to ``step()
optimizer.step()
optimizer.zero_grad()
else:
-# Otherwise, just accumulate gradients, don't unscale or step.
+# Otherwise, accumulate gradients, don't unscale or step.
with amp.scale_loss(loss, optimizer, delay_unscale=True) as scaled_loss:
scaled_loss.backward()
+.. _`instructions above`:
+https://nvidia.github.io/apex/advanced.html#gradient-clipping
+Custom data batch types
+-----------------------
+The intention of Amp is that you never need to cast your input data manually, regardless of
+``opt_level``. Amp accomplishes this by patching any models' ``forward`` methods to cast
+incoming data appropriately for the ``opt_level``. But to cast incoming data,
+Amp needs to know how. The patched ``forward`` will recognize and cast floating-point Tensors
+(non-floating-point Tensors like IntTensors are not touched) and
+Python containers of floating-point Tensors. However, if you wrap your Tensors in a custom class,
+the casting logic doesn't know how to drill
+through the tough custom shell to access and cast the juicy Tensor meat within. You need to tell
+Amp how to cast your custom batch class, by assigning it a ``to`` method that accepts a ``torch.dtype``
+(e.g., ``torch.float16`` or ``torch.float32``) and returns an instance of the custom batch cast to
+``dtype``. The patched ``forward`` checks for the presence of your ``to`` method, and will
+invoke it with the correct type for the ``opt_level``.
+Example::
+class CustomData(object):
+def __init__(self):
+self.tensor = torch.cuda.FloatTensor([1,2,3])
+def to(self, dtype):
+self.tensor = self.tensor.to(dtype)
+return self
+.. warning::
+Amp also forwards numpy ndarrays without casting them. If you send input data as a raw, unwrapped
+ndarray, then later use it to create a Tensor within your ``model.forward``, this Tensor's type will
+not depend on the ``opt_level``, and may or may not be correct. Users are encouraged to pass
+castable data inputs (Tensors, collections of Tensors, or custom classes with a ``to`` method)
+wherever possible.
+.. note::
+Amp does not call ``.cuda()`` on any Tensors for you. Amp assumes that your original script
+is already set up to move Tensors from the host to the device as needed.
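To illustrate both notes, a minimal sketch (``data_iter`` is a hypothetical iterator yielding raw numpy ndarrays)::

    np_batch = next(data_iter)                  # hypothetical: a raw numpy ndarray from your loader
    batch = torch.from_numpy(np_batch).cuda()   # you move data to the device yourself...
    output = model(batch)                       # ...and the patched forward casts it per opt_level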
@@ -13,6 +13,13 @@ on the Github page.
GANs are a tricky case that many people have requested. A `comprehensive DCGAN example`_
is under construction.
+If you already implemented Amp based on the instructions below, but it isn't behaving as expected,
+please review `Advanced Amp Usage`_ to see if any topics match your use case. If that doesn't help,
+`file an issue`_.
+.. _`file an issue`:
+https://github.com/NVIDIA/apex/issues
``opt_level``\ s and Properties
-------------------------------
@@ -26,8 +33,8 @@ override the defaults established by the ``opt_level``.
Example::
-# Declare model and optimizer as usual
-model = torch.nn.Linear(D_in, D_out).cuda().half()
+# Declare model and optimizer as usual, with default (FP32) precision
+model = torch.nn.Linear(D_in, D_out).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
# Allow Amp to perform casts as required by the opt_level
@@ -55,6 +62,9 @@ In this way, there's no risk adhering to the Amp API, and a lot of potential per
.. _`comprehensive DCGAN example`:
https://github.com/NVIDIA/apex/tree/master/examples/dcgan
+.. _`Advanced Amp Usage`:
+https://nvidia.github.io/apex/advanced.html
Properties
**********
@@ -102,9 +112,8 @@ Your incoming model should be FP32 already, so this is likely a no-op.
|
|
-``O1``: Conservative Mixed Precision
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+``O1``: Mixed Precision (recommended for typical use)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Patch all Torch functions and Tensor methods to cast their inputs according to a whitelist-blacklist
model. Whitelist ops (for example, Tensor Core-friendly ops like GEMMs and convolutions) are performed
in FP16. Blacklist ops that benefit from FP32 precision (for example, softmax)
@@ -119,11 +128,14 @@ are performed in FP32. ``O1`` also uses dynamic loss scaling, unless overridden
|
|
-``O2``: Fast Mixed Precision
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+``O2``: "Almost FP16" Mixed Precision
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
``O2`` casts the model weights to FP16,
patches the model's ``forward`` method to cast input
data to FP16, keeps batchnorms in FP32, maintains FP32 master weights,
+updates the optimizer's ``param_groups`` so that the ``optimizer.step()``
+acts directly on the FP32 weights (followed by FP32 master weight->FP16 model weight
+copies if necessary),
and implements dynamic loss scaling (unless overridden).
Unlike ``O1``, ``O2`` does not patch Torch functions or Tensor methods.
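For instance, a minimal sketch (the explicit property override on the second line is optional and shown only for illustration)::

    model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
    # or, overriding one of the properties established by O2:
    model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=128.0)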
@@ -170,7 +182,7 @@ Advanced use cases
The unified Amp API supports gradient accumulation across iterations,
multiple backward passes per iteration, multiple models/optimizers,
-and custom/user-defined autograd functions. Gradient clipping and GANs also
+custom/user-defined autograd functions, and custom data batch classes. Gradient clipping and GANs also
require special treatment, but this treatment does not need to change
for different ``opt_level``\ s. Further details can be found here:
......
@@ -68,7 +68,6 @@ parser.add_argument("--local_rank", default=0, type=int)
parser.add_argument('--sync_bn', action='store_true',
help='enabling apex sync BN.')
-parser.add_argument('--has-ext', action='store_true')
parser.add_argument('--opt-level', type=str)
parser.add_argument('--keep-batchnorm-fp32', type=str, default=None)
parser.add_argument('--loss-scale', type=str, default=None)
@@ -118,7 +117,7 @@ def main():
args.world_size = 1
if args.distributed:
-args.gpu = args.local_rank % torch.cuda.device_count()
+args.gpu = args.local_rank
torch.cuda.set_device(args.gpu)
torch.distributed.init_process_group(backend='nccl',
init_method='env://')
@@ -334,10 +333,8 @@ def train(train_loader, model, criterion, optimizer, epoch):
optimizer.step()
if args.prof: torch.cuda.nvtx.range_pop()
-input, target = prefetcher.next()
if i%args.print_freq == 0:
-# Every print_freq iterations, check the loss accuracy and speed.
+# Every print_freq iterations, check the loss, accuracy, and speed.
# For best performance, it doesn't make sense to print these metrics every
# iteration, since they incur an allreduce and some host<->device syncs.
@@ -374,6 +371,8 @@ def train(train_loader, model, criterion, optimizer, epoch):
batch_time=batch_time,
loss=losses, top1=top1, top5=top5))
+input, target = prefetcher.next()
def validate(val_loader, model, criterion):
batch_time = AverageMeter()
......
**distributed_data_parallel.py** and **run.sh** show an example using Amp with
[apex.parallel.DistributedDataParallel](https://nvidia.github.io/apex/parallel.html) or
[torch.nn.parallel.DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#distributeddataparallel)
and the PyTorch multiprocess launcher script,
[torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility).
The use of `Amp` with DistributedDataParallel does not need to change from ordinary
single-process use. The only gotcha is that wrapping your model with `DistributedDataParallel` must
come after the call to `amp.initialize`. Test via
```bash
bash run.sh
```
**This is intended purely as an instructional example, not a performance showcase.**
import torch
import argparse
import os
from apex import amp
# FOR DISTRIBUTED: (can also use torch.nn.parallel.DistributedDataParallel instead)
from apex.parallel import DistributedDataParallel

parser = argparse.ArgumentParser()
# FOR DISTRIBUTED: Parse for the local_rank argument, which will be supplied
# automatically by torch.distributed.launch.
parser.add_argument("--local_rank", default=0, type=int)
args = parser.parse_args()

# FOR DISTRIBUTED: If we are running under torch.distributed.launch,
# the 'WORLD_SIZE' environment variable will also be set automatically.
args.distributed = False
if 'WORLD_SIZE' in os.environ:
    args.distributed = int(os.environ['WORLD_SIZE']) > 1

if args.distributed:
    # FOR DISTRIBUTED: Set the device according to local_rank.
    torch.cuda.set_device(args.local_rank)

    # FOR DISTRIBUTED: Initialize the backend. torch.distributed.launch will provide
    # environment variables, and requires that you use init_method=`env://`.
    torch.distributed.init_process_group(backend='nccl',
                                         init_method='env://')

torch.backends.cudnn.benchmark = True

N, D_in, D_out = 64, 1024, 16

# Each process receives its own batch of "fake input data" and "fake target data."
# The "training loop" in each process just uses this fake batch over and over.
# https://github.com/NVIDIA/apex/tree/master/examples/imagenet provides a more realistic
# example of distributed data sampling for both training and validation.
x = torch.randn(N, D_in, device='cuda')
y = torch.randn(N, D_out, device='cuda')

model = torch.nn.Linear(D_in, D_out).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

if args.distributed:
    # FOR DISTRIBUTED: After amp.initialize, wrap the model with
    # apex.parallel.DistributedDataParallel.
    model = DistributedDataParallel(model)
    # torch.nn.parallel.DistributedDataParallel is also fine, with some added args:
    # model = torch.nn.parallel.DistributedDataParallel(model,
    #                                                   device_ids=[args.local_rank],
    #                                                   output_device=args.local_rank)

loss_fn = torch.nn.MSELoss()

for t in range(500):
    optimizer.zero_grad()
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()

if args.local_rank == 0:
    print("final loss = ", loss)
#!/bin/bash
python -m torch.distributed.launch --nproc_per_node=2 distributed_data_parallel.py
import torch import torch
from setuptools import setup, find_packages from setuptools import setup, find_packages
import subprocess
import sys import sys
if not torch.cuda.is_available(): if not torch.cuda.is_available():
print("Warning: Torch did not find available GPUs on this system.\n", print("\nWarning: Torch did not find available GPUs on this system.\n",
"If your intention is to cross-compile, this is not an error.") "If your intention is to cross-compile, this is not an error.\n")
print("torch.__version__ = ", torch.__version__) print("torch.__version__ = ", torch.__version__)
TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MAJOR = int(torch.__version__.split('.')[0])
...@@ -21,7 +22,7 @@ ext_modules = [] ...@@ -21,7 +22,7 @@ ext_modules = []
if "--cpp_ext" in sys.argv or "--cuda_ext" in sys.argv: if "--cpp_ext" in sys.argv or "--cuda_ext" in sys.argv:
if TORCH_MAJOR == 0: if TORCH_MAJOR == 0:
raise RuntimeError("--cpp_ext requires Pytorch 1.0 or later, " raise RuntimeError("--cpp_ext requires Pytorch 1.0 or later, "
"found torch.__version__ = {}".format(torch.__version)) "found torch.__version__ = {}".format(torch.__version__))
from torch.utils.cpp_extension import BuildExtension from torch.utils.cpp_extension import BuildExtension
cmdclass['build_ext'] = BuildExtension cmdclass['build_ext'] = BuildExtension
...@@ -32,6 +33,25 @@ if "--cpp_ext" in sys.argv: ...@@ -32,6 +33,25 @@ if "--cpp_ext" in sys.argv:
CppExtension('apex_C', CppExtension('apex_C',
['csrc/flatten_unflatten.cpp',])) ['csrc/flatten_unflatten.cpp',]))
def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
    output = raw_output.split()
    release_idx = output.index("release") + 1
    release = output[release_idx].split(".")
    bare_metal_major = release[0]
    bare_metal_minor = release[1][0]
    torch_binary_major = torch.version.cuda.split(".")[0]
    torch_binary_minor = torch.version.cuda.split(".")[1]

    print("\nCompiling cuda extensions with")
    print(raw_output + "from " + cuda_dir + "/bin\n")

    if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor):
        # TODO: make this a hard error?
        print("\nWarning: Cuda extensions are being compiled with a version of Cuda that does "
              "not match the version used to compile Pytorch binaries.\n")
        print("Pytorch binaries were compiled with Cuda {}\n".format(torch.version.cuda))
if "--cuda_ext" in sys.argv: if "--cuda_ext" in sys.argv:
from torch.utils.cpp_extension import CUDAExtension from torch.utils.cpp_extension import CUDAExtension
sys.argv.remove("--cuda_ext") sys.argv.remove("--cuda_ext")
...@@ -39,6 +59,8 @@ if "--cuda_ext" in sys.argv: ...@@ -39,6 +59,8 @@ if "--cuda_ext" in sys.argv:
if torch.utils.cpp_extension.CUDA_HOME is None: if torch.utils.cpp_extension.CUDA_HOME is None:
raise RuntimeError("--cuda_ext was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") raise RuntimeError("--cuda_ext was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
else: else:
check_cuda_torch_binary_vs_bare_metal(torch.utils.cpp_extension.CUDA_HOME)
# Set up macros for forward/backward compatibility hack around # Set up macros for forward/backward compatibility hack around
# https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e # https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
version_ge_1_1 = [] version_ge_1_1 = []
...@@ -48,19 +70,21 @@ if "--cuda_ext" in sys.argv: ...@@ -48,19 +70,21 @@ if "--cuda_ext" in sys.argv:
ext_modules.append( ext_modules.append(
CUDAExtension(name='amp_C', CUDAExtension(name='amp_C',
sources=['csrc/amp_C_frontend.cpp', sources=['csrc/amp_C_frontend.cpp',
'csrc/scale_check_overflow_kernel.cu', 'csrc/multi_tensor_sgd_kernel.cu',
'csrc/multi_tensor_scale_kernel.cu', 'csrc/multi_tensor_scale_kernel.cu',
'csrc/multi_tensor_sgd_kernel.cu'], 'csrc/multi_tensor_axpby_kernel.cu',
'csrc/multi_tensor_l2norm_kernel.cu'],
extra_compile_args={'cxx': ['-O3'], extra_compile_args={'cxx': ['-O3'],
'nvcc':['-lineinfo', 'nvcc':['-lineinfo',
'-O3', '-O3',
# '--resource-usage',
'--use_fast_math']})) '--use_fast_math']}))
ext_modules.append( ext_modules.append(
CUDAExtension(name='fused_adam_cuda', CUDAExtension(name='fused_adam_cuda',
sources=['apex/optimizers/csrc/fused_adam_cuda.cpp', sources=['csrc/fused_adam_cuda.cpp',
'apex/optimizers/csrc/fused_adam_cuda_kernel.cu'], 'csrc/fused_adam_cuda_kernel.cu'],
extra_compile_args={'cxx': ['-O3',], extra_compile_args={'cxx': ['-O3',],
'nvcc':['-O3', 'nvcc':['-O3',
'--use_fast_math']})) '--use_fast_math']}))
ext_modules.append( ext_modules.append(
CUDAExtension(name='syncbn', CUDAExtension(name='syncbn',
...@@ -68,11 +92,11 @@ if "--cuda_ext" in sys.argv: ...@@ -68,11 +92,11 @@ if "--cuda_ext" in sys.argv:
'csrc/welford.cu'])) 'csrc/welford.cu']))
ext_modules.append( ext_modules.append(
CUDAExtension(name='fused_layer_norm_cuda', CUDAExtension(name='fused_layer_norm_cuda',
sources=['apex/normalization/csrc/layer_norm_cuda.cpp', sources=['csrc/layer_norm_cuda.cpp',
'apex/normalization/csrc/layer_norm_cuda_kernel.cu'], 'csrc/layer_norm_cuda_kernel.cu'],
extra_compile_args={'cxx': ['-O3'] + version_ge_1_1, extra_compile_args={'cxx': ['-O3'] + version_ge_1_1,
'nvcc':['-maxrregcount=50', 'nvcc':['-maxrregcount=50',
'-O3', '-O3',
'--use_fast_math'] + version_ge_1_1})) '--use_fast_math'] + version_ge_1_1}))
print(ext_modules) print(ext_modules)
......
...@@ -4,6 +4,7 @@ import functools as ft ...@@ -4,6 +4,7 @@ import functools as ft
import itertools as it import itertools as it
from apex import amp from apex import amp
from apex.amp import _amp_state
import torch import torch
from torch import nn from torch import nn
import torch.nn.functional as F import torch.nn.functional as F
...@@ -60,24 +61,27 @@ class PromoteModule(torch.nn.Module): ...@@ -60,24 +61,27 @@ class PromoteModule(torch.nn.Module):
class TestCache(unittest.TestCase): class TestCache(unittest.TestCase):
def setUp(self): def setUp(self):
self.handle = amp.init(enabled=True)
self.x = torch.ones((2, 8), device='cuda', dtype=torch.float32) self.x = torch.ones((2, 8), device='cuda', dtype=torch.float32)
common_init(self) common_init(self)
def tearDown(self): def tearDown(self):
self.handle._deactivate() pass
def train_eval_train_test(self, module, t): def train_eval_train_test(self, module, t):
model = module(t).cuda() model = module(t).cuda()
dummy_optimizer = torch.optim.SGD(model.parameters(), lr=1.0) optimizer = torch.optim.SGD(model.parameters(), lr=1.0)
_amp_state.allow_incoming_model_not_fp32 = True
model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
_amp_state.allow_incoming_model_not_fp32 = False
def training_step(): def training_step():
for param in model.parameters(): for param in model.parameters():
param.grad = None param.grad = None
loss = model(self.x).sum() loss = model(self.x).sum()
self.handle._default_scaler._loss_scale = 4.0 _amp_state.loss_scalers[0]._loss_scale = 4.0
with self.handle.scale_loss(loss, dummy_optimizer) as scaled_loss: with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward() scaled_loss.backward()
self.assertEqual(len([p.grad for p in model.parameters() if p.grad is not None]), 1) self.assertEqual(len([p.grad for p in model.parameters() if p.grad is not None]), 1)
...@@ -105,6 +109,8 @@ class TestCache(unittest.TestCase): ...@@ -105,6 +109,8 @@ class TestCache(unittest.TestCase):
# Simulates resuming training after eval # Simulates resuming training after eval
training_step() training_step()
_amp_state.handle._deactivate()
# I could easily have these as a set of for loops in a single test, # I could easily have these as a set of for loops in a single test,
# instead of going for granularity. # instead of going for granularity.
......
import unittest
import functools as ft
import itertools as it

from apex import amp
import torch
from torch import nn
import torch.nn.functional as F

from utils import common_init, HALF, FLOAT,\
    ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT

try:
    import amp_C
    from amp_C import multi_tensor_axpby
    from apex.multi_tensor_apply import MultiTensorApply
    disabled = False
except ImportError as err:
    print("amp_C fused kernels unavailable, disabling TestMultiTensorApply. ImportError was ", err)
    disabled = True


class TestMultiTensorAxpby(unittest.TestCase):

    def setUp(self):
        common_init(self)
        self.a = 2.0
        self.b = 8.0
        self.xval = 4.0
        self.yval = 16.0
        self.overflow_buf = torch.cuda.IntTensor(1).zero_()
        self.ref = torch.cuda.FloatTensor([136.0])

    def tearDown(self):
        pass

    # The tensor creation here is written for convenience, not speed.
    def axpby(self, sizea, sizeb, applier, repeat_tensors,
              x_type, y_type, out_type, inplace=False):
        self.overflow_buf.zero_()
        t1 = torch.cuda.FloatTensor(sizea).fill_(1.0)
        t2 = torch.cuda.FloatTensor(sizeb).fill_(1.0)

        y_list = []
        for i in range(repeat_tensors):
            y_list += [t1.clone().to(y_type)*self.yval, t2.clone().to(y_type)*self.yval]

        x_list = [x.clone().to(x_type)*(self.xval/self.yval) for x in y_list]

        if inplace:
            out_list = y_list
        else:
            out_list = [out.clone().to(out_type)*3.0 for out in y_list]

        applier(multi_tensor_axpby, self.overflow_buf, [x_list, y_list, out_list], self.a, self.b, -1)

        self.assertTrue(all([torch.allclose(out, self.ref.to(out_type)) for out in out_list]),
                        msg="{} {} {} {} {} {} {}".format(sizea, sizeb, repeat_tensors,
                                                          x_type, y_type, out_type, inplace))
        self.assertTrue(self.overflow_buf.item() == 0,
                        msg="{} {} {} {} {} {} {}".format(sizea, sizeb, repeat_tensors,
                                                          x_type, y_type, out_type, inplace))

    # def find_inf(self, sizea, sizeb, applier, repeat_tensors, in_type, out_type, t, ind, val, inplace=False):
    #     self.overflow_buf.zero_()
    #     a = torch.cuda.FloatTensor(sizea).fill_(self.scale)
    #     b = torch.cuda.FloatTensor(sizeb).fill_(self.scale)
    #
    #     out_list = []
    #     for i in range(repeat_tensors):
    #         out_list += [a.clone().to(out_type), b.clone().to(out_type)]
    #
    #     if inplace:
    #         in_list = out_list
    #     else:
    #         in_list = [out.clone().to(in_type) for out in out_list]
    #
    #     applier(multi_tensor_scale, self.overflow_buf, [in_list, out_list], 1./self.scale)
    #
    #     self.overflow_buf.zero_()
    #     in_list[t][ind] = val
    #     applier(multi_tensor_scale, self.overflow_buf, [in_list, out_list], 1./self.scale)
    #     self.assertTrue(self.overflow_buf.item())

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fuzz(self):
        input_size_pairs = (
            (7777*77, 555*555),
            (777, 555),
            (555, 2048*32+1),
            (2048*32+1, 555),
            (555, 2048*32),
            (2048*32, 555),
            (33333, 555),
            (555, 33333))
        appliers = (
            MultiTensorApply(2048*32),
            MultiTensorApply(333),
            MultiTensorApply(33333))
        repeat_tensors = (
            1,
            55)

        for sizea, sizeb in input_size_pairs:
            for applier in appliers:
                for repeat in repeat_tensors:
                    for x_type in (torch.float32, torch.float16):
                        for y_type in (torch.float32, torch.float16):
                            for out_type in (torch.float32, torch.float16):
                                for inplace in (True, False):
                                    if inplace is True and (y_type is not out_type):
                                        continue
                                    else:
                                        self.axpby(sizea, sizeb, applier, repeat,
                                                   x_type, y_type, out_type, inplace=inplace)
                                        # self.find_inf(sizea, sizeb, applier, repeat, in_type, out_type,
                                        #               0, 0, float('nan'), inplace=inplace)
                                        # self.find_inf(sizea, sizeb, applier, repeat, in_type, out_type,
                                        #               2*repeat-1, sizeb-1, float('inf'), inplace=inplace)
                                        # self.find_inf(sizea, sizeb, applier, repeat, in_type, out_type,
                                        #               2*(repeat//2), sizea//2, float('inf'), inplace=inplace)


if __name__ == '__main__':
    unittest.main()