Unverified commit db92ee13 authored by Jithun Nair, committed by GitHub

Merge pull request #64 from ROCmSoftwarePlatform/IFU-master-2021-12-08

IFU-master-2021-12-08
parents d150afdc 68364b49
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
// Another possibility:
// #include <torch/all.h>
#include <assert.h>
#include "type_shim.h"
#include "multi_tensor_apply.cuh"
#define BLOCK_SIZE 512
#define ILP 4
template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}
template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}
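// In short: is_aligned() checks that a pointer is aligned to ILP elements' worth of bytes,
// and load_store() copies ILP consecutive elements at once by reinterpreting both pointers
// as a single ILP*sizeof(T)-wide aligned_storage chunk (a vectorized load/store when the
// compiler cooperates).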
template<typename x_t>
struct L2NormFunctor
{
__device__ __forceinline__ void operator()(
int chunk_size,
volatile int* noop_gmem,
TensorListMetadata<1>& tl,
float* output,
float* output_per_tensor,
bool per_tensor,
int max_chunks_per_tensor)
{
if (*noop_gmem) {
return;
}
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
x_t* x = (x_t*)tl.addresses[0][tensor_loc];
x += chunk_idx*chunk_size;
n -= chunk_idx*chunk_size;
__shared__ float s_vals[512];
float vals[ILP]; // = {0}; // this probably works too but I want to be sure...
x_t r_x[ILP];
for(int i = 0; i < ILP; i++)
{
vals[i] = 0.f;
r_x[i] = 0;
}
// To keep things simple, the aligned case takes a separate code path.
if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x))
{
for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
{
// load
load_store(r_x, x, 0 , i_start);
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
float next = static_cast<float>(r_x[ii]);
vals[ii] += next*next;
}
}
}
else
{
for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
{
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
float next = static_cast<float>(x[i]);
vals[ii] += next*next;
}
}
}
}
float val = 0.f;
for(int i = 0; i < ILP; i++)
val += vals[i];
float final = reduce_block_into_lanes(s_vals, val);
if(threadIdx.x == 0)
{
if(!isfinite(final))
*noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
output[blockIdx.x] += final;
if(per_tensor)
output_per_tensor[(tl.start_tensor_this_launch + tensor_loc)*max_chunks_per_tensor + chunk_idx] = final;
}
}
};
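// Putting the two kernels together, roughly: each block of the functor above reduces the
// sum of squares of one chunk of one tensor and accumulates it into output[blockIdx.x]
// (the 320-entry buffer allocated in the host function below, one slot per block of a
// launch), and with per_tensor set it also records a per-(tensor, chunk) partial. The
// cleanup kernel below then folds those partials together and takes the square root, so
// computing the full L2 norm takes two kernel launches.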
__global__ void cleanup(
float* output,
float* output_per_tensor,
float* ret,
float* ret_per_tensor,
bool per_tensor,
int max_chunks_per_tensor,
volatile int* noop_gmem)
{
if (*noop_gmem) {
return;
}
__shared__ float vals[512];
if(blockIdx.x == 0)
{
float val = 0;
if(threadIdx.x < 320)
val = output[threadIdx.x];
float final = reduce_block_into_lanes(vals, val);
if(threadIdx.x == 0)
*ret = sqrt(final);
}
if(per_tensor)
{
float* output_this_tensor = output_per_tensor + blockIdx.x*max_chunks_per_tensor;
float val = 0;
for(int i = threadIdx.x; i < max_chunks_per_tensor; i += blockDim.x)
val += output_this_tensor[i];
float final = reduce_block_into_lanes(vals, val);
if(threadIdx.x == 0)
ret_per_tensor[blockIdx.x] = sqrt(final);
}
}
std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_mp_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
at::optional<bool> per_tensor_python)
{
bool per_tensor = per_tensor_python.has_value() ? per_tensor_python.value() : false;
auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
auto output = at::zeros({320}, float_options);
at::Tensor output_per_tensor;
at::Tensor ret_per_tensor;
int ntensors = tensor_lists[0].size();
int max_chunks_per_tensor = -1;
if(per_tensor)
{
for(int t = 0; t < ntensors; t++)
{
int max_chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1)/chunk_size;
if(max_chunks_this_tensor > max_chunks_per_tensor)
max_chunks_per_tensor = max_chunks_this_tensor;
}
output_per_tensor = at::zeros({ntensors*max_chunks_per_tensor}, float_options);
ret_per_tensor = at::empty({ntensors}, float_options);
}
else
{
ret_per_tensor = at::empty({0}, float_options);
}
DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "multi_tensor_l2norm_mp_cuda",
multi_tensor_apply<1>(
BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
L2NormFunctor<scalar_t_0>(),
output.data_ptr<float>(),
per_tensor ? output_per_tensor.data_ptr<float>() : nullptr,
per_tensor,
max_chunks_per_tensor);)
AT_CUDA_CHECK(cudaGetLastError());
// AT_CUDA_CHECK(cudaDeviceSynchronize());
// This involves one more small kernel launch, but it will be negligible end to end.
// I could get rid of it by hacking the functor + multi tensor harness with persistence
// logic, but I'm keeping it simple for now.
auto ret = at::empty({1}, output.options());
const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
auto stream = at::cuda::getCurrentCUDAStream();
cleanup<<<per_tensor ? ntensors : 1, 512, 0, stream>>>(
output.data_ptr<float>(),
per_tensor ? output_per_tensor.data_ptr<float>() : nullptr,
ret.data_ptr<float>(),
per_tensor ? ret_per_tensor.data_ptr<float>() : nullptr,
per_tensor,
max_chunks_per_tensor, noop_flag.data_ptr<int>());
return std::tuple<at::Tensor, at::Tensor>(ret, ret_per_tensor);
}
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>
#include <assert.h>
#include "type_shim.h"
#include "multi_tensor_apply.cuh"
#define BLOCK_SIZE 512
#define ILP 4
template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}
template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}
typedef enum{
MOMENT_MODE_0 =0, // L2 regularization mode
MOMENT_MODE_1 =1 // Decoupled weight decay mode
} adamMode_t;
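// As used in LAMBStage1Functor below: MOMENT_MODE_0 folds decay*p into the (scaled)
// gradient before the Adam moment updates, i.e. classic L2 regularization, while
// MOMENT_MODE_1 leaves the gradient alone and adds decay*p to the update after the Adam
// step, i.e. AdamW-style decoupled weight decay.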
std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_mp_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
at::optional<bool> per_tensor_python);
using MATH_T = float;
template<typename T, typename param_t>
struct LAMBStage1Functor
{
__device__ __forceinline__ void operator()(
int chunk_size,
volatile int* noop_gmem,
TensorListMetadata<4>& tl,
const float beta1,
const float beta2,
const float beta3,
const int* step_ptr,
const int bias_correction,
const float epsilon,
adamMode_t mode,
const float decay,
const float* global_grad_norm,
const float* max_global_grad_norm,
const float* found_inf,
const float* inv_scale)
{
if (*noop_gmem) {
return;
}
float beta1_correction = 1.0f;
float beta2_correction = 1.0f;
if (bias_correction == 1) {
int step = *step_ptr;
beta1_correction = 1 - std::pow(beta1, step);
beta2_correction = 1 - std::pow(beta2, step);
}
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
float clipped_global_grad_norm = (*global_grad_norm) > (*max_global_grad_norm) ? (*global_grad_norm) / (*max_global_grad_norm) : 1.0f;
T* g = (T*)tl.addresses[0][tensor_loc];
g += chunk_idx*chunk_size;
param_t* p = (param_t*)tl.addresses[1][tensor_loc];
p += chunk_idx*chunk_size;
param_t* m = (param_t*)tl.addresses[2][tensor_loc];
m += chunk_idx*chunk_size;
param_t* v = (param_t*)tl.addresses[3][tensor_loc];
v += chunk_idx*chunk_size;
n -= chunk_idx*chunk_size;
MATH_T r_g[ILP];
MATH_T r_p[ILP];
MATH_T r_m[ILP];
MATH_T r_v[ILP];
// To keep things simple, the aligned case takes a separate code path.
if(n % ILP == 0 &&
chunk_size % ILP == 0 &&
is_aligned(g) &&
is_aligned(p) &&
is_aligned(m) &&
is_aligned(v))
{
T l_g[ILP];
param_t l_p[ILP];
param_t l_m[ILP];
param_t l_v[ILP];
for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
{
// load
load_store(l_g, g, 0, i_start);
if (decay != 0)
load_store(l_p, p, 0, i_start);
load_store(l_m, m, 0, i_start);
load_store(l_v, v, 0, i_start);
// unpack
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
r_g[ii] = l_g[ii] * (*inv_scale);
if (decay == 0) {
r_p[ii] = MATH_T(0);
}
else {
r_p[ii] = l_p[ii];
}
r_m[ii] = l_m[ii];
r_v[ii] = l_v[ii];
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
if (mode == MOMENT_MODE_0) {
MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
// L2 on scaled grad
scaled_grad = scaled_grad + decay*r_p[ii];
r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
r_p[ii] = next_m_unbiased / denom;
}
else {
MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
r_p[ii] = (next_m_unbiased/denom) + (decay*r_p[ii]);
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
l_p[ii] = r_p[ii];
// Difference from APEX's LAMB kernel. `g` and `p` can be different dtypes.
l_g[ii] = r_p[ii];
l_m[ii] = r_m[ii];
l_v[ii] = r_v[ii];
}
// store
load_store(g, l_g, i_start, 0);
load_store(m, l_m, i_start, 0);
load_store(v, l_v, i_start, 0);
}
}
else
{
// see note in multi_tensor_scale_kernel.cu
for(int i_start = 0;
i_start < n && i_start < chunk_size;
i_start += blockDim.x*ILP)
{
MATH_T r_g[ILP];
MATH_T r_p[ILP];
MATH_T r_m[ILP];
MATH_T r_v[ILP];
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
r_g[ii] = g[i] * (*inv_scale);
// special ?optimization? for lamb stage 1
if (decay == 0) {
r_p[ii] = MATH_T(0);
}
else {
r_p[ii] = p[i];
}
r_m[ii] = m[i];
r_v[ii] = v[i];
} else {
r_g[ii] = MATH_T(0);
r_p[ii] = MATH_T(0);
r_m[ii] = MATH_T(0);
r_v[ii] = MATH_T(0);
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
if (mode == MOMENT_MODE_0) {
MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
// L2 on scaled grad
scaled_grad = scaled_grad + decay*r_p[ii];
r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
r_p[ii] = next_m_unbiased / denom;
}
else {
MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
r_p[ii] = (next_m_unbiased/denom) + (decay*r_p[ii]);
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
g[i] = r_p[ii];
m[i] = r_m[ii];
v[i] = r_v[ii];
}
}
}
}
}
};
// Step 2 reads in the 'update' value and the per-tensor param_norm and update_norm.
// It computes the new parameter value.
// N == 2: FP32 params, no master params
// N == 3: FP16 params, FP32 master params.
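// Roughly, for each tensor: ratio = lr * (||param|| / ||update||) when use_nvlamb is set
// or the tensor has non-zero weight decay (and both norms are non-zero), otherwise just
// lr; then p -= ratio * update, with the update read from the gradient buffers that
// stage 1 wrote into.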
template<typename T, int N, typename param_t>
struct LAMBStage2Functor
{
static_assert((N == 2 && std::is_same<T, param_t>::value) || (N == 3 && std::is_same<param_t, float>::value), "");
__device__ __forceinline__ void operator()(
int chunk_size,
volatile int* noop_gmem,
TensorListMetadata<N>& tl,
const float* per_tensor_param_norm,
const float* per_tensor_update_norm,
const float* learning_rate,
const float decay,
bool use_nvlamb)
{
if (*noop_gmem) {
return;
}
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int tensor_num = tl.start_tensor_this_launch + tensor_loc;
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
MATH_T ratio = *learning_rate;
// nvlamb: apply adaptive learning rate to all parameters
// otherwise, only apply to those with non-zero weight decay
if (use_nvlamb || (decay != 0.0))
{
float param_norm = per_tensor_param_norm[tensor_num];
float update_norm = per_tensor_update_norm[tensor_num];
ratio = (update_norm != 0.0f && param_norm != 0.0f) ? *learning_rate * (param_norm / update_norm) : *learning_rate;
}
T* update = (T*)tl.addresses[0][tensor_loc];
update += chunk_idx*chunk_size;
param_t* p = (param_t*)tl.addresses[1][tensor_loc];
p += chunk_idx*chunk_size;
T* out_p;
if (N == 3) {
out_p = (T*)tl.addresses[2][tensor_loc];
out_p += chunk_idx*chunk_size;
}
n -= chunk_idx*chunk_size;
// To keep things simple, the aligned case takes a separate code path.
bool can_use_aligned_path = n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(p) && is_aligned(update);
if (N == 3) {
can_use_aligned_path = can_use_aligned_path && is_aligned(out_p);
}
if(can_use_aligned_path)
{
param_t r_p[ILP];
T r_update[ILP];
T r_out_p[ILP];
for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
{
// load
load_store(r_p, p, 0, i_start);
load_store(r_update, update, 0, i_start);
if (N == 3) {
load_store(r_out_p, out_p, 0, i_start);
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
r_p[ii] = static_cast<MATH_T>(r_p[ii]) - (ratio * static_cast<MATH_T>(r_update[ii]));
if (N == 3) {
r_out_p[ii] = r_p[ii];
}
}
load_store(p, r_p, i_start, 0);
if (N == 3) {
load_store(out_p, r_out_p, i_start, 0);
}
}
}
else
{
for(int i_start = 0;
i_start < n && i_start < chunk_size;
i_start += blockDim.x*ILP)
{
MATH_T r_p[ILP];
MATH_T r_update[ILP];
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
r_p[ii] = p[i];
r_update[ii] = update[i];
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
r_p[ii] = r_p[ii] - (ratio * r_update[ii]);
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
p[i] = r_p[ii];
if (N == 3) {
out_p[i] = r_p[ii];
}
}
}
}
}
}
};
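// Overall flow of multi_tensor_lamb_mp_cuda below: (1) per-tensor L2 norms of the params,
// (2) stage 1 writes the Adam-style update in place into the gradient buffers,
// (3) per-tensor L2 norms of those updates, (4) stage 2 applies the trust-ratio-scaled
// update to the params (and, when FP32 master params are present, also writes the result
// back to the FP16 model params). Every launch bails out early once noop_flag has been set.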
void multi_tensor_lamb_mp_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
at::Tensor lr,
const float beta1,
const float beta2,
const float epsilon,
at::Tensor step,
const int bias_correction,
const float weight_decay,
const int grad_averaging,
const int mode,
at::Tensor global_grad_norm,
at::Tensor max_grad_norm,
at::optional<bool> use_nvlamb_python,
at::Tensor found_inf,
at::Tensor inv_scale)
{
// n_tensors == 5: FP16 model params & FP32 master params
// n_tensors == 4: FP32 model params & NO FP32 master params
const auto n_tensors = tensor_lists.size();
assert(n_tensors == 4 || n_tensors == 5);
using namespace at;
bool use_nvlamb = use_nvlamb_python.has_value() ? use_nvlamb_python.value() : false;
// note(mkozuki): move bias handling below to functor
// Handle bias correction mode
// float bias_correction1 = 1.0f, bias_correction2 = 1.0f;
// if (bias_correction == 1) {
// bias_correction1 = 1 - std::pow(beta1, step);
// bias_correction2 = 1 - std::pow(beta2, step);
// }
// Handle grad averaging mode
float beta3 = 1.0f;
if (grad_averaging == 1) beta3 = 1 - beta1;
std::vector<std::vector<at::Tensor>> stage1_tensor_lists(tensor_lists.begin(), tensor_lists.begin() + 4);
std::vector<std::vector<at::Tensor>> grad_list(tensor_lists.begin(), tensor_lists.begin()+1);
std::vector<std::vector<at::Tensor>> param_list(tensor_lists.begin()+1, tensor_lists.begin()+2);
// Compute per tensor param norm
auto param_norm_tuple = multi_tensor_l2norm_mp_cuda(chunk_size, noop_flag, param_list, true);
// We now modify grad in place to store the update before computing its norm.
// Generally this is not an issue, since people modify grad in the step() method all the time.
// We could also grab a list of empty tensors to avoid this, but I'd like to save space/CPU code.
if (n_tensors == 4) {
DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "lamb_stage_1",
multi_tensor_apply<4>(
BLOCK_SIZE,
chunk_size,
noop_flag,
stage1_tensor_lists,
LAMBStage1Functor<scalar_t_0, scalar_t_0>(),
beta1,
beta2,
beta3, // 1-beta1 or 1 depends on averaging mode
// bias_correction1,
// bias_correction2,
step.data_ptr<int>(),
bias_correction,
epsilon,
(adamMode_t) mode,
weight_decay,
global_grad_norm.data_ptr<float>(),
max_grad_norm.data_ptr<float>(),
found_inf.data_ptr<float>(),
inv_scale.data_ptr<float>()); )
} else {
DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "lamb_stage_1",
multi_tensor_apply<4>(
BLOCK_SIZE,
chunk_size,
noop_flag,
stage1_tensor_lists,
LAMBStage1Functor<scalar_t_0, float>(),
beta1,
beta2,
beta3, // 1-beta1 or 1 depends on averaging mode
// bias_correction1,
// bias_correction2,
step.data_ptr<int>(),
bias_correction,
epsilon,
(adamMode_t) mode,
weight_decay,
global_grad_norm.data_ptr<float>(),
max_grad_norm.data_ptr<float>(),
found_inf.data_ptr<float>(),
inv_scale.data_ptr<float>()); )
}
// Compute update norms
auto update_norm_tuple = multi_tensor_l2norm_mp_cuda(chunk_size, noop_flag, grad_list, true);
std::vector<std::vector<at::Tensor>> grad_param_list(tensor_lists.begin(), tensor_lists.begin()+2);
if (n_tensors == 4) {
DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "lamb_stage_2",
multi_tensor_apply<2>(
BLOCK_SIZE,
chunk_size,
noop_flag,
grad_param_list,
LAMBStage2Functor<scalar_t_0, 2, scalar_t_0>(),
std::get<1>(param_norm_tuple).data_ptr<float>(),
std::get<1>(update_norm_tuple).data_ptr<float>(),
lr.data_ptr<float>(),
weight_decay,
use_nvlamb); )
} else {
grad_param_list.push_back(tensor_lists[4]);
DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "lamb_stage_2",
multi_tensor_apply<3>(
BLOCK_SIZE,
chunk_size,
noop_flag,
grad_param_list,
LAMBStage2Functor<scalar_t_0, 3, float>(),
std::get<1>(param_norm_tuple).data_ptr<float>(),
std::get<1>(update_norm_tuple).data_ptr<float>(),
lr.data_ptr<float>(),
weight_decay,
use_nvlamb); )
}
AT_CUDA_CHECK(cudaGetLastError());
}
@@ -197,13 +197,15 @@ if "--cuda_ext" in sys.argv:
'csrc/multi_tensor_scale_kernel.cu',
'csrc/multi_tensor_axpby_kernel.cu',
'csrc/multi_tensor_l2norm_kernel.cu',
'csrc/multi_tensor_l2norm_kernel_mp.cu',
'csrc/multi_tensor_l2norm_scale_kernel.cu',
'csrc/multi_tensor_lamb_stage_1.cu',
'csrc/multi_tensor_lamb_stage_2.cu',
'csrc/multi_tensor_adam.cu',
'csrc/multi_tensor_adagrad.cu',
'csrc/multi_tensor_novograd.cu',
'csrc/multi_tensor_lamb.cu',
'csrc/multi_tensor_lamb_mp.cu'],
include_dirs=[os.path.join(this_dir, 'csrc')],
extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
'nvcc': nvcc_args_multi_tensor if not IS_ROCM_PYTORCH else hipcc_args_multi_tensor}))
@@ -390,7 +392,11 @@ if "--fast_layer_norm" in sys.argv:
'-gencode', 'arch=compute_70,code=sm_70',
'-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__',
'-U__CUDA_NO_BFLOAT16_OPERATORS__',
'-U__CUDA_NO_BFLOAT16_CONVERSIONS__',
'-U__CUDA_NO_BFLOAT162_OPERATORS__',
'-U__CUDA_NO_BFLOAT162_CONVERSIONS__',
'-I./apex/contrib/csrc/layer_norm/',
'--expt-relaxed-constexpr',
'--expt-extended-lambda',
'--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag},
......
@@ -36,9 +36,75 @@ class TestFusedLayerNorm(unittest.TestCase):
class TestFusedLayerNormElemWise(TestFusedLayerNorm):
elementwise_affine = True
class TestFusedLayerNormElemWiseHalf(TestFusedLayerNormElemWise):
dtype = torch.half
def test_large_batch(self):
self.skipTest("Skip to save time")
class TestFusedLayerNormElemWiseBFloat16(TestFusedLayerNormElemWise):
dtype = torch.bfloat16
# NOTE (mkozuki): [BFloat16 Layer Norm flakiness]
# Use thresholds larger than those used in pytorch, see
# https://github.com/pytorch/pytorch/blob/72274e2a2fd55019ec860e1743dbdc5b0c5a5624/torch/testing/_asserts.py#L26
fwd_thresholds = dict(rtol=1.6e-2, atol=3e-4)
bwd_thresholds = dict(rtol=1.6e-2, atol=3e-3)
def test_large_batch(self):
self.skipTest("Skip to save time")
def _prep_layers(normalized_shape, elementwise_affine, dtype):
native = torch.nn.LayerNorm(
normalized_shape=normalized_shape, elementwise_affine=elementwise_affine
).to(device="cuda", dtype=dtype)
fused = apex.normalization.FusedLayerNorm(
normalized_shape=normalized_shape, elementwise_affine=elementwise_affine
).cuda()
return native, fused
def _prep_inputs(batch_size, normalized_shape, dtype):
shape = (batch_size, *normalized_shape)
fused = torch.randn(shape).cuda().requires_grad_(True)
with torch.no_grad():
native = fused.clone().to(dtype).requires_grad_(True)
return native, fused
TORCH_MAJOR, TORCH_MINOR = int(torch.__version__.split('.')[0]), int(torch.__version__.split('.')[1])
if (TORCH_MAJOR <= 1 and TORCH_MINOR < 10):
autocast_dtypes = (torch.half,)
else:
autocast_dtypes = (torch.half, torch.bfloat16) if torch.cuda.is_bf16_supported() else (torch.half,)
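# In short: bfloat16 autocast is exercised only on torch >= 1.10 and only when
# torch.cuda.is_bf16_supported() reports bf16-capable hardware; otherwise the autocast
# tests below run with float16 alone.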
class TestAutocastFusedLayerNorm(unittest.TestCase):
bf16_fwd_thresholds = dict(rtol=1.6e-2, atol=3e-4)
bf16_bwd_thresholds = dict(rtol=1.6e-2, atol=3e-3)
def setUp(self):
self.batch_size = 16
self.normalized_shape = [32, 16]
def _run_test(self, dtype, elementwise_affine):
native, fused = _prep_layers(self.normalized_shape, elementwise_affine, dtype)
native_x, fused_x = _prep_inputs(self.batch_size, self.normalized_shape, dtype)
expected = native(native_x)
with torch.cuda.amp.autocast(dtype=dtype):
actual = fused(fused_x)
tols = {'rtol': None, 'atol': None} if dtype == torch.half else TestAutocastFusedLayerNorm.bf16_fwd_thresholds
torch.testing.assert_allclose(actual, expected, **tols)
g_native = torch.rand_like(expected)
with torch.no_grad():
g_fused = g_native.clone()
expected.backward(g_native)
actual.backward(g_fused)
if __name__ == '__main__':
......
@@ -144,14 +144,14 @@ class RefLAMB(Optimizer):
return loss
class TestLamb(unittest.TestCase):
def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
self.max_abs_diff = max_abs_diff
self.max_rel_diff = max_rel_diff
self.iters = iters
torch.cuda.manual_seed(9876)
def tearDown(self):
pass
@@ -162,8 +162,8 @@ class TestFusedLAMB(unittest.TestCase):
ref_param.append(torch.nn.Parameter(tensor.clone()))
tst_param.append(torch.nn.Parameter(tensor.clone()))
ref_optim = self.ref_optim(ref_param, **lamb_option)
tst_optim = self.tst_optim(tst_param, use_nvlamb=True, **lamb_option)
return (ref_param, tst_param, ref_optim, tst_optim)
@@ -211,6 +211,13 @@ class TestFusedLAMB(unittest.TestCase):
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
class TestFusedLAMB(TestLamb):
def __init__(self, *args, **kwargs):
super(TestLamb, self).__init__(*args, **kwargs)
self.ref_optim = RefLAMB
self.tst_optim = apex.optimizers.FusedLAMB
def test_float(self):
self.gen_single_type_test(param_type=torch.float)
@@ -264,6 +271,65 @@ class TestFusedLAMB(unittest.TestCase):
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
class TestFusedMixedPrecisionLamb(TestLamb):
def __init__(self, *args, **kwargs):
super(TestLamb, self).__init__(*args, **kwargs)
self.ref_optim = RefLAMB
self.tst_optim = apex.optimizers.FusedMixedPrecisionLamb
def test_float(self):
self.gen_single_type_test(param_type=torch.float)
@unittest.skip("PyTorch optimizer is not numerically correct for fp16")
def test_half(self):
self.gen_single_type_test(param_type=torch.float16)
@unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
def test_multi_device(self):
devices = ("cuda:0", "cuda:1")
for current_dev, tensor_dev in product(devices, devices):
with torch.cuda.device(current_dev):
self.gen_single_type_test(param_type=torch.float, device=tensor_dev)
def test_multi_params(self):
sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
weight_decay = [0, 0.01]
for wd in weight_decay:
lamb_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08, 'weight_decay':wd}
tensors = []
for size in sizes:
tensors.append(torch.rand(size, dtype=torch.float, device='cuda'))
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim(tensors, lamb_option)
for i in range(self.iters):
self.gen_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_lamb_option(self):
nelem = 1
tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
weight_decay = [0, 0.01]
for wd in weight_decay:
lamb_option = {'lr':0.01, 'betas':(0.6, 0.9), 'eps':3e-06, 'weight_decay':wd}
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim([tensor], lamb_option)
for i in range(self.iters):
self.gen_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
if __name__ == '__main__':
script_path = os.path.dirname(os.path.realpath(__file__))
......
import random
import torch
from apex.transformer import tensor_parallel
from apex.transformer import parallel_state
from apex.transformer.tensor_parallel import vocab_parallel_cross_entropy
from apex.transformer.pipeline_parallel.utils import setup_microbatch_calculator
from apex.transformer.pipeline_parallel.utils import average_losses_across_data_parallel_group
from apex.transformer.pipeline_parallel.schedules import get_forward_backward_func
from apex.transformer.pipeline_parallel.schedules.common import build_model
from apex.transformer.pipeline_parallel.schedules.common import _get_params_for_weight_decay_optimization
from apex.transformer.testing.standalone_bert import bert_model_provider
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import print_separator
mode = None
MANUAL_SEED = 42
inds = None
masks = None
data_idx = 0
MASK_PROB = 0.1
EASY_MODE = False
EASY_MODE_SIZ = 32
ONCE = False
# build a corpus: downloading a public domain book is left commented out below and a
# repeated hard-coded paragraph is used instead
def download_fancy_data():
#import requests
#response = requests.get('https://internet.com/book.txt')
#text = ' '.join(response.text.split())
text = """
An original sentence not subject to any license restrictions, copyright, or royalty payments. Nothing to see here. Commercial or non-commercial use. Research or non-research purposes. The quick brown fox jumps over the lazy dog. Lorem ipsum.
"""
text = text*1024
encoded = text.encode('ascii', 'replace')
ints = [int(encoded[i]) for i in range(len(encoded))]
return torch.tensor(ints)
# build a batch given sequence_len and batch size
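# Roughly: tokens are taken from the hard-coded corpus at a random offset (reshuffled when
# the permutation is exhausted), a Bernoulli mask with keep-probability 1-MASK_PROB is
# applied, masked positions are replaced with token id 124, the original tokens serve as
# labels, and mask_not (the masked positions) is returned as the loss mask.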
def generate_fancy_data_labels(sequence_len, batch_size):
global data_idx
global inds
global masks
global MANUAL_SEED
temps = list()
for i in range(batch_size):
if inds is None or data_idx >= len(inds):
# hack: reseed explicitly, since RNG use would otherwise fall out of sync across the different pipelines
torch.manual_seed(MANUAL_SEED)
inds = torch.randperm(effective_length, device='cuda')
masks = (torch.rand(len(inds)//batch_size + 1, batch_size, sequence_len, device='cuda') >= MASK_PROB).long()
MANUAL_SEED += 1
print("new epoch", len(inds))
data_idx = 0
print("my start", inds[0:5])
print("masks_checksum:", torch.sum(masks))
if EASY_MODE:
data_idx_ = data_idx % EASY_MODE_SIZ
else:
data_idx_ = data_idx
offset = inds[data_idx_] #* SEQUENCE_LEN
data_idx += 1
curr = fancy_data[offset:offset+sequence_len].clone().detach()
temps.append(curr)
temp = torch.stack(temps, dim=0).cuda()
mask = masks[data_idx//batch_size]
mask_not = torch.logical_not(mask)
data = mask * temp + mask_not*124
label = temp
return (data, label, mask_not)
easy_data = None
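# fwd_step_func follows the forward_backward_func contract used throughout these tests:
# it runs the model on a batch and returns (output, loss_func), where loss_func later maps
# that output to (loss, {"avg": averaged_loss}).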
def fwd_step_func(batch, model):
data, label, loss_mask = batch
data = data.cuda()
label = label.cuda()
loss_mask = loss_mask.cuda()
y = model(data, torch.ones_like(data), lm_labels=label)
def loss_func(output_tensor):
global ONCE
output_tensor, _ = output_tensor
lm_loss_ = output_tensor.float()
lm_loss = torch.sum(
lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum()
averaged_loss = average_losses_across_data_parallel_group([lm_loss])
if data_idx >= 1536:
assert lm_loss < 4.8
if not ONCE:
print("LOSS OK")
ONCE = True
return lm_loss, {'avg': averaged_loss}
return y, loss_func
def train(model, optim, virtual_pipeline_model_parallel_size, pipeline_model_parallel_size):
sequence_len = global_vars.get_args().seq_length
micro_batch_size = global_vars.get_args().micro_batch_size
hidden_size = global_vars.get_args().hidden_size
forward_backward_func = get_forward_backward_func(virtual_pipeline_model_parallel_size, pipeline_model_parallel_size)
tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
for _ in range(8):
batch = generate_fancy_data_labels(sequence_len, batch_size)
optim.zero_grad()
forward_backward_func(fwd_step_func, batch, model, forward_only=False, tensor_shape=tensor_shape)
optim.step()
if __name__ == '__main__':
global fancy_data
global effective_length
global_vars.set_global_variables()
fancy_data = download_fancy_data()
effective_length = fancy_data.size(0) // global_vars.get_args().seq_length
effective_length = fancy_data.size(0) - global_vars.get_args().seq_length
initialize_distributed()
world_size = torch.distributed.get_world_size()
failure = None
try:
args = global_vars.get_args()
args.padded_vocab_size = 128 # needed in standalone gpt
batch_size = args.global_batch_size
micro_batch_size = args.micro_batch_size
setup_microbatch_calculator(
args.rank,
args.rampup_batch_size,
args.global_batch_size,
args.micro_batch_size,
1, # args.data_parallel_size,
)
virtual_pipeline_model_parallel_size = 2
world_size = torch.distributed.get_world_size()
pipeline_model_parallel_size = world_size
parallel_state.initialize_model_parallel(
1, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size)
pipeline_model_parallel_size = parallel_state.get_pipeline_model_parallel_world_size()
tensor_parallel.random.model_parallel_cuda_manual_seed(0)
model = build_model(
bert_model_provider,
wrap_with_ddp=True,
virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size,
)
assert isinstance(model, list)
assert len(model) == (1 if virtual_pipeline_model_parallel_size is None else virtual_pipeline_model_parallel_size)
_param_groups = _get_params_for_weight_decay_optimization(model)
optim = torch.optim.Adam(_param_groups)
print(effective_length)
print(fancy_data.size(0))
train(model, optim, virtual_pipeline_model_parallel_size, pipeline_model_parallel_size)
except Exception as e:
failure = str(e)
finally:
parallel_state.destroy_model_parallel()
if failure is not None:
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(f"Minimal BERT Pipeline Parallel Failed with: {failure}")
else:
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(TEST_SUCCESS_MESSAGE)
# coding=utf-8
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -15,15 +15,15 @@
import torch
import torch.nn.functional as F
from apex.transformer import parallel_state
from apex.transformer import tensor_parallel
from apex.transformer.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import set_random_seed
from apex.transformer.testing.commons import IdentityLayer
from apex.transformer.testing.commons import print_separator
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
global_vars.set_global_variables()
@@ -51,8 +51,11 @@ def tensor_sharded_cross_entropy(batch_size, seq_length, vocab_size, logits_scal
logits_parallel = tensor_parallel.scatter_to_tensor_model_parallel_region(logits)
target = torch.cuda.LongTensor(
size=(batch_size, seq_length)).random_(0, vocab_size)
logits_parallel_ = logits_parallel.clone().detach()
loss = vocab_parallel_cross_entropy(logits_parallel, target).mean()
loss.backward()
# check for mutation
assert torch.equal(logits_parallel_, logits_parallel)
return loss, identity.weight.grad
@@ -94,6 +97,8 @@ def test_cross_entropy(tensor_model_parallel_size):
if __name__ == '__main__':
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
initialize_distributed()
world_size = torch.distributed.get_world_size()
......
# coding=utf-8
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -19,10 +19,10 @@ import torch
from apex.transformer import parallel_state
from apex.transformer.tensor_parallel import data as data_utils
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import print_separator
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
global_vars.set_global_variables()
@@ -82,6 +82,8 @@ def test_broadcast_data(tensor_model_parallel_size):
if __name__ == '__main__':
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
initialize_distributed()
world_size = torch.distributed.get_world_size()
......
from typing import Tuple, List
import torch
from apex.transformer import parallel_state
from apex.transformer.pipeline_parallel.utils import get_num_microbatches
from apex.transformer.pipeline_parallel.schedules.common import (
_get_params_for_weight_decay_optimization,
)
from apex.transformer.pipeline_parallel.schedules.common import build_model
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_with_interleaving import (
_forward_backward_pipelining_with_interleaving,
)
from apex.transformer.pipeline_parallel.utils import average_losses_across_data_parallel_group
from apex.transformer.pipeline_parallel.utils import setup_microbatch_calculator
from apex.transformer.pipeline_parallel.utils import _reconfigure_microbatch_calculator
from apex.transformer.pipeline_parallel.utils import update_num_microbatches
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import print_separator
from apex.transformer.log_util import get_transformer_logger, set_logging_level
from apex.transformer.testing.commons import model_provider_func
from apex.transformer._data import MegatronPretrainingRandomSampler
from apex.transformer._data import MegatronPretrainingSampler
# note(mkozuki): To see warmup, steady, cooldown iterations, uncomment the line below
# set_logging_level("INFO")
_logger = get_transformer_logger("pipeline_parallel_test")
# note(mkozuki): To see if local batch size increases, uncomment the line below
# _logger.setLevel("INFO")
global_vars.set_global_variables(
args_defaults={"global_batch_size": 512, "rampup_batch_size": [32, 32, 1000],},
ignore_unknown_args=True,
)
RAMPUP_BATCH_SIZE = []
NUM_ITERATIONS = 20
NUM_SAMPLES = 16384 // 2
batch_size, micro_batch_size = None, None
HIDDEN_SIZE = 16
def Dataset(num_samples: int) -> List[Tuple[torch.Tensor, torch.Tensor]]:
return [(torch.randn(HIDDEN_SIZE), torch.randn(HIDDEN_SIZE // 2)) for _ in range(num_samples)]
def process_batch(batch):
if isinstance(batch, (list, tuple)):
x = batch[0]
else:
x = batch
return x
def fwd_step_func(micro_batch, model):
x = process_batch(micro_batch)
y = model(x)
# note (mkozuki): I don't think this function is nice but I do think this is enough for now
# just to check the sanity of ported pipeline functions.
def loss_func(x):
loss = torch.sum(x)
averaged_loss = average_losses_across_data_parallel_group([loss])
return loss, {"avg": averaged_loss}
return y, loss_func
# Run forward & backward with dynamic batch size.
def run_interleaved_with_dynamic_batch_size(
pipeline_model_parallel_size: int, forward_only: bool, BatchSamplerCls,
) -> None:
args = global_vars.get_args()
_reconfigure_microbatch_calculator(
args.rank,
args.rampup_batch_size,
args.global_batch_size,
args.micro_batch_size,
1, # args.data_parallel_size,
)
virtual_pipeline_model_parallel_size = 2
# NOTE (mkozuki): `virtual_pipeline_model_parallel_size` is required for the interleaving schedule
# In megatron, `args.virtual_pipeline_model_parallel_size` is computed in megatron/arguments.py and
# used ubiquitously but this test uses custom model so it's safe to abuse.
parallel_state.initialize_model_parallel(
1, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size
)
pipeline_model_parallel_size = parallel_state.get_pipeline_model_parallel_world_size()
print_separator(f"BatchSamplerCls: {BatchSamplerCls.__name__}, forward_only: {forward_only}")
model = build_model(
model_provider_func,
wrap_with_ddp=True,
virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size,
hidden_size=HIDDEN_SIZE,
)
assert isinstance(model, list)
assert len(model) == virtual_pipeline_model_parallel_size
optimizer = torch.optim.Adam(_get_params_for_weight_decay_optimization(model))
initial_local_minibatch_size = get_num_microbatches() * micro_batch_size
dataset = Dataset(NUM_SAMPLES)
data_loader = torch.utils.data.DataLoader(
dataset,
batch_sampler=BatchSamplerCls(
NUM_SAMPLES,
0,
initial_local_minibatch_size,
parallel_state.get_data_parallel_rank(),
parallel_state.get_data_parallel_world_size(),
),
)
data_iter = iter(data_loader)
def get_num_samples(batch):
if isinstance(batch, torch.Tensor):
return len(batch)
assert isinstance(batch, (list, tuple))
return [get_num_samples(b) for b in batch]
tensor_shape = [micro_batch_size, HIDDEN_SIZE]
consumed_samples = 0
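# Each iteration below re-derives the current number of microbatches from consumed_samples
# (per the rampup_batch_size schedule), patches the live batch sampler's
# local_minibatch_size accordingly, pulls one (possibly larger) minibatch, and runs the
# interleaved pipeline schedule on it.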
for i in range(NUM_ITERATIONS):
update_num_microbatches(consumed_samples, consistency_check=False)
local_batch_size = get_num_microbatches() * micro_batch_size
data_iter._index_sampler.local_minibatch_size = local_batch_size
local_mini_batch = next(data_iter)
_logger.info(
f"iter: {i} / {NUM_ITERATIONS} "
f"local batchsize: {get_num_samples(local_mini_batch)} "
f"consumed_samples: {consumed_samples} / {NUM_SAMPLES}"
)
_forward_backward_pipelining_with_interleaving(
fwd_step_func,
local_mini_batch,
model,
forward_only=forward_only,
tensor_shape=tensor_shape,
)
consumed_samples += (
parallel_state.get_data_parallel_world_size()
* get_num_microbatches()
* micro_batch_size
)
if not forward_only:
for m in model:
for p in m.parameters():
if p.grad is None:
raise RuntimeError("grad not found")
else:
optimizer.zero_grad(set_to_none=True)
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(TEST_SUCCESS_MESSAGE)
if __name__ == "__main__":
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
n_tests = 0
failures = []
initialize_distributed()
world_size = torch.distributed.get_world_size()
args = global_vars.get_args()
batch_size = args.global_batch_size
micro_batch_size = args.micro_batch_size
setup_microbatch_calculator(
args.rank,
args.rampup_batch_size,
args.global_batch_size,
args.micro_batch_size,
1, # args.data_parallel_size,
)
for BatchSamplerCls in (MegatronPretrainingSampler, MegatronPretrainingRandomSampler):
for forward_only in (False, True):
n_tests += 1
pipeline_model_parallel_size = world_size
try:
run_interleaved_with_dynamic_batch_size(
pipeline_model_parallel_size, forward_only, BatchSamplerCls,
)
except Exception as e:
msg = (
f"\tforward_only: {forward_only}\n"
f"pipeline rank: {parallel_state.get_pipeline_model_parallel_rank()}, "
f"virtual pipeline rank: {parallel_state.get_virtual_pipeline_model_parallel_rank()}\n"
f"{str(e)}"
)
raise RuntimeError(msg)
finally:
parallel_state.destroy_model_parallel()
print_separator("TEST RESULT")
if failures:
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print("\n".join(failures))
msg = f"{len(failures)} / {n_tests} cases failed"
raise RuntimeError(msg)
else:
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print("### PASS!")
# coding=utf-8
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -15,10 +15,10 @@
import torch
from apex.transformer import parallel_state
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import print_separator
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
global_vars.set_global_variables()
@@ -90,6 +90,8 @@ def test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size_):
if __name__ == '__main__':
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
initialize_distributed()
world_size = torch.distributed.get_world_size()
......
# coding=utf-8
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -18,11 +18,11 @@ from torch.nn.parameter import Parameter
from apex.transformer import parallel_state
from apex.transformer.tensor_parallel import layers
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import set_random_seed
from apex.transformer.testing.commons import print_separator
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
global_vars.set_global_variables()
@@ -584,7 +584,6 @@ def test_parallel_transformer_layer(tensor_model_parallel_size):
if __name__ == '__main__':
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
......
import torch
from apex.transformer import parallel_state
from apex.transformer.tensor_parallel import mappings
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import initialize_distributed
global_vars.set_global_variables()
@@ -48,6 +48,8 @@ def test__gather(args, tensor_model_parallel_size):
if __name__ == "__main__":
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
initialize_distributed()
world_size = torch.distributed.get_world_size()
......
from functools import partial
import logging
from typing import List
import torch
from apex.transformer import parallel_state
from apex.transformer.pipeline_parallel.schedules.common import _get_params_for_weight_decay_optimization
from apex.transformer.pipeline_parallel.schedules.common import build_model
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_with_interleaving import _forward_backward_pipelining_with_interleaving
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_without_interleaving import forward_backward_pipelining_without_interleaving
from apex.transformer.pipeline_parallel.utils import average_losses_across_data_parallel_group
from apex.transformer.pipeline_parallel.utils import get_ltor_masks_and_position_ids
from apex.transformer.pipeline_parallel.utils import setup_microbatch_calculator
from apex.transformer.pipeline_parallel.utils import update_num_microbatches
from apex.transformer.tensor_parallel import model_parallel_cuda_manual_seed
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import print_separator
from apex.transformer.testing.standalone_gpt import gpt_model_provider
from apex.transformer.log_util import get_transformer_logger, set_logging_level
set_logging_level(logging.NOTSET)
_logger = get_transformer_logger("megatron_gpt_pipeline_test")
global_vars.set_global_variables()
N_VOCAB = 8192
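# generate_batch draws random token ids with one extra position per sample so that
# get_batch below can shift them into (tokens, next-token labels).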
def generate_batch(batch_size, sequence_length):
size = batch_size, sequence_length + 1
int_tensor = torch.randint(low=0, high=N_VOCAB, size=size, dtype=torch.long).cuda()
return int_tensor,
# Ref: https://github.com/NVIDIA/Megatron-LM/blob/b31e1296354e979722627a6c4dedafe19b51fa97/pretrain_gpt.py#L44
def get_batch(int_tensors: List[torch.Tensor]):
data = int_tensors[0]
# Unpack.
tokens_ = data.long()
labels = tokens_[:, 1:].contiguous()
tokens = tokens_[:, :-1].contiguous()
# Get the masks and position ids.
attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
tokens,
N_VOCAB, # tokenizer.eod,
False, # args.reset_position_ids,
False, # args.reset_attention_mask,
False, # args.eod_mask_loss,
)
return tokens, labels, loss_mask, attention_mask, position_ids
# Ref: https://github.com/NVIDIA/Megatron-LM/blob/b31e1296354e979722627a6c4dedafe19b51fa97/pretrain_gpt.py#L75
def loss_func(loss_mask, output_tensor):
losses = output_tensor.float()
loss_mask = loss_mask.view(-1).float()
loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
# Reduce loss for logging.
averaged_loss = average_losses_across_data_parallel_group([loss])
return loss, {'lm loss': averaged_loss[0]}
# Ref: https://github.com/NVIDIA/Megatron-LM/blob/b31e1296354e979722627a6c4dedafe19b51fa97/pretrain_gpt.py#L86
# TODO (mkozuki): Currently I'm seeing no attribute `word_embeddings` which looks weird.
def forward_step(batch, model):
"""Forward step."""
tokens, labels, loss_mask, attention_mask, position_ids = get_batch(batch)
output_tensor = model(tokens, position_ids, attention_mask, labels=labels)
return output_tensor, partial(loss_func, loss_mask)
def run_gpt(pipeline_model_parallel_size, virtual_pipeline_model_parallel_size=None, forward_only=False):
parallel_state.initialize_model_parallel(1, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size)
model_parallel_cuda_manual_seed(42)
model = build_model(
gpt_model_provider, True,
virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size)
_logger.debug("building model")
assert isinstance(model, list)
assert len(model) == (1 if virtual_pipeline_model_parallel_size is None else virtual_pipeline_model_parallel_size)
_param_groups = _get_params_for_weight_decay_optimization(model)
torch.optim.Adam(_param_groups)
if parallel_state.is_pipeline_last_stage():
_logger.debug("checking `word_embeddings` existence")
for m in model:
assert hasattr(m, "word_embeddings")
args = global_vars.get_args()
if virtual_pipeline_model_parallel_size is None:
batch = generate_batch(args.global_batch_size, args.seq_length)
else:
batch = [generate_batch(args.global_batch_size, args.seq_length) for _ in range(virtual_pipeline_model_parallel_size)]
_logger.debug("preparing batch")
if virtual_pipeline_model_parallel_size is None:
fwd_bwd_func = forward_backward_pipelining_without_interleaving
else:
fwd_bwd_func = _forward_backward_pipelining_with_interleaving
_logger.debug(f"selecting forward_backward func: {fwd_bwd_func}")
tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
_logger.debug(f"`tensor_shape`: {tensor_shape}")
fwd_bwd_func(forward_step, batch, model, forward_only=forward_only, tensor_shape=tensor_shape)
_logger.debug(TEST_SUCCESS_MESSAGE)
if __name__ == "__main__":
initialize_distributed()
args = global_vars.get_args()
args.padded_vocab_size = N_VOCAB
setup_microbatch_calculator(
args.rank,
args.rampup_batch_size,
args.global_batch_size,
args.micro_batch_size,
1, # args.data_parallel_size,
)
update_num_microbatches(0, True)
print_separator("run GPT model")
try:
run_gpt(torch.distributed.get_world_size())
# TODO(mkozuki): handle exceptions correctly; for now they are simply swallowed here,
# since this test is not kicked off by CI
except Exception as e:
_logger.debug(str(e))
pass
finally:
parallel_state.destroy_model_parallel()
from typing import Optional, Union, List
import torch
import torch.nn as nn
import apex
from apex.transformer import parallel_state
from apex.transformer.pipeline_parallel import get_forward_backward_func
from apex.transformer.pipeline_parallel.schedules.common import _get_params_for_weight_decay_optimization
from apex.transformer.pipeline_parallel.schedules.common import build_model
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_no_pipelining import forward_backward_no_pipelining
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_with_interleaving import _forward_backward_pipelining_with_interleaving
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_without_interleaving import forward_backward_pipelining_without_interleaving
from apex.transformer.pipeline_parallel.utils import average_losses_across_data_parallel_group
from apex.transformer.pipeline_parallel.utils import setup_microbatch_calculator
from apex.transformer.pipeline_parallel.utils import update_num_microbatches
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import print_separator
from apex.transformer.log_util import get_transformer_logger, set_logging_level
# set_logging_level("INFO")
_logger = get_transformer_logger("pipeline_parallel_test")
global_vars.set_global_variables()
batch_size, micro_batch_size = None, None
hidden_size = 16
fwd_bwd_functions = {
"no_pipelining": forward_backward_no_pipelining,
"no_interleaving": forward_backward_pipelining_without_interleaving,
"interleaving": _forward_backward_pipelining_with_interleaving,
}
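# Rough behavior of the three schedules under test (a sketch, not a spec):
# - "no_pipelining": runs forward/backward for every microbatch on a single stage
#   and accumulates gradients locally, with no inter-stage communication.
# - "no_interleaving": each rank owns one contiguous slice of layers and
#   microbatches flow stage to stage.
# - "interleaving": each rank owns several smaller model chunks (virtual pipeline
#   stages) to shrink the pipeline bubble; requires
#   `virtual_pipeline_model_parallel_size` to be set.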
# note (mkozuki): `pre_process` and `post_process` are placeholders until an interleaving-schedule test is added.
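# In Megatron-style models, `pre_process` marks the first pipeline stage (which owns
# the embedding) and `post_process` the last stage (which owns the head/loss); the toy
# modules below only record the flags.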
class MyLayer(nn.Module):
def __init__(self, pre_process: bool, post_process: bool):
super().__init__()
self.pre_process = pre_process
self.post_process = post_process
self.layer = nn.Linear(hidden_size, hidden_size)
def forward(self, x):
return self.layer(x)
class MyModel(nn.Module):
def __init__(self, pre_process: bool = False, post_process: bool = False) -> None:
super().__init__()
self.pre_process = pre_process
self.post_process = post_process
self.layer = MyLayer(pre_process=pre_process, post_process=post_process)
self.input_tensor = None
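# Pipeline schedules call `set_input_tensor` on every stage except the first, so that
# `forward` consumes the activation received from the previous stage instead of `x`.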
def set_input_tensor(self, input_tensor: Union[torch.Tensor, List[torch.Tensor]]) -> None:
self.input_tensor = input_tensor
def forward(self, x: Optional[torch.Tensor]) -> torch.Tensor:
if self.input_tensor is None:
return self.layer(x)
return self.layer(self.input_tensor)
def model_provider_func(pre_process, post_process) -> MyModel:
return MyModel(pre_process, post_process)
def process_batch(batch):
if isinstance(batch, list):
x = batch[0]
else:
x = batch
return x
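# Forward-step contract expected by the schedules: return the stage output together
# with a callable that maps the last stage's output to (loss, reduced-metrics dict).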
def fwd_step_func(batch, model):
x = process_batch(batch)
y = model(x)
# note (mkozuki): This function is not pretty, but it is enough for now
# to sanity-check the ported pipeline functions.
def loss_func(x):
loss = torch.sum(x)
averaged_loss = average_losses_across_data_parallel_group([loss])
return loss, {'avg': averaged_loss}
return y, loss_func
# TODO (mkozuki): Add a case with `autocast` and `GradScaler`.
# Run forward & backward for one minibatch.
def forward_backward_func_template(
name: str,
forward_backward_func,
pipeline_model_parallel_size: int,
forward_only: bool,
) -> None:
print_separator(f"name: {name}, pipeline model parallel size: {pipeline_model_parallel_size}")
virtual_pipeline_model_parallel_size = 2 if name == "interleaving" else None
if name == "no_pipelining":
# note (mkozuki): `forward_backward_no_pipelining` is **NOT** compatible with
# pipeline_model_parallel_size > 1, so initialize with tensor and pipeline model
# parallel sizes of 1 (the remaining GPUs form the data parallel group).
parallel_state.initialize_model_parallel(1, 1, None)
else:
# NOTE (mkozuki): `virtual_pipeline_model_parallel_size` is required to enable the interleaving schedule.
# In Megatron, `args.virtual_pipeline_model_parallel_size` is computed in megatron/arguments.py and
# used throughout the code base, but this test uses a custom model, so it is safe to set it directly here.
parallel_state.initialize_model_parallel(
1, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size)
if virtual_pipeline_model_parallel_size is not None:
# Check the experimental warning message
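# `get_forward_backward_func` should select the interleaved schedule here (the virtual
# pipeline size is set) and emit its "experimental" warning; exercising that warning is
# the only purpose of this call.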
get_forward_backward_func(virtual_pipeline_model_parallel_size, pipeline_model_parallel_size)
pipeline_model_parallel_size = parallel_state.get_pipeline_model_parallel_world_size()
model = build_model(
model_provider_func,
wrap_with_ddp=True,
virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size,
)
assert isinstance(model, list)
assert len(model) == (1 if virtual_pipeline_model_parallel_size is None else virtual_pipeline_model_parallel_size)
_param_groups = _get_params_for_weight_decay_optimization(model)
torch.optim.Adam(_param_groups, lr=1e-4)
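# The input batch uses the full data-parallel-local batch as its leading dimension;
# `tensor_shape` is then switched to `micro_batch_size` because the schedule
# communicates one microbatch at a time between stages.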
tensor_shape = [batch_size // parallel_state.get_data_parallel_world_size(), hidden_size]
batch = (torch.randn(tensor_shape).cuda(),)
tensor_shape[0] = micro_batch_size
update_num_microbatches(0)
forward_backward_func(
fwd_step_func, batch, model, forward_only=forward_only, tensor_shape=tensor_shape)
if not forward_only:
for m in model:
for p in m.parameters():
if p.grad is None:
raise RuntimeError("grad not found")
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(TEST_SUCCESS_MESSAGE)
if __name__ == "__main__":
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
n_tests = 0
failures = []
initialize_distributed()
world_size = torch.distributed.get_world_size()
args = global_vars.get_args()
batch_size = args.global_batch_size
micro_batch_size = args.micro_batch_size
setup_microbatch_calculator(
args.rank,
args.rampup_batch_size,
args.global_batch_size,
args.micro_batch_size,
1, # args.data_parallel_size,
)
for forward_only in (True, False):
for name, forward_backward_func in fwd_bwd_functions.items():
n_tests += 1
# TODO (mkozuki): Test with data parallel size > 1.
pipeline_model_parallel_size = world_size
try:
forward_backward_func_template(
name,
forward_backward_func,
pipeline_model_parallel_size,
forward_only,
)
except Exception as e:
failures.append(
f"\t# {name} failed with pipeline size: {pipeline_model_parallel_size} "
f"and forward_only: {forward_only}\n"
f"pipeline rank: {parallel_state.get_pipeline_model_parallel_rank()}, "
f"virtual pipeline rank: {parallel_state.get_virtual_pipeline_model_parallel_rank()}\n"
f"{str(e)}"
)
else:
print_separator(f"{name} works")
finally:
parallel_state.destroy_model_parallel()
print_separator("TEST RESULT")
if failures:
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print("\n".join(failures))
msg = f"{len(failures)} / {n_tests} cases failed"
raise RuntimeError(msg)
else:
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print("### PASS!")
 # coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,10 +16,10 @@ import torch
 from apex.transformer import parallel_state
 from apex.transformer import tensor_parallel
-from apex.transformer.tensor_parallel.tests import global_vars
-from apex.transformer.tensor_parallel.tests.commons import print_separator
-from apex.transformer.tensor_parallel.tests.commons import initialize_distributed
-from apex.transformer.tensor_parallel.tests.commons import TEST_SUCCESS_MESSAGE
+from apex.transformer.testing import global_vars
+from apex.transformer.testing.commons import print_separator
+from apex.transformer.testing.commons import initialize_distributed
+from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE

 global_vars.set_global_variables()
@@ -188,6 +188,8 @@ def test_model_parallel_cuda_manual_seed(tensor_model_parallel_size):
 if __name__ == '__main__':
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
     initialize_distributed()
     world_size = torch.distributed.get_world_size()
@@ -15,6 +15,8 @@ def test_split_tensor_along_last_dim():
 if __name__ == "__main__":
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
     test_divide()
     test_split_tensor_along_last_dim()
     print(">> passed the test :-)")
from itertools import product
import unittest
import torch
from torch.utils.data import Dataset
from torch.utils.data import RandomSampler
from torch.utils.data import BatchSampler
from torch.utils.data import DataLoader
from apex.transformer.pipeline_parallel.utils import _split_batch_into_microbatch as split_batch_into_microbatch
class MyIterableDataset(Dataset):
def __init__(self, start, end):
super().__init__()
assert end > start, "this example code only works with end > start"
self.start = start
self.end = end
self.samples = list(range(self.start, self.end))
def __iter__(self):
return iter(range(self.start, self.end))
def __getitem__(self, index):
return self.samples[index]
class MegatronPretrainingRandomSampler:
def __init__(self, total_samples, consumed_samples, micro_batch_size,
data_parallel_rank, data_parallel_size):
# Keep a copy of input params for later use.
self.total_samples = total_samples
self.consumed_samples = consumed_samples
self.micro_batch_size = micro_batch_size
self.data_parallel_rank = data_parallel_rank
self.data_parallel_size = data_parallel_size
self.micro_batch_times_data_parallel_size = \
self.micro_batch_size * data_parallel_size
self.last_batch_size = \
self.total_samples % self.micro_batch_times_data_parallel_size
# Sanity checks.
assert self.total_samples > 0, \
'no sample to consume: {}'.format(self.total_samples)
assert self.micro_batch_size > 0
assert data_parallel_size > 0
assert self.data_parallel_rank < data_parallel_size, \
'data_parallel_rank should be smaller than data size: {}, ' \
'{}'.format(self.data_parallel_rank, data_parallel_size)
def __len__(self):
return self.total_samples
def __iter__(self):
active_total_samples = self.total_samples - self.last_batch_size
self.epoch = self.consumed_samples // active_total_samples
current_epoch_samples = self.consumed_samples % active_total_samples
assert current_epoch_samples % self.micro_batch_times_data_parallel_size == 0
# data sharding and random sampling
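# Each data-parallel rank draws from its own contiguous bucket of indices; the
# permutation is seeded with the epoch so a resumed run reproduces the same order,
# and `bucket_offset` skips the portion of the epoch that was already consumed.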
bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) * self.micro_batch_size
bucket_offset = current_epoch_samples // self.data_parallel_size
start_idx = self.data_parallel_rank * bucket_size
g = torch.Generator()
g.manual_seed(self.epoch)
random_idx = torch.randperm(bucket_size, generator=g).tolist()
idx_range = [start_idx + x for x in random_idx[bucket_offset:]]
batch = []
# Last batch if not complete will be dropped.
for idx in idx_range:
batch.append(idx)
if len(batch) == self.micro_batch_size:
self.consumed_samples += self.micro_batch_times_data_parallel_size
yield batch
batch = []
# Draws 8 samples in total:
# first two batches of 4 samples, then four batches of 2 samples.
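# The check below relies on the sampler's index stream being independent of the
# micro batch size: concatenating two batches of 4 must equal concatenating four
# batches of 2.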
class TestBatchSamplerBehavior(unittest.TestCase):
def test_batch_sampler_behavior(self):
dataset = MyIterableDataset(0, 100)
for num_workers in (1, 2, 4):
with self.subTest(f"{num_workers}"):
torch.manual_seed(42)
loader = DataLoader(dataset, batch_sampler=MegatronPretrainingRandomSampler(100, 0, 4, 0, 1), num_workers=num_workers)
samples = []
for i, batch in enumerate(loader):
samples.append(batch)
if i == 2 - 1:
break
torch.manual_seed(42)
loader = DataLoader(dataset, batch_sampler=MegatronPretrainingRandomSampler(100, 0, 2, 0, 1), num_workers=num_workers)
samples2 = []
for i, batch in enumerate(loader):
samples2.append(batch)
if i == 4 - 1:
break
torch.testing.assert_allclose(torch.cat(samples), torch.cat(samples2))
def test_split_batch(self):
class MyIterableDataset(Dataset):
def __init__(self, start, end):
super().__init__()
assert end > start, "this example code only works with end > start"
self.start = start
self.end = end
self.samples = list(range(self.start, self.end))
def __len__(self):
return self.end - self.start
def __iter__(self):
return iter(range(self.start, self.end))
def __getitem__(self, index):
return (torch.tensor([index, index]), torch.tensor([index // 2, index // 2]))
dataset = MyIterableDataset(0, 100)
torch.manual_seed(42)
global_batch_size = 16
loader = DataLoader(dataset, batch_sampler=MegatronPretrainingRandomSampler(100, 0, global_batch_size, 0, 1), num_workers=2)
batch = next(iter(loader))
# samples = None
# for i, batch in enumerate(loader):
# # samples = batch
# if i == 0:
# break
for _micro_batch_size in (1, 2, 4, 8):
microbatches = list(split_batch_into_microbatch(
batch,
_micro_batch_size=_micro_batch_size,
_global_batch_size=global_batch_size,
))
# print(batch)
# print(microbatches)
self.assertEqual(len(microbatches), global_batch_size // _micro_batch_size)
self.assertEqual(len(microbatches[0][0]), _micro_batch_size)
if __name__ == "__main__":
unittest.main()
+from typing import Tuple
+
 import os
 import subprocess
 import sys
 import unittest

-def run_mpu_tests():
+DENY_TEST = [
+    "megatron_gpt_pipeline",
+]
+MULTIGPU_TEST = [
+    "pipeline_parallel_test",
+    "dynamic_batchsize_test",
+]
+SEVERALGPU_TEST = [
+    "bert_minimal_test",
+]
+
+
+def get_multigpu_launch_option(min_gpu):
+    should_skip = False
+    import torch
+    num_devices = torch.cuda.device_count()
+    if num_devices < min_gpu:
+        should_skip = True
+    distributed_run_options = f"-m torch.distributed.run --nproc_per_node={num_devices}"
+    return should_skip, distributed_run_options
+
+
+def get_launch_option(test_filename) -> Tuple[bool, str]:
+    should_skip = False
+    for multigpu_test in MULTIGPU_TEST:
+        if multigpu_test in test_filename:
+            return get_multigpu_launch_option(2)
+    for severalgpu_test in SEVERALGPU_TEST:
+        if severalgpu_test in test_filename:
+            return get_multigpu_launch_option(3)
+    return should_skip, ""
+
+
+def run_transformer_tests():
     python_executable_path = sys.executable
     # repository_root = os.path.join(os.path.dirname(__file__), "../../../")
     # directory = os.path.abspath(os.path.join(repository_root, "tests/mpu"))
@@ -19,7 +51,28 @@ def run_mpu_tests():
     print("#######################################################")
     errors = []
     for i, test_file in enumerate(files, 1):
-        test_run_cmd = f"NVIDIA_TF32_OVERRIDE=0 {python_executable_path} {test_file} --micro-batch-size 2 --num-layers 1 --hidden-size 256 --num-attention-heads 8 --max-position-embeddings 32 --encoder-seq-length 32 --use-cpu-initialization"  # NOQA
+        is_denied = False
+        for deny_file in DENY_TEST:
+            if deny_file in test_file:
+                is_denied = True
+        if is_denied:
+            print(f"### {i} / {len(files)}: {test_file} skipped")
+            continue
+        should_skip, launch_option = get_launch_option(test_file)
+        if should_skip:
+            print(f"### {i} / {len(files)}: {test_file} skipped. Requires multiple GPUs.")
+            continue
+        test_run_cmd = (
+            f"{python_executable_path} {launch_option} {test_file} "
+            "--micro-batch-size 4 --num-layers 16 --hidden-size 768 --num-attention-heads 8 --max-position-embeddings "
+            "512 --seq-length 512 --global-batch-size 256"
+        )
+        if 'bert' in test_file:
+            import torch
+            num_devices = torch.cuda.device_count()
+            test_run_cmd += f" --pipeline-model-parallel-size {num_devices}"
+        else:
+            test_run_cmd += f" --use-cpu-initialization"
         print(f"### {i} / {len(files)}: cmd: {test_run_cmd}")
         try:
             output = subprocess.check_output(
@@ -29,7 +82,7 @@ def run_mpu_tests():
            errors.append((test_file, str(e)))
        else:
            if '>> passed the test :-)' not in output:
-                errors.append(test_file, output)
+                errors.append((test_file, output))
    else:
        if not errors:
            print("### PASSED")
@@ -42,10 +95,10 @@ def run_mpu_tests():
            raise RuntimeError(short_msg)


-class TestMPU(unittest.TestCase):
-    def test_mpu(self):
-        run_mpu_tests()
+class TestTransformer(unittest.TestCase):
+    def test_transformer(self):
+        run_transformer_tests()


 if __name__ == '__main__':