Project: OpenDAS / apex, commit db92ee13 (unverified)
Authored Dec 14, 2021 by Jithun Nair; committed by GitHub on Dec 14, 2021.
Merge pull request #64 from ROCmSoftwarePlatform/IFU-master-2021-12-08

IFU-master-2021-12-08

Parents: d150afdc, 68364b49
Changes: 98
Showing 18 changed files with 1873 additions and 39 deletions (+1873, -39):
  csrc/multi_tensor_l2norm_kernel_mp.cu                     +216  -0
  csrc/multi_tensor_lamb_mp.cu                              +496  -0
  setup.py                                                  +8    -2
  tests/L0/run_fused_layer_norm/test_fused_layer_norm.py    +68   -2
  tests/L0/run_optimizers/test_lamb.py                      +70   -4
  tests/L0/run_transformer/run_bert_minimal_test.py         +170  -0
  tests/L0/run_transformer/run_cross_entropy_test.py        +12   -7
  tests/L0/run_transformer/run_data_test.py                 +7    -5
  tests/L0/run_transformer/run_dynamic_batchsize_test.py    +211  -0
  tests/L0/run_transformer/run_initialize_test.py           +7    -5
  tests/L0/run_transformer/run_layers_test.py               +6    -7
  tests/L0/run_transformer/run_mappings_test.py             +4    -2
  tests/L0/run_transformer/run_megatron_gpt_pipeline.py     +135  -0
  tests/L0/run_transformer/run_pipeline_parallel_test.py    +200  -0
  tests/L0/run_transformer/run_random_test.py               +7    -5
  tests/L0/run_transformer/run_utils_test.py                +2    -0
  tests/L0/run_transformer/test_batch_sampler.py            +149  -0
  tests/L0/run_transformer/test_transformer_module.py       +105  -0
csrc/multi_tensor_l2norm_kernel_mp.cu (new file, mode 100644)

#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
// Another possibility:
// #include <torch/all.h>

#include <assert.h>

#include "type_shim.h"
#include "multi_tensor_apply.cuh"

#define BLOCK_SIZE 512
#define ILP 4

template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
  return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}

template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}

template<typename x_t>
struct L2NormFunctor
{
  __device__ __forceinline__ void operator()(
    int chunk_size,
    volatile int* noop_gmem,
    TensorListMetadata<1>& tl,
    float* output,
    float* output_per_tensor,
    bool per_tensor,
    int max_chunks_per_tensor)
  {
    if (*noop_gmem) { return; }

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    x_t* x = (x_t*)tl.addresses[0][tensor_loc];
    x += chunk_idx*chunk_size;

    n -= chunk_idx*chunk_size;

    __shared__ float s_vals[512];

    float vals[ILP]; // = {0}; // this probably works too but I want to be sure...
    x_t r_x[ILP];
    for(int i = 0; i < ILP; i++)
    {
      vals[i] = 0.f;
      r_x[i] = 0;
    }

    // to make things simple, we put aligned case in a different code path
    if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x))
    {
      for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
      {
        // load
        load_store(r_x, x, 0, i_start);
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          float next = static_cast<float>(r_x[ii]);
          vals[ii] += next*next;
        }
      }
    }
    else
    {
      for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
      {
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          int i = i_start + threadIdx.x + ii*blockDim.x;
          if(i < n && i < chunk_size)
          {
            float next = static_cast<float>(x[i]);
            vals[ii] += next*next;
          }
        }
      }
    }

    float val = 0.f;
    for(int i = 0; i < ILP; i++)
      val += vals[i];

    float final = reduce_block_into_lanes(s_vals, val);

    if(threadIdx.x == 0)
    {
      if(!isfinite(final))
        *noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
      output[blockIdx.x] += final;
      if(per_tensor)
        output_per_tensor[(tl.start_tensor_this_launch + tensor_loc)*max_chunks_per_tensor + chunk_idx] = final;
    }
  }
};

__global__ void cleanup(
  float* output,
  float* output_per_tensor,
  float* ret,
  float* ret_per_tensor,
  bool per_tensor,
  int max_chunks_per_tensor,
  volatile int* noop_gmem)
{
  if (*noop_gmem) { return; }

  __shared__ float vals[512];

  if(blockIdx.x == 0)
  {
    float val = 0;
    if(threadIdx.x < 320)
      val = output[threadIdx.x];

    float final = reduce_block_into_lanes(vals, val);

    if(threadIdx.x == 0)
      *ret = sqrt(final);
  }

  if(per_tensor)
  {
    float* output_this_tensor = output_per_tensor + blockIdx.x*max_chunks_per_tensor;

    float val = 0;
    for(int i = threadIdx.x; i < max_chunks_per_tensor; i += blockDim.x)
      val += output_this_tensor[i];

    float final = reduce_block_into_lanes(vals, val);

    if(threadIdx.x == 0)
      ret_per_tensor[blockIdx.x] = sqrt(final);
  }
}

std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_mp_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  at::optional<bool> per_tensor_python)
{
  bool per_tensor = per_tensor_python.has_value() ? per_tensor_python.value() : false;

  auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
  auto output = at::zeros({320}, float_options);

  at::Tensor output_per_tensor;
  at::Tensor ret_per_tensor;

  int ntensors = tensor_lists[0].size();
  int max_chunks_per_tensor = -1;

  if(per_tensor)
  {
    for(int t = 0; t < ntensors; t++)
    {
      int max_chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
      if(max_chunks_this_tensor > max_chunks_per_tensor)
        max_chunks_per_tensor = max_chunks_this_tensor;
    }
    output_per_tensor = at::zeros({ntensors*max_chunks_per_tensor}, float_options);
    ret_per_tensor = at::empty({ntensors}, float_options);
  }
  else
  {
    ret_per_tensor = at::empty({0}, float_options);
  }

  DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "multi_tensor_l2norm_mp_cuda",
    multi_tensor_apply<1>(
      BLOCK_SIZE,
      chunk_size,
      noop_flag,
      tensor_lists,
      L2NormFunctor<scalar_t_0>(),
      output.data_ptr<float>(),
      per_tensor ? output_per_tensor.data_ptr<float>() : nullptr,
      per_tensor,
      max_chunks_per_tensor);)

  AT_CUDA_CHECK(cudaGetLastError());
  // AT_CUDA_CHECK(cudaDeviceSynchronize());

  // This involves one more small kernel launches, but will be negligible end to end.
  // I could get rid of these by hacking the functor + multi tensor harness with persistence
  // logic, but keeping it simple for now
  auto ret = at::empty({1}, output.options());
  const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
  auto stream = at::cuda::getCurrentCUDAStream();
  cleanup<<<per_tensor ? ntensors : 1, 512, 0, stream>>>(
    output.data_ptr<float>(),
    per_tensor ? output_per_tensor.data_ptr<float>() : nullptr,
    ret.data_ptr<float>(),
    per_tensor ? ret_per_tensor.data_ptr<float>() : nullptr,
    per_tensor,
    max_chunks_per_tensor,
    noop_flag.data_ptr<int>());

  return std::tuple<at::Tensor, at::Tensor>(ret, ret_per_tensor);
}
csrc/multi_tensor_lamb_mp.cu (new file, mode 100644)

#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>

#include <assert.h>

#include "type_shim.h"
#include "multi_tensor_apply.cuh"

#define BLOCK_SIZE 512
#define ILP 4

template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
  return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}

template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}

typedef enum{
  MOMENT_MODE_0   =0, // L2 regularization mode
  MOMENT_MODE_1   =1  // Decoupled weight decay mode
} adamMode_t;

std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_mp_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  at::optional<bool> per_tensor_python);

using MATH_T = float;

template<typename T, typename param_t>
struct LAMBStage1Functor
{
  __device__ __forceinline__ void operator()(
    int chunk_size,
    volatile int* noop_gmem,
    TensorListMetadata<4>& tl,
    const float beta1,
    const float beta2,
    const float beta3,
    const int* step_ptr,
    const int bias_correction,
    const float epsilon,
    adamMode_t mode,
    const float decay,
    const float* global_grad_norm,
    const float* max_global_grad_norm,
    const float* found_inf,
    const float* inv_scale)
  {
    if (*noop_gmem) { return; }

    float beta1_correction = 1.0f;
    float beta2_correction = 1.0f;
    if (bias_correction == 1) {
      int step = *step_ptr;
      beta1_correction = 1 - std::pow(beta1, step);
      beta2_correction = 1 - std::pow(beta2, step);
    }

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    float clipped_global_grad_norm = (*global_grad_norm) > (*max_global_grad_norm) ? (*global_grad_norm) / (*max_global_grad_norm) : 1.0f;

    T* g = (T*)tl.addresses[0][tensor_loc];
    g += chunk_idx*chunk_size;

    param_t* p = (param_t*)tl.addresses[1][tensor_loc];
    p += chunk_idx*chunk_size;

    param_t* m = (param_t*)tl.addresses[2][tensor_loc];
    m += chunk_idx*chunk_size;

    param_t* v = (param_t*)tl.addresses[3][tensor_loc];
    v += chunk_idx*chunk_size;

    n -= chunk_idx*chunk_size;

    MATH_T r_g[ILP];
    MATH_T r_p[ILP];
    MATH_T r_m[ILP];
    MATH_T r_v[ILP];
    // to make things simple, we put aligned case in a different code path
    if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(g) && is_aligned(p) && is_aligned(m) && is_aligned(v))
    {
      T l_g[ILP];
      param_t l_p[ILP];
      param_t l_m[ILP];
      param_t l_v[ILP];
      for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
      {
        // load
        load_store(l_g, g, 0, i_start);
        if (decay != 0)
          load_store(l_p, p, 0, i_start);
        load_store(l_m, m, 0, i_start);
        load_store(l_v, v, 0, i_start);
        // unpack
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          r_g[ii] = l_g[ii] * (*inv_scale);
          if (decay == 0) {
            r_p[ii] = MATH_T(0);
          }
          else {
            r_p[ii] = l_p[ii];
          }
          r_m[ii] = l_m[ii];
          r_v[ii] = l_v[ii];
        }
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          if (mode == MOMENT_MODE_0) {
            MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
            // L2 on scaled grad
            scaled_grad = scaled_grad + decay*r_p[ii];
            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
            r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
            r_p[ii] = next_m_unbiased / denom;
          }
          else {
            MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
            r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
            r_p[ii] = (next_m_unbiased/denom) + (decay*r_p[ii]);
          }
        }
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          l_p[ii] = r_p[ii];
          // Difference from APEX's LAMB kernel. `g` and `p` can be different dtypes.
          l_g[ii] = r_p[ii];
          l_m[ii] = r_m[ii];
          l_v[ii] = r_v[ii];
        }
        // store
        load_store(g, l_g, i_start, 0);
        load_store(m, l_m, i_start, 0);
        load_store(v, l_v, i_start, 0);
      }
    }
    else
    {
      // see note in multi_tensor_scale_kernel.cu
      for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
      {
        MATH_T r_g[ILP];
        MATH_T r_p[ILP];
        MATH_T r_m[ILP];
        MATH_T r_v[ILP];
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          int i = i_start + threadIdx.x + ii*blockDim.x;
          if(i < n && i < chunk_size)
          {
            r_g[ii] = g[i] * (*inv_scale);
            // special ?optimization? for lamb stage 1
            if (decay == 0) {
              r_p[ii] = MATH_T(0);
            }
            else {
              r_p[ii] = p[i];
            }
            r_m[ii] = m[i];
            r_v[ii] = v[i];
          } else {
            r_g[ii] = MATH_T(0);
            r_p[ii] = MATH_T(0);
            r_m[ii] = MATH_T(0);
            r_v[ii] = MATH_T(0);
          }
        }
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          if (mode == MOMENT_MODE_0) {
            MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
            // L2 on scaled grad
            scaled_grad = scaled_grad + decay*r_p[ii];
            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
            r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
            r_p[ii] = next_m_unbiased / denom;
          }
          else {
            MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
            r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
            r_p[ii] = (next_m_unbiased/denom) + (decay*r_p[ii]);
          }
        }
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          int i = i_start + threadIdx.x + ii*blockDim.x;
          if(i < n && i < chunk_size)
          {
            g[i] = r_p[ii];
            m[i] = r_m[ii];
            v[i] = r_v[ii];
          }
        }
      }
    }
  }
};

// Step 2 reads in 'update' value and per-tensor param_norm and update_norm.
// It computes new parameter value.
// N == 2: FP32 params, no master params
// N == 3: FP16 params, FP32 master params.
template<typename T, int N, typename param_t>
struct LAMBStage2Functor
{
  static_assert((N == 2 && std::is_same<T, param_t>::value) || (N == 3 && std::is_same<param_t, float>::value), "");
  __device__ __forceinline__ void operator()(
    int chunk_size,
    volatile int* noop_gmem,
    TensorListMetadata<N>& tl,
    const float* per_tensor_param_norm,
    const float* per_tensor_update_norm,
    const float* learning_rate,
    const float decay,
    bool use_nvlamb)
  {
    if (*noop_gmem) { return; }

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int tensor_num = tl.start_tensor_this_launch + tensor_loc;
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    MATH_T ratio = *learning_rate;
    // nvlamb: apply adaptive learning rate to all parameters
    // otherwise, only apply to those with non-zero weight decay
    if (use_nvlamb || (decay != 0.0))
    {
      float param_norm = per_tensor_param_norm[tensor_num];
      float update_norm = per_tensor_update_norm[tensor_num];
      ratio = (update_norm != 0.0f && param_norm != 0.0f) ? *learning_rate * (param_norm / update_norm) : *learning_rate;
    }

    T* update = (T*)tl.addresses[0][tensor_loc];
    update += chunk_idx*chunk_size;

    param_t* p = (param_t*)tl.addresses[1][tensor_loc];
    p += chunk_idx*chunk_size;

    T* out_p;
    if (N == 3) {
      out_p = (T*)tl.addresses[2][tensor_loc];
      out_p += chunk_idx*chunk_size;
    }

    n -= chunk_idx*chunk_size;

    // to make things simple, we put aligned case in a different code path
    bool can_use_aligned_path = n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(p) && is_aligned(update);
    if (N == 3) {
      can_use_aligned_path = can_use_aligned_path && is_aligned(out_p);
    }
    if(can_use_aligned_path)
    {
      param_t r_p[ILP];
      T r_update[ILP];
      T r_out_p[ILP];
      for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
      {
        // load
        load_store(r_p, p, 0, i_start);
        load_store(r_update, update, 0, i_start);
        if (N == 3) {
          load_store(r_out_p, out_p, 0, i_start);
        }
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          r_p[ii] = static_cast<MATH_T>(r_p[ii]) - (ratio * static_cast<MATH_T>(r_update[ii]));
          if (N == 3) {
            r_out_p[ii] = r_p[ii];
          }
        }
        load_store(p, r_p, i_start, 0);
        if (N == 3) {
          load_store(out_p, r_out_p, i_start, 0);
        }
      }
    }
    else
    {
      for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
      {
        MATH_T r_p[ILP];
        MATH_T r_update[ILP];
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          int i = i_start + threadIdx.x + ii*blockDim.x;
          if(i < n && i < chunk_size)
          {
            r_p[ii] = p[i];
            r_update[ii] = update[i];
          }
        }
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          r_p[ii] = r_p[ii] - (ratio * r_update[ii]);
        }
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          int i = i_start + threadIdx.x + ii*blockDim.x;
          if(i < n && i < chunk_size)
          {
            p[i] = r_p[ii];
            if (N == 3) {
              out_p[i] = r_p[ii];
            }
          }
        }
      }
    }
  }
};

void multi_tensor_lamb_mp_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  at::Tensor lr,
  const float beta1,
  const float beta2,
  const float epsilon,
  at::Tensor step,
  const int bias_correction,
  const float weight_decay,
  const int grad_averaging,
  const int mode,
  at::Tensor global_grad_norm,
  at::Tensor max_grad_norm,
  at::optional<bool> use_nvlamb_python,
  at::Tensor found_inf,
  at::Tensor inv_scale)
{
  // n_tensors == 5: FP16 model params & FP32 master params
  // n_tensors == 4: FP32 model params & NO FP32 master params
  const auto n_tensors = tensor_lists.size();
  assert(n_tensors == 4 || n_tensors == 5);
  using namespace at;

  bool use_nvlamb = use_nvlamb_python.has_value() ? use_nvlamb_python.value() : false;

  // note(mkozuki): move bias handling below to functor
  // Handle bias correction mode
  // float bias_correction1 = 1.0f, bias_correction2 = 1.0f;
  // if (bias_correction == 1) {
  //   bias_correction1 = 1 - std::pow(beta1, step);
  //   bias_correction2 = 1 - std::pow(beta2, step);
  // }

  // Handle grad averaging mode
  float beta3 = 1.0f;
  if (grad_averaging == 1) beta3 = 1 - beta1;

  std::vector<std::vector<at::Tensor>> stage1_tensor_lists(tensor_lists.begin(), tensor_lists.begin() + 4);
  std::vector<std::vector<at::Tensor>> grad_list(tensor_lists.begin(), tensor_lists.begin()+1);
  std::vector<std::vector<at::Tensor>> param_list(tensor_lists.begin()+1, tensor_lists.begin()+2);

  // Compute per tensor param norm
  auto param_norm_tuple = multi_tensor_l2norm_mp_cuda(chunk_size, noop_flag, param_list, true);

  // We now in-place modify grad to store update before compute its norm
  // Generally this is not a issue since people modify grad in step() method all the time
  // We can also grab list of empty tensor to avoid this, but I'd like to save space/cpu code
  if (n_tensors == 4) {
    DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "lamb_stage_1",
        multi_tensor_apply<4>(
          BLOCK_SIZE,
          chunk_size,
          noop_flag,
          stage1_tensor_lists,
          LAMBStage1Functor<scalar_t_0, scalar_t_0>(),
          beta1,
          beta2,
          beta3, // 1-beta1 or 1 depends on averaging mode
          // bias_correction1,
          // bias_correction2,
          step.data_ptr<int>(),
          bias_correction,
          epsilon,
          (adamMode_t) mode,
          weight_decay,
          global_grad_norm.data_ptr<float>(),
          max_grad_norm.data_ptr<float>(),
          found_inf.data_ptr<float>(),
          inv_scale.data_ptr<float>()); )
  } else {
    DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "lamb_stage_1",
        multi_tensor_apply<4>(
          BLOCK_SIZE,
          chunk_size,
          noop_flag,
          stage1_tensor_lists,
          LAMBStage1Functor<scalar_t_0, float>(),
          beta1,
          beta2,
          beta3, // 1-beta1 or 1 depends on averaging mode
          // bias_correction1,
          // bias_correction2,
          step.data_ptr<int>(),
          bias_correction,
          epsilon,
          (adamMode_t) mode,
          weight_decay,
          global_grad_norm.data_ptr<float>(),
          max_grad_norm.data_ptr<float>(),
          found_inf.data_ptr<float>(),
          inv_scale.data_ptr<float>()); )
  }

  // Compute update norms
  auto update_norm_tuple = multi_tensor_l2norm_mp_cuda(chunk_size, noop_flag, grad_list, true);

  std::vector<std::vector<at::Tensor>> grad_param_list(tensor_lists.begin(), tensor_lists.begin()+2);

  if (n_tensors == 4) {
    DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "lamb_stage_2",
        multi_tensor_apply<2>(
          BLOCK_SIZE,
          chunk_size,
          noop_flag,
          grad_param_list,
          LAMBStage2Functor<scalar_t_0, 2, scalar_t_0>(),
          std::get<1>(param_norm_tuple).data_ptr<float>(),
          std::get<1>(update_norm_tuple).data_ptr<float>(),
          lr.data_ptr<float>(),
          weight_decay,
          use_nvlamb); )
  } else {
    grad_param_list.push_back(tensor_lists[4]);
    DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "lamb_stage_2",
        multi_tensor_apply<3>(
          BLOCK_SIZE,
          chunk_size,
          noop_flag,
          grad_param_list,
          LAMBStage2Functor<scalar_t_0, 3, float>(),
          std::get<1>(param_norm_tuple).data_ptr<float>(),
          std::get<1>(update_norm_tuple).data_ptr<float>(),
          lr.data_ptr<float>(),
          weight_decay,
          use_nvlamb); )
  }

  AT_CUDA_CHECK(cudaGetLastError());
}
setup.py (modified)

@@ -197,13 +197,15 @@ if "--cuda_ext" in sys.argv:
                                     'csrc/multi_tensor_scale_kernel.cu',
                                     'csrc/multi_tensor_axpby_kernel.cu',
                                     'csrc/multi_tensor_l2norm_kernel.cu',
+                                    'csrc/multi_tensor_l2norm_kernel_mp.cu',
                                     'csrc/multi_tensor_l2norm_scale_kernel.cu',
                                     'csrc/multi_tensor_lamb_stage_1.cu',
                                     'csrc/multi_tensor_lamb_stage_2.cu',
                                     'csrc/multi_tensor_adam.cu',
                                     'csrc/multi_tensor_adagrad.cu',
                                     'csrc/multi_tensor_novograd.cu',
-                                    'csrc/multi_tensor_lamb.cu'],
+                                    'csrc/multi_tensor_lamb.cu',
+                                    'csrc/multi_tensor_lamb_mp.cu'],
                       include_dirs=[os.path.join(this_dir, 'csrc')],
                       extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
                                           'nvcc': nvcc_args_multi_tensor if not IS_ROCM_PYTORCH else hipcc_args_multi_tensor}))

@@ -390,7 +392,11 @@ if "--fast_layer_norm" in sys.argv:
                                               '-gencode', 'arch=compute_70,code=sm_70',
                                               '-U__CUDA_NO_HALF_OPERATORS__',
                                               '-U__CUDA_NO_HALF_CONVERSIONS__',
-                                              '-Iapex/contrib/csrc/layer_norm',
+                                              '-U__CUDA_NO_BFLOAT16_OPERATORS__',
+                                              '-U__CUDA_NO_BFLOAT16_CONVERSIONS__',
+                                              '-U__CUDA_NO_BFLOAT162_OPERATORS__',
+                                              '-U__CUDA_NO_BFLOAT162_CONVERSIONS__',
+                                              '-I./apex/contrib/csrc/layer_norm/',
                                               '--expt-relaxed-constexpr',
                                               '--expt-extended-lambda',
                                               '--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag},
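Editor's note: setup.py now compiles the two new sources into the same multi-tensor CUDA extension as the existing kernels, and adds the bfloat16-related nvcc flags for the --fast_layer_norm build. After rebuilding apex with the CUDA extensions enabled, a quick sanity check like the sketch below can confirm the new bindings are present; the extension and binding names are assumptions here, since the frontend registration is not shown in this diff, and the sketch is not part of the commit.

import importlib

amp_C = importlib.import_module("amp_C")  # compiled multi-tensor extension (assumed module name)
for name in ("multi_tensor_l2norm_mp", "multi_tensor_lamb_mp"):
    print(name, "available:", hasattr(amp_C, name))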
tests/L0/run_fused_layer_norm/test_fused_layer_norm.py (modified)

@@ -36,9 +36,75 @@ class TestFusedLayerNorm(unittest.TestCase):
 class TestFusedLayerNormElemWise(TestFusedLayerNorm):
     elementwise_affine = True


+class TestFusedLayerNormElemWiseHalf(TestFusedLayerNormElemWise):
+    dtype = torch.half
+
+    def test_large_batch(self):
+        self.skipTest("Skip to save time")
+
+
+class TestFusedLayerNormElemWiseBFloat16(TestFusedLayerNormElemWise):
+    dtype = torch.bfloat16
+    # NOTE (mkozuki): [BFloat16 Layer Norm flakiness]
+    # Use thresholds larger than those used in pytorch, see
+    # https://github.com/pytorch/pytorch/blob/72274e2a2fd55019ec860e1743dbdc5b0c5a5624/torch/testing/_asserts.py#L26
+    fwd_thresholds = dict(rtol=1.6e-2, atol=3e-4)
+    bwd_thresholds = dict(rtol=1.6e-2, atol=3e-3)
+
+    def test_large_batch(self):
+        self.skipTest("Skip to save time")
+
+
+def _prep_layers(normalized_shape, elementwise_affine, dtype):
+    native = torch.nn.LayerNorm(
+        normalized_shape=normalized_shape, elementwise_affine=elementwise_affine
+    ).to(device="cuda", dtype=dtype)
+    fused = apex.normalization.FusedLayerNorm(
+        normalized_shape=normalized_shape, elementwise_affine=elementwise_affine
+    ).cuda()
+    return native, fused
+
+
+def _prep_inputs(batch_size, normalized_shape, dtype):
+    shape = (batch_size, *normalized_shape)
+    fused = torch.randn(shape).cuda().requires_grad_(True)
+    with torch.no_grad():
+        native = fused.clone().to(dtype).requires_grad_(True)
+    return native, fused
+
+
+TORCH_MAJOR, TORCH_MINOR = int(torch.__version__.split('.')[0]), int(torch.__version__.split('.')[1])
+if (TORCH_MAJOR <= 1 and TORCH_MINOR < 10):
+    autocast_dtypes = (torch.half,)
+else:
+    autocast_dtypes = (torch.half, torch.bfloat16) if torch.cuda.is_bf16_supported() else (torch.half,)
+
+
 class TestAutocastFusedLayerNorm(unittest.TestCase):
+    bf16_fwd_thresholds = dict(rtol=1.6e-2, atol=3e-4)
+    bf16_bwd_thresholds = dict(rtol=1.6e-2, atol=3e-3)
+
     def setUp(self):
-        self.module_cpu_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=True).cpu()
-        self.module_cuda_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=True).cuda()
+        self.batch_size = 16
+        self.normalized_shape = [32, 16]
+
+    def _run_test(self, dtype, elementwise_affine):
+        native, fused = _prep_layers(self.normalized_shape, elementwise_affine, dtype)
+        native_x, fused_x = _prep_inputs(self.batch_size, self.normalized_shape, dtype)
+
+        expected = native(native_x)
+        with torch.cuda.amp.autocast(dtype=dtype):
+            actual = fused(fused_x)
+        tols = {'rtol': None, 'atol': None} if dtype == torch.half else TestAutocastFusedLayerNorm.bf16_fwd_thresholds
+        torch.testing.assert_allclose(actual, expected, **tols)
+
+        g_native = torch.rand_like(expected)
+        with torch.no_grad():
+            g_fused = g_native.clone()
+        expected.backward(g_native)
+        actual.backward(g_fused)

 if __name__ == '__main__':
tests/L0/run_optimizers/test_lamb.py (modified)

@@ -144,14 +144,14 @@ class RefLAMB(Optimizer):
         return loss


-class TestFusedLAMB(unittest.TestCase):
+class TestLamb(unittest.TestCase):
     def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
         self.max_abs_diff = max_abs_diff
         self.max_rel_diff = max_rel_diff
         self.iters = iters
         torch.cuda.manual_seed(9876)

     def tearDown(self):
         pass

@@ -162,8 +162,8 @@ class TestFusedLAMB(unittest.TestCase):
             ref_param.append(torch.nn.Parameter(tensor.clone()))
             tst_param.append(torch.nn.Parameter(tensor.clone()))

-        ref_optim = RefLAMB(ref_param, **lamb_option)
-        tst_optim = apex.optimizers.FusedLAMB(tst_param, use_nvlamb=True, **lamb_option)
+        ref_optim = self.ref_optim(ref_param, **lamb_option)
+        tst_optim = self.tst_optim(tst_param, use_nvlamb=True, **lamb_option)

         return (ref_param, tst_param, ref_optim, tst_optim)

@@ -211,6 +211,13 @@ class TestFusedLAMB(unittest.TestCase):
                 self.assertLessEqual(max_abs_diff, self.max_abs_diff)
                 self.assertLessEqual(max_rel_diff, self.max_rel_diff)

+
+class TestFusedLAMB(TestLamb):
+    def __init__(self, *args, **kwargs):
+        super(TestLamb, self).__init__(*args, **kwargs)
+        self.ref_optim = RefLAMB
+        self.tst_optim = apex.optimizers.FusedLAMB
+
     def test_float(self):
         self.gen_single_type_test(param_type=torch.float)

@@ -264,6 +271,65 @@ class TestFusedLAMB(unittest.TestCase):
                 self.assertLessEqual(max_abs_diff, self.max_abs_diff)
                 self.assertLessEqual(max_rel_diff, self.max_rel_diff)

+
+class TestFusedMixedPrecisionLamb(TestLamb):
+    def __init__(self, *args, **kwargs):
+        super(TestLamb, self).__init__(*args, **kwargs)
+        self.ref_optim = RefLAMB
+        self.tst_optim = apex.optimizers.FusedMixedPrecisionLamb
+
+    def test_float(self):
+        self.gen_single_type_test(param_type=torch.float)
+
+    @unittest.skip("PyTorch optimizer is not numerically correct for fp16")
+    def test_half(self):
+        self.gen_single_type_test(param_type=torch.float16)
+
+    @unittest.skipIf(torch.cuda.device_count() < 2, "more than 1 GPU required")
+    def test_multi_device(self):
+        devices = ("cuda:0", "cuda:1")
+        for current_dev, tensor_dev in product(devices, devices):
+            with torch.cuda.device(current_dev):
+                self.gen_single_type_test(param_type=torch.float, device=tensor_dev)
+
+    def test_multi_params(self):
+        sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
+        weight_decay = [0, 0.01]
+
+        for wd in weight_decay:
+            lamb_option = {'lr': 5e-4, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': wd}
+            tensors = []
+            for size in sizes:
+                tensors.append(torch.rand(size, dtype=torch.float, device='cuda'))
+            ref_param, tst_param, ref_optim, tst_optim = \
+                self.gen_param_optim(tensors, lamb_option)
+
+            for i in range(self.iters):
+                self.gen_grad(ref_param, tst_param)
+                ref_optim.step()
+                tst_optim.step()
+                max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
+                self.assertLessEqual(max_abs_diff, self.max_abs_diff)
+                self.assertLessEqual(max_rel_diff, self.max_rel_diff)
+
+    def test_lamb_option(self):
+        nelem = 1
+        tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
+        weight_decay = [0, 0.01]
+
+        for wd in weight_decay:
+            lamb_option = {'lr': 0.01, 'betas': (0.6, 0.9), 'eps': 3e-06, 'weight_decay': wd}
+            ref_param, tst_param, ref_optim, tst_optim = \
+                self.gen_param_optim([tensor], lamb_option)
+
+            for i in range(self.iters):
+                self.gen_grad(ref_param, tst_param)
+                ref_optim.step()
+                tst_optim.step()
+                max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
+                self.assertLessEqual(max_abs_diff, self.max_abs_diff)
+                self.assertLessEqual(max_rel_diff, self.max_rel_diff)
+

 if __name__ == '__main__':
     script_path = os.path.dirname(os.path.realpath(__file__))
tests/L0/run_transformer/run_bert_minimal_test.py (new file, mode 100644)

import random

import torch

from apex.transformer import tensor_parallel
from apex.transformer import parallel_state
from apex.transformer.tensor_parallel import vocab_parallel_cross_entropy
from apex.transformer.pipeline_parallel.utils import setup_microbatch_calculator
from apex.transformer.pipeline_parallel.utils import average_losses_across_data_parallel_group
from apex.transformer.pipeline_parallel.schedules import get_forward_backward_func
from apex.transformer.pipeline_parallel.schedules.common import build_model
from apex.transformer.pipeline_parallel.schedules.common import _get_params_for_weight_decay_optimization
from apex.transformer.testing.standalone_bert import bert_model_provider
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import print_separator


mode = None
MANUAL_SEED = 42
inds = None
masks = None
data_idx = 0
MASK_PROB = 0.1
EASY_MODE = False
EASY_MODE_SIZ = 32
ONCE = False


# download a public domain book as corpus
def download_fancy_data():
    #import requests
    #response = requests.get('https://internet.com/book.txt')
    #text = ' '.join(response.text.split())
    text = """
    An original sentence not subject to any license restrictions, copyright, or royalty payments. Nothing to see here. Commercial or non-commercial use. Research or non-research purposes. The quick brown fox jumps over the lazy dog. Lorem ipsum.
    """
    text = text * 1024
    encoded = text.encode('ascii', 'replace')
    ints = [int(encoded[i]) for i in range(len(encoded))]
    return torch.tensor(ints)


# build a batch given sequence_len and batch size
def generate_fancy_data_labels(sequence_len, batch_size):
    global data_idx
    global inds
    global masks
    global MANUAL_SEED
    temps = list()
    for i in range(batch_size):
        if inds is None or data_idx >= len(inds):
            # hack as use of RNG will fall out of sync due to pipelines being different
            torch.manual_seed(MANUAL_SEED)
            inds = torch.randperm(effective_length, device='cuda')
            masks = (torch.rand(len(inds) // batch_size + 1, batch_size, sequence_len, device='cuda') >= MASK_PROB).long()
            MANUAL_SEED += 1
            print("new epoch", len(inds))
            data_idx = 0
            print("my start", inds[0:5])
            print("masks_checksum:", torch.sum(masks))
        if EASY_MODE:
            data_idx_ = data_idx % EASY_MODE_SIZ
        else:
            data_idx_ = data_idx
        offset = inds[data_idx_]  #* SEQUENCE_LEN
        data_idx += 1

        curr = fancy_data[offset:offset + sequence_len].clone().detach()
        temps.append(curr)
    temp = torch.stack(temps, dim=0).cuda()
    mask = masks[data_idx // batch_size]
    mask_not = torch.logical_not(mask)
    data = mask * temp + mask_not * 124
    label = temp
    return (data, label, mask_not)


easy_data = None


def fwd_step_func(batch, model):
    data, label, loss_mask = batch
    data = data.cuda()
    label = label.cuda()
    loss_mask = loss_mask.cuda()
    y = model(data, torch.ones_like(data), lm_labels=label)

    def loss_func(output_tensor):
        global ONCE
        output_tensor, _ = output_tensor
        lm_loss_ = output_tensor.float()
        lm_loss = torch.sum(lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum()
        averaged_loss = average_losses_across_data_parallel_group([lm_loss])
        if data_idx >= 1536:
            assert lm_loss < 4.8
            if not ONCE:
                print("LOSS OK")
                ONCE = True
        return lm_loss, {'avg': averaged_loss}

    return y, loss_func


def train(model, optim, virtual_pipeline_model_parallel_size, pipeline_model_parallel_size):
    sequence_len = global_vars.get_args().seq_length
    micro_batch_size = global_vars.get_args().micro_batch_size
    hidden_size = global_vars.get_args().hidden_size
    forward_backward_func = get_forward_backward_func(virtual_pipeline_model_parallel_size, pipeline_model_parallel_size)
    tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
    for _ in range(8):
        batch = generate_fancy_data_labels(sequence_len, batch_size)
        optim.zero_grad()
        forward_backward_func(fwd_step_func, batch, model, forward_only=False, tensor_shape=tensor_shape)
        optim.step()


if __name__ == '__main__':
    global fancy_data
    global effective_length

    global_vars.set_global_variables()

    fancy_data = download_fancy_data()
    effective_length = fancy_data.size(0) // global_vars.get_args().seq_length
    effective_length = fancy_data.size(0) - global_vars.get_args().seq_length

    initialize_distributed()
    world_size = torch.distributed.get_world_size()
    failure = None
    try:
        args = global_vars.get_args()
        args.padded_vocab_size = 128  # needed in standalone gpt
        batch_size = args.global_batch_size
        micro_batch_size = args.micro_batch_size
        setup_microbatch_calculator(
            args.rank,
            args.rampup_batch_size,
            args.global_batch_size,
            args.micro_batch_size,
            1,  # args.data_parallel_size,
        )
        virtual_pipeline_model_parallel_size = 2
        world_size = torch.distributed.get_world_size()
        pipeline_model_parallel_size = world_size
        parallel_state.initialize_model_parallel(1, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size)

        pipeline_model_parallel_size = parallel_state.get_pipeline_model_parallel_world_size()

        tensor_parallel.random.model_parallel_cuda_manual_seed(0)
        model = build_model(
            bert_model_provider,
            wrap_with_ddp=True,
            virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size,
        )
        assert isinstance(model, list)
        assert len(model) == (1 if virtual_pipeline_model_parallel_size is None else virtual_pipeline_model_parallel_size)
        _param_groups = _get_params_for_weight_decay_optimization(model)
        optim = torch.optim.Adam(_param_groups)
        print(effective_length)
        print(fancy_data.size(0))
        train(model, optim, virtual_pipeline_model_parallel_size, pipeline_model_parallel_size)
    except Exception as e:
        failure = str(e)
    finally:
        parallel_state.destroy_model_parallel()
    if failure is not None:
        torch.distributed.barrier()
        if torch.distributed.get_rank() == 0:
            print(f"Minimal BERT Pipeline Parallel Failed with: {failure}")
    else:
        torch.distributed.barrier()
        if torch.distributed.get_rank() == 0:
            print(TEST_SUCCESS_MESSAGE)
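Editor's note: the test above trains on a synthetic corpus with a simple BERT-style corruption, in which roughly MASK_PROB of the token positions are replaced by a fixed id (124) and the loss is averaged only over those positions (mask_not). The standalone sketch below, written for this note and not part of the commit, shows the same masking arithmetic on a toy batch.

import torch

MASK_PROB, MASK_ID = 0.1, 124
tokens = torch.randint(0, 128, (2, 8))                    # plays the role of `temp`
keep = (torch.rand(tokens.shape) >= MASK_PROB).long()     # plays the role of `mask`
corrupted = keep * tokens + (1 - keep) * MASK_ID          # same formula as `data` above
loss_positions = torch.logical_not(keep.bool())           # same role as `mask_not`
print(corrupted)
print("masked positions:", int(loss_positions.sum()))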
tests/L0/run_transformer/run_cross_entropy_test.py (modified)

 # coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

@@ -15,15 +15,15 @@
 import torch
 import torch.nn.functional as F

-from apex.transformer.tensor_parallel.tests.commons import set_random_seed
-from apex.transformer.tensor_parallel.tests.commons import IdentityLayer
-from apex.transformer.tensor_parallel.tests.commons import print_separator
-from apex.transformer.tensor_parallel.tests.commons import initialize_distributed
-from apex.transformer.tensor_parallel.tests.commons import TEST_SUCCESS_MESSAGE
 from apex.transformer import parallel_state
 from apex.transformer import tensor_parallel
 from apex.transformer.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy
-from apex.transformer.tensor_parallel.tests import global_vars
+from apex.transformer.testing import global_vars
+from apex.transformer.testing.commons import set_random_seed
+from apex.transformer.testing.commons import IdentityLayer
+from apex.transformer.testing.commons import print_separator
+from apex.transformer.testing.commons import initialize_distributed
+from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE

 global_vars.set_global_variables()

@@ -51,8 +51,11 @@ def tensor_sharded_cross_entropy(batch_size, seq_length, vocab_size, logits_scal
     logits_parallel = tensor_parallel.scatter_to_tensor_model_parallel_region(logits)
     target = torch.cuda.LongTensor(
         size=(batch_size, seq_length)).random_(0, vocab_size)
+    logits_parallel_ = logits_parallel.clone().detach()
     loss = vocab_parallel_cross_entropy(logits_parallel, target).mean()
     loss.backward()
+    # check for mutation
+    assert torch.equal(logits_parallel_, logits_parallel)
     return loss, identity.weight.grad

@@ -94,6 +97,8 @@ def test_cross_entropy(tensor_model_parallel_size):
 if __name__ == '__main__':
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
     initialize_distributed()
     world_size = torch.distributed.get_world_size()
tests/L0/run_transformer/run_data_test.py (modified)

 # coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

@@ -19,10 +19,10 @@ import torch
 from apex.transformer import parallel_state
 from apex.transformer.tensor_parallel import data as data_utils
-from apex.transformer.tensor_parallel.tests import global_vars
-from apex.transformer.tensor_parallel.tests.commons import print_separator
-from apex.transformer.tensor_parallel.tests.commons import initialize_distributed
-from apex.transformer.tensor_parallel.tests.commons import TEST_SUCCESS_MESSAGE
+from apex.transformer.testing import global_vars
+from apex.transformer.testing.commons import print_separator
+from apex.transformer.testing.commons import initialize_distributed
+from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE

 global_vars.set_global_variables()

@@ -82,6 +82,8 @@ def test_broadcast_data(tensor_model_parallel_size):
 if __name__ == '__main__':
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
     initialize_distributed()
     world_size = torch.distributed.get_world_size()
tests/L0/run_transformer/run_dynamic_batchsize_test.py (new file, mode 100644)

from typing import Tuple, List

import torch

from apex.transformer import parallel_state
from apex.transformer.pipeline_parallel.utils import get_num_microbatches
from apex.transformer.pipeline_parallel.schedules.common import (
    _get_params_for_weight_decay_optimization,
)
from apex.transformer.pipeline_parallel.schedules.common import build_model
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_with_interleaving import (
    _forward_backward_pipelining_with_interleaving,
)
from apex.transformer.pipeline_parallel.utils import average_losses_across_data_parallel_group
from apex.transformer.pipeline_parallel.utils import setup_microbatch_calculator
from apex.transformer.pipeline_parallel.utils import _reconfigure_microbatch_calculator
from apex.transformer.pipeline_parallel.utils import update_num_microbatches
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import print_separator
from apex.transformer.log_util import get_transformer_logger, set_logging_level
from apex.transformer.testing.commons import model_provider_func
from apex.transformer._data import MegatronPretrainingRandomSampler
from apex.transformer._data import MegatronPretrainingSampler


# note(mkozuki): To see warmup, steady, cooldown iterations, uncomment the line below
# set_logging_level("INFO")
_logger = get_transformer_logger("pipeline_parallel_test")
# note(mkozuki): To see if local batch size increases, uncomment the line below
# _logger.setLevel("INFO")


global_vars.set_global_variables(
    args_defaults={"global_batch_size": 512, "rampup_batch_size": [32, 32, 1000],},
    ignore_unknown_args=True,
)

RAMPUP_BATCH_SIZE = []
NUM_ITERATIONS = 20
NUM_SAMPLES = 16384 // 2
batch_size, micro_batch_size = None, None
HIDDEN_SIZE = 16


def Dataset(num_samples: int) -> List[Tuple[torch.Tensor, torch.Tensor]]:
    return [(torch.randn(HIDDEN_SIZE), torch.randn(HIDDEN_SIZE // 2)) for _ in range(num_samples)]


def process_batch(batch):
    if isinstance(batch, (list, tuple)):
        x = batch[0]
    else:
        x = batch
    return x


def fwd_step_func(micro_batch, model):
    x = process_batch(micro_batch)
    y = model(x)

    # note (mkozuki): I don't think this function is nice but I do think this is enough for now
    # just to check the sanity of ported pipeline functions.
    def loss_func(x):
        loss = torch.sum(x)
        averaged_loss = average_losses_across_data_parallel_group([loss])
        return loss, {"avg": averaged_loss}

    return y, loss_func


# Run forward & backward with dynamic batch size.
def run_interleaved_with_dynamic_batch_size(
    pipeline_model_parallel_size: int, forward_only: bool, BatchSamplerCls,
) -> None:
    args = global_vars.get_args()
    _reconfigure_microbatch_calculator(
        args.rank,
        args.rampup_batch_size,
        args.global_batch_size,
        args.micro_batch_size,
        1,  # args.data_parallel_size,
    )
    virtual_pipeline_model_parallel_size = 2
    # NOTE (mkozuki): `virtual_pipeline_model_parallel_size` is a requisite for the interleaving scheduling
    # In megatron, `args.virtual_pipeline_model_parallel_size` is computed in megatron/arguments.py and
    # used ubiquitously but this test uses custom model so it's safe to abuse.
    parallel_state.initialize_model_parallel(
        1, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size)
    pipeline_model_parallel_size = parallel_state.get_pipeline_model_parallel_world_size()

    print_separator(f"BatchSamplerCls: {BatchSamplerCls.__name__}, forward_only: {forward_only}")

    model = build_model(
        model_provider_func,
        wrap_with_ddp=True,
        virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size,
        hidden_size=HIDDEN_SIZE,
    )
    assert isinstance(model, list)
    assert len(model) == virtual_pipeline_model_parallel_size
    optimizer = torch.optim.Adam(_get_params_for_weight_decay_optimization(model))

    initial_local_minibatch_size = get_num_microbatches() * micro_batch_size
    dataset = Dataset(NUM_SAMPLES)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_sampler=BatchSamplerCls(
            NUM_SAMPLES,
            0,
            initial_local_minibatch_size,
            parallel_state.get_data_parallel_rank(),
            parallel_state.get_data_parallel_world_size(),
        ),
    )
    data_iter = iter(data_loader)

    def get_num_samples(batch):
        if isinstance(batch, torch.Tensor):
            return len(batch)
        assert isinstance(batch, (list, tuple))
        return [get_num_samples(b) for b in batch]

    tensor_shape = [micro_batch_size, HIDDEN_SIZE]
    consumed_samples = 0
    for i in range(NUM_ITERATIONS):
        update_num_microbatches(consumed_samples, consistency_check=False)
        local_batch_size = get_num_microbatches() * micro_batch_size
        data_iter._index_sampler.local_minibatch_size = local_batch_size
        local_mini_batch = next(data_iter)

        _logger.info(
            f"iter: {i} / {NUM_ITERATIONS} "
            f"local batchsize: {get_num_samples(local_mini_batch)} "
            f"consumed_samples: {consumed_samples} / {NUM_SAMPLES}"
        )
        _forward_backward_pipelining_with_interleaving(
            fwd_step_func,
            local_mini_batch,
            model,
            forward_only=forward_only,
            tensor_shape=tensor_shape,
        )

        consumed_samples += (
            parallel_state.get_data_parallel_world_size()
            * get_num_microbatches()
            * micro_batch_size
        )

        if not forward_only:
            for m in model:
                for p in m.parameters():
                    if p.grad is None:
                        raise RuntimeError("grad not found")
        else:
            optimizer.zero_grad(set_to_none=True)

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(TEST_SUCCESS_MESSAGE)


if __name__ == "__main__":
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False

    n_tests = 0
    failures = []

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    args = global_vars.get_args()
    batch_size = args.global_batch_size
    micro_batch_size = args.micro_batch_size
    setup_microbatch_calculator(
        args.rank,
        args.rampup_batch_size,
        args.global_batch_size,
        args.micro_batch_size,
        1,  # args.data_parallel_size,
    )
    for BatchSamplerCls in (MegatronPretrainingSampler, MegatronPretrainingRandomSampler):
        for forward_only in (False, True):
            n_tests += 1
            pipeline_model_parallel_size = world_size
            try:
                run_interleaved_with_dynamic_batch_size(
                    pipeline_model_parallel_size, forward_only, BatchSamplerCls,
                )
            except Exception as e:
                msg = (
                    f"\tforward_only: {forward_only}\n"
                    f"pipeline rank: {parallel_state.get_pipeline_model_parallel_rank()}, "
                    f"virtual pipeline rank: {parallel_state.get_virtual_pipeline_model_parallel_rank()}\n"
                    f"{str(e)}"
                )
                raise RuntimeError(msg)
            finally:
                parallel_state.destroy_model_parallel()
    print_separator("TEST RESULT")
    if failures:
        torch.distributed.barrier()
        if torch.distributed.get_rank() == 0:
            print("\n".join(failures))
            msg = f"{len(failures)} / {n_tests} cases failed"
            raise RuntimeError(msg)
    else:
        torch.distributed.barrier()
        if torch.distributed.get_rank() == 0:
            print("### PASS!")
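Editor's note: the interesting part of this test is the batch-size ramp driven by rampup_batch_size=[32, 32, 1000] against a target global batch size of 512: update_num_microbatches() grows the number of microbatches as samples are consumed, and the batch sampler's local_minibatch_size is resized to match. The sketch below, not part of the commit, approximates the resulting schedule under the Megatron-style reading of the argument as [start, increment, ramp-up samples]; that reading, and the helper itself, are assumptions for illustration, the real logic living in apex's microbatch calculator.

start, increment, rampup_samples = 32, 32, 1000
target = 512
steps = (target - start) // increment              # number of increments during ramp-up
samples_per_increment = rampup_samples / steps

def approx_global_batch_size(consumed_samples: int) -> int:
    if consumed_samples >= rampup_samples:
        return target
    return start + increment * int(consumed_samples / samples_per_increment)

for consumed in (0, 250, 500, 750, 1000):
    print(consumed, approx_global_batch_size(consumed))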
tests/L0/run_transformer/run_initialize_test.py (modified)

 # coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

@@ -15,10 +15,10 @@
 import torch

 from apex.transformer import parallel_state
-from apex.transformer.tensor_parallel.tests import global_vars
-from apex.transformer.tensor_parallel.tests.commons import print_separator
-from apex.transformer.tensor_parallel.tests.commons import initialize_distributed
-from apex.transformer.tensor_parallel.tests.commons import TEST_SUCCESS_MESSAGE
+from apex.transformer.testing import global_vars
+from apex.transformer.testing.commons import print_separator
+from apex.transformer.testing.commons import initialize_distributed
+from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE

 global_vars.set_global_variables()

@@ -90,6 +90,8 @@ def test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size_):
 if __name__ == '__main__':
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
     initialize_distributed()
     world_size = torch.distributed.get_world_size()
tests/L0/run_transformer/run_layers_test.py (modified)

 # coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

@@ -18,11 +18,11 @@ from torch.nn.parameter import Parameter
 from apex.transformer import parallel_state
 from apex.transformer.tensor_parallel import layers
-from apex.transformer.tensor_parallel.tests import global_vars
-from apex.transformer.tensor_parallel.tests.commons import set_random_seed
-from apex.transformer.tensor_parallel.tests.commons import print_separator
-from apex.transformer.tensor_parallel.tests.commons import initialize_distributed
-from apex.transformer.tensor_parallel.tests.commons import TEST_SUCCESS_MESSAGE
+from apex.transformer.testing import global_vars
+from apex.transformer.testing.commons import set_random_seed
+from apex.transformer.testing.commons import print_separator
+from apex.transformer.testing.commons import initialize_distributed
+from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE

 global_vars.set_global_variables()

@@ -584,7 +584,6 @@ def test_parallel_transformer_layer(tensor_model_parallel_size):
 if __name__ == '__main__':
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
tests/L0/run_transformer/run_mappings_test.py (modified)

 import torch

 from apex.transformer import parallel_state
-from apex.transformer.tensor_parallel.tests.commons import initialize_distributed
 from apex.transformer.tensor_parallel import mappings
-from apex.transformer.tensor_parallel.tests import global_vars
+from apex.transformer.testing import global_vars
+from apex.transformer.testing.commons import initialize_distributed

 global_vars.set_global_variables()

@@ -48,6 +48,8 @@ def test__gather(args, tensor_model_parallel_size):
 if __name__ == "__main__":
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
     initialize_distributed()
     world_size = torch.distributed.get_world_size()
tests/L0/run_transformer/run_megatron_gpt_pipeline.py
0 → 100644
View file @
db92ee13
from functools import partial
import logging
from typing import List

import torch

from apex.transformer import parallel_state
from apex.transformer.pipeline_parallel.schedules.common import _get_params_for_weight_decay_optimization
from apex.transformer.pipeline_parallel.schedules.common import build_model
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_with_interleaving import _forward_backward_pipelining_with_interleaving
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_without_interleaving import forward_backward_pipelining_without_interleaving
from apex.transformer.pipeline_parallel.utils import average_losses_across_data_parallel_group
from apex.transformer.pipeline_parallel.utils import get_ltor_masks_and_position_ids
from apex.transformer.pipeline_parallel.utils import setup_microbatch_calculator
from apex.transformer.pipeline_parallel.utils import update_num_microbatches
from apex.transformer.tensor_parallel import model_parallel_cuda_manual_seed
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import print_separator
from apex.transformer.testing.standalone_gpt import gpt_model_provider
from apex.transformer.log_util import get_transformer_logger, set_logging_level


set_logging_level(logging.NOTSET)
_logger = get_transformer_logger("megatron_gpt_pipeline_test")
global_vars.set_global_variables()
N_VOCAB = 8192


def generate_batch(batch_size, sequence_length):
    size = batch_size, sequence_length + 1
    int_tensor = torch.randint(low=0, high=N_VOCAB, size=size, dtype=torch.long).cuda()
    return int_tensor,


# Ref: https://github.com/NVIDIA/Megatron-LM/blob/b31e1296354e979722627a6c4dedafe19b51fa97/pretrain_gpt.py#L44
def get_batch(int_tensors: List[torch.Tensor]):
    data = int_tensors[0]
    # Unpack.
    tokens_ = data.long()
    labels = tokens_[:, 1:].contiguous()
    tokens = tokens_[:, :-1].contiguous()
    # Get the masks and position ids.
    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
        tokens,
        N_VOCAB,  # tokenizer.eod,
        False,  # args.reset_position_ids,
        False,  # args.reset_attention_mask,
        False,  # args.eod_mask_loss,
    )
    return tokens, labels, loss_mask, attention_mask, position_ids


# Ref: https://github.com/NVIDIA/Megatron-LM/blob/b31e1296354e979722627a6c4dedafe19b51fa97/pretrain_gpt.py#L75
def loss_func(loss_mask, output_tensor):
    losses = output_tensor.float()
    loss_mask = loss_mask.view(-1).float()
    loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
    # Reduce loss for logging.
    averaged_loss = average_losses_across_data_parallel_group([loss])
    return loss, {'lm loss': averaged_loss[0]}


# Ref: https://github.com/NVIDIA/Megatron-LM/blob/b31e1296354e979722627a6c4dedafe19b51fa97/pretrain_gpt.py#L86
# TODO (mkozuki): Currently I'm seeing no attribute `word_embeddings` which looks weird.
def forward_step(batch, model):
    """Forward step."""
    tokens, labels, loss_mask, attention_mask, position_ids = get_batch(batch)
    output_tensor = model(tokens, position_ids, attention_mask, labels=labels)
    return output_tensor, partial(loss_func, loss_mask)


def run_gpt(pipeline_model_parallel_size, virtual_pipeline_model_parallel_size=None, forward_only=False):
    parallel_state.initialize_model_parallel(1, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size)
    model_parallel_cuda_manual_seed(42)
    model = build_model(gpt_model_provider, True, virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size)
    _logger.debug("building model")
    assert isinstance(model, list)
    assert len(model) == (1 or virtual_pipeline_model_parallel_size)
    _param_groups = _get_params_for_weight_decay_optimization(model)
    torch.optim.Adam(_param_groups)
    if parallel_state.is_pipeline_last_stage():
        _logger.debug("checking `word_embeddings` existence")
        for m in model:
            assert hasattr(m, "word_embeddings")
    args = global_vars.get_args()
    if virtual_pipeline_model_parallel_size is None:
        batch = generate_batch(args.global_batch_size, args.seq_length)
    else:
        batch = [generate_batch(args.global_batch_size, args.seq_length) for _ in range(virtual_pipeline_model_parallel_size)]
    _logger.debug("preparing batch")
    if virtual_pipeline_model_parallel_size is None:
        fwd_bwd_func = forward_backward_pipelining_without_interleaving
    else:
        fwd_bwd_func = _forward_backward_pipelining_with_interleaving
    _logger.debug(f"selecting forward_backward func: {fwd_bwd_func}")
    tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
    _logger.debug(f"`tensor_shape`: {tensor_shape}")
    fwd_bwd_func(forward_step, batch, model, forward_only=forward_only, tensor_shape=tensor_shape)
    _logger.debug(TEST_SUCCESS_MESSAGE)


if __name__ == "__main__":
    initialize_distributed()
    args = global_vars.get_args()
    args.padded_vocab_size = N_VOCAB
    setup_microbatch_calculator(
        args.rank,
        args.rampup_batch_size,
        args.global_batch_size,
        args.micro_batch_size,
        1,  # args.data_parallel_size,
    )
    update_num_microbatches(0, True)
    print_separator("run GPT model")
    try:
        run_gpt(torch.distributed.get_world_size())
    # TODO(mkozuki): handle exception correctly, but for now, lazily commenting out as
    # this won't get kicked by CI
    except Exception as e:
        _logger.debug(str(e))
        pass
    finally:
        parallel_state.destroy_model_parallel()
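The `loss_func` above computes a masked mean: per-token losses are flattened, weighted by a 0/1 `loss_mask`, and normalized by the count of unmasked tokens, so padded or EOD positions do not dilute the average. A minimal standalone sketch of that step, with made-up numbers and no apex dependency:

import torch

# Sketch (not part of the commit): the masked-mean reduction used by `loss_func`.
per_token_loss = torch.tensor([[0.5, 1.5, 2.0], [1.0, 3.0, 0.0]])
loss_mask = torch.tensor([[1.0, 1.0, 0.0], [1.0, 1.0, 1.0]])

# Only unmasked positions contribute, and the denominator is the mask sum, not numel().
masked_mean = torch.sum(per_token_loss.view(-1) * loss_mask.view(-1)) / loss_mask.sum()
print(masked_mean)  # (0.5 + 1.5 + 1.0 + 3.0 + 0.0) / 5 = 1.2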
tests/L0/run_transformer/run_pipeline_parallel_test.py
0 → 100644
View file @
db92ee13
from typing import Optional, Union, List

import torch
import torch.nn as nn

import apex
from apex.transformer import parallel_state
from apex.transformer.pipeline_parallel import get_forward_backward_func
from apex.transformer.pipeline_parallel.schedules.common import _get_params_for_weight_decay_optimization
from apex.transformer.pipeline_parallel.schedules.common import build_model
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_no_pipelining import forward_backward_no_pipelining
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_with_interleaving import _forward_backward_pipelining_with_interleaving
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_without_interleaving import forward_backward_pipelining_without_interleaving
from apex.transformer.pipeline_parallel.utils import average_losses_across_data_parallel_group
from apex.transformer.pipeline_parallel.utils import setup_microbatch_calculator
from apex.transformer.pipeline_parallel.utils import update_num_microbatches
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import print_separator
from apex.transformer.log_util import get_transformer_logger, set_logging_level


# set_logging_level("INFO")
_logger = get_transformer_logger("pipeline_parallel_test")
global_vars.set_global_variables()
batch_size, micro_batch_size = None, None
hidden_size = 16
fwd_bwd_functions = {
    "no_pipelining": forward_backward_no_pipelining,
    "no_interleaving": forward_backward_pipelining_without_interleaving,
    "interleaving": _forward_backward_pipelining_with_interleaving,
}


# note (mkozuki): `pre_process` and `post_process` are a placeholder until interleaving schedule test comes.
class MyLayer(nn.Module):
    def __init__(self, pre_process: bool, post_process: bool):
        super().__init__()
        self.pre_process = pre_process
        self.post_process = post_process
        self.layer = nn.Linear(hidden_size, hidden_size)

    def forward(self, x):
        return self.layer(x)


class MyModel(nn.Module):
    def __init__(self, pre_process: bool = False, post_process: bool = False) -> None:
        super().__init__()
        self.pre_process = pre_process
        self.post_process = post_process
        self.layer = MyLayer(pre_process=pre_process, post_process=post_process)
        self.input_tensor = None

    def set_input_tensor(self, input_tensor: Union[torch.Tensor, List[torch.Tensor]]) -> None:
        self.input_tensor = input_tensor

    def forward(self, x: Optional[torch.Tensor]) -> torch.Tensor:
        if self.input_tensor is None:
            return self.layer(x)
        return self.layer(self.input_tensor)


def model_provider_func(pre_process, post_process) -> MyModel:
    return MyModel(pre_process, post_process)


def process_batch(batch):
    if isinstance(batch, list):
        x = batch[0]
    else:
        x = batch
    return x


def fwd_step_func(batch, model):
    x = process_batch(batch)
    y = model(x)

    # note (mkozuki): I don't think this function is nice but I do think this is enough for now
    # just to check the sanity of ported pipeline functions.
    def loss_func(x):
        loss = torch.sum(x)
        averaged_loss = average_losses_across_data_parallel_group([loss])
        return loss, {'avg': averaged_loss}

    return y, loss_func


# TODO (mkozuki): Add a case with `autocast` and `GradScaler`.
# Run forward & backward for one minibatch.
def forward_backward_func_template(
    name: str,
    forward_backward_func,
    pipeline_model_parallel_size: int,
    forward_only: bool,
) -> None:
    print_separator(f"name: {name}, pipeline model parallel size: {pipeline_model_parallel_size}")
    virtual_pipeline_model_parallel_size = 2 if name == "interleaving" else None
    if name == "no_pipelining":
        # note (mkozuki): `forward_backward_no_pipelining` is **NOTE** compatible with
        # pipeline_model_parallel_size>1. So use pipeline_model_parallel_size as
        # tensor_model_parallel_size and set pipeline_model_parallel_size to 1.
        parallel_state.initialize_model_parallel(1, 1, None)
    else:
        # NOTE (mkozuki): `virtual_pipeline_model_parallel_size` is necessary to enable interleaving scheduling
        # In megatron, `args.virtual_pipeline_model_parallel_size` is computed in megatron/arguments.py and
        # used ubiquitously but this test uses custom model so it's safe to abuse.
        parallel_state.initialize_model_parallel(1, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size)
    if virtual_pipeline_model_parallel_size is not None:
        # Check the experimental warning message
        get_forward_backward_func(virtual_pipeline_model_parallel_size, pipeline_model_parallel_size)
    pipeline_model_parallel_size = parallel_state.get_pipeline_model_parallel_world_size()

    model = build_model(
        model_provider_func,
        wrap_with_ddp=True,
        virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size,
    )
    assert isinstance(model, list)
    assert len(model) == (
        1 if virtual_pipeline_model_parallel_size is None else virtual_pipeline_model_parallel_size)
    _param_groups = _get_params_for_weight_decay_optimization(model)
    torch.optim.Adam(_param_groups, lr=1e-4)

    tensor_shape = [batch_size // parallel_state.get_data_parallel_world_size(), hidden_size]
    batch = (torch.randn(tensor_shape).cuda(),)
    tensor_shape[0] = micro_batch_size

    update_num_microbatches(0)
    forward_backward_func(fwd_step_func, batch, model, forward_only=forward_only, tensor_shape=tensor_shape)

    if not forward_only:
        for m in model:
            for p in m.parameters():
                if p.grad is None:
                    raise RuntimeError("grad not found")
    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(TEST_SUCCESS_MESSAGE)


if __name__ == "__main__":
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False

    n_tests = 0
    failures = []

    initialize_distributed()
    world_size = torch.distributed.get_world_size()
    args = global_vars.get_args()
    batch_size = args.global_batch_size
    micro_batch_size = args.micro_batch_size
    setup_microbatch_calculator(
        args.rank,
        args.rampup_batch_size,
        args.global_batch_size,
        args.micro_batch_size,
        1,  # args.data_parallel_size,
    )
    for forward_only in (True, False):
        for name, forward_backward_func in fwd_bwd_functions.items():
            n_tests += 1
            # TODO (mkozuki): Test with data parallel size > 1.
            pipeline_model_parallel_size = world_size
            try:
                forward_backward_func_template(
                    name,
                    forward_backward_func,
                    pipeline_model_parallel_size,
                    forward_only,
                )
            except Exception as e:
                failures.append(
                    f"\t# {name} failed with pipeline size: {pipeline_model_parallel_size} "
                    f"and forward_only: {forward_only}\n"
                    f"pipeline rank: {parallel_state.get_pipeline_model_parallel_rank()}, "
                    f"virtual pipeline rank: {parallel_state.get_virtual_pipeline_model_parallel_rank()}\n"
                    f"{str(e)}"
                )
            else:
                print_separator(f"{name} works")
            finally:
                parallel_state.destroy_model_parallel()
    print_separator("TEST RESULT")
    if failures:
        torch.distributed.barrier()
        if torch.distributed.get_rank() == 0:
            print("\n".join(failures))
        msg = f"{len(failures)} / {n_tests} cases failed"
        raise RuntimeError(msg)
    else:
        torch.distributed.barrier()
        if torch.distributed.get_rank() == 0:
            print("### PASS!")
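Every schedule in `fwd_bwd_functions` drives the same contract that `fwd_step_func` implements here: the step function receives a batch and a model, and returns the model output together with a closure that maps that output to `(loss, stats_dict)`. A minimal sketch of that contract in plain PyTorch, independent of apex (the `toy_*` names are illustrative, not library API):

import torch
import torch.nn as nn

# Sketch of the forward-step contract exercised by the test above: return the output
# plus a loss closure; the schedule later calls the closure and backpropagates the loss.
def toy_fwd_step_func(batch, model):
    y = model(batch)

    def toy_loss_func(output):
        loss = output.sum()
        return loss, {"avg": loss.detach()}

    return y, toy_loss_func


model = nn.Linear(16, 16)
output, loss_fn = toy_fwd_step_func(torch.randn(4, 16), model)
loss, stats = loss_fn(output)
loss.backward()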
tests/L0/run_transformer/run_random_test.py
View file @
db92ee13
 # coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
...
@@ -16,10 +16,10 @@ import torch
 from apex.transformer import parallel_state
 from apex.transformer import tensor_parallel
-from apex.transformer.tensor_parallel.tests import global_vars
-from apex.transformer.tensor_parallel.tests.commons import print_separator
-from apex.transformer.tensor_parallel.tests.commons import initialize_distributed
-from apex.transformer.tensor_parallel.tests.commons import TEST_SUCCESS_MESSAGE
+from apex.transformer.testing import global_vars
+from apex.transformer.testing.commons import print_separator
+from apex.transformer.testing.commons import initialize_distributed
+from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE

 global_vars.set_global_variables()
...
@@ -188,6 +188,8 @@ def test_model_parallel_cuda_manual_seed(tensor_model_parallel_size):
 if __name__ == '__main__':
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
     initialize_distributed()
     world_size = torch.distributed.get_world_size()
...
tests/L0/run_transformer/run_utils_test.py
View file @
db92ee13
...
@@ -15,6 +15,8 @@ def test_split_tensor_along_last_dim():
 if __name__ == "__main__":
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
     test_divide()
     test_split_tensor_along_last_dim()
     print(">> passed the test :-)")
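The hunk above only touches the test's TF32 settings; the utility it exercises, `split_tensor_along_last_dim`, partitions a tensor into equal chunks along its final dimension. A rough standalone sketch of that behavior with `torch.split` (an approximation for illustration, not the apex implementation):

import torch

# Sketch of the behavior covered by test_split_tensor_along_last_dim: split the last
# dimension into `num_partitions` equal-sized views.
def split_along_last_dim(tensor: torch.Tensor, num_partitions: int):
    last_dim = tensor.dim() - 1
    assert tensor.size(last_dim) % num_partitions == 0, "last dim must divide evenly"
    chunk = tensor.size(last_dim) // num_partitions
    return torch.split(tensor, chunk, dim=last_dim)


parts = split_along_last_dim(torch.arange(24).reshape(2, 12), 3)
assert len(parts) == 3 and all(p.shape == (2, 4) for p in parts)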
tests/L0/run_transformer/test_batch_sampler.py
0 → 100644
View file @
db92ee13
from itertools import product
import unittest

import torch
from torch.utils.data import Dataset
from torch.utils.data import RandomSampler
from torch.utils.data import BatchSampler
from torch.utils.data import DataLoader

from apex.transformer.pipeline_parallel.utils import _split_batch_into_microbatch as split_batch_into_microbatch


class MyIterableDataset(Dataset):
    def __init__(self, start, end):
        super().__init__()
        assert end > start, "this example code only works with end >= start"
        self.start = start
        self.end = end
        self.samples = list(range(self.start, self.end))

    def __iter__(self):
        return iter(range(self.start, self.end))

    def __getitem__(self, index):
        return self.samples[index]


class MegatronPretrainingRandomSampler:

    def __init__(self, total_samples, consumed_samples, micro_batch_size,
                 data_parallel_rank, data_parallel_size):
        # Keep a copy of input params for later use.
        self.total_samples = total_samples
        self.consumed_samples = consumed_samples
        self.micro_batch_size = micro_batch_size
        self.data_parallel_rank = data_parallel_rank
        self.data_parallel_size = data_parallel_size
        self.micro_batch_times_data_parallel_size = \
            self.micro_batch_size * data_parallel_size
        self.last_batch_size = \
            self.total_samples % self.micro_batch_times_data_parallel_size

        # Sanity checks.
        assert self.total_samples > 0, \
            'no sample to consume: {}'.format(self.total_samples)
        assert self.micro_batch_size > 0
        assert data_parallel_size > 0
        assert self.data_parallel_rank < data_parallel_size, \
            'data_parallel_rank should be smaller than data size: {}, ' \
            '{}'.format(self.data_parallel_rank, data_parallel_size)

    def __len__(self):
        return self.total_samples

    def __iter__(self):
        active_total_samples = self.total_samples - self.last_batch_size
        self.epoch = self.consumed_samples // active_total_samples
        current_epoch_samples = self.consumed_samples % active_total_samples
        assert current_epoch_samples % self.micro_batch_times_data_parallel_size == 0

        # data sharding and random sampling
        bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) \
            * self.micro_batch_size
        bucket_offset = current_epoch_samples // self.data_parallel_size
        start_idx = self.data_parallel_rank * bucket_size

        g = torch.Generator()
        g.manual_seed(self.epoch)
        random_idx = torch.randperm(bucket_size, generator=g).tolist()
        idx_range = [start_idx + x for x in random_idx[bucket_offset:]]

        batch = []
        # Last batch if not complete will be dropped.
        for idx in idx_range:
            batch.append(idx)
            if len(batch) == self.micro_batch_size:
                self.consumed_samples += self.micro_batch_times_data_parallel_size
                yield batch
                batch = []


# Samples 8 tensors in total.
# First sample 4 tensors twice, then sample 2 tensors fourth.
class TestBatchSamplerBehavior(unittest.TestCase):
    def test_batch_sampler_behavior(self):
        dataset = MyIterableDataset(0, 100)

        for num_workers in (1, 2, 4):
            with self.subTest(f"{num_workers}"):
                torch.manual_seed(42)
                loader = DataLoader(
                    dataset,
                    batch_sampler=MegatronPretrainingRandomSampler(100, 0, 4, 0, 1),
                    num_workers=num_workers,
                )
                samples = []
                for i, batch in enumerate(loader):
                    samples.append(batch)
                    if i == 2 - 1:
                        break

                torch.manual_seed(42)
                loader = DataLoader(
                    dataset,
                    batch_sampler=MegatronPretrainingRandomSampler(100, 0, 2, 0, 1),
                    num_workers=num_workers,
                )
                samples2 = []
                for i, batch in enumerate(loader):
                    samples2.append(batch)
                    if i == 4 - 1:
                        break
                torch.testing.assert_allclose(torch.cat(samples), torch.cat(samples2))

    def test_split_batch(self):

        class MyIterableDataset(Dataset):
            def __init__(self, start, end):
                super().__init__()
                assert end > start, "this example code only works with end >= start"
                self.start = start
                self.end = end
                self.samples = list(range(self.start, self.end))

            def __len__(self):
                return self.end - self.start

            def __iter__(self):
                return iter(range(self.start, self.end))

            def __getitem__(self, index):
                return (torch.tensor([index, index]), torch.tensor([index // 2, index // 2]))

        dataset = MyIterableDataset(0, 100)
        torch.manual_seed(42)
        global_batch_size = 16
        loader = DataLoader(
            dataset,
            batch_sampler=MegatronPretrainingRandomSampler(100, 0, global_batch_size, 0, 1),
            num_workers=2,
        )
        batch = next(iter(loader))
        # samples = None
        # for i, batch in enumerate(loader):
        #     # samples = batch
        #     if i == 0:
        #         break

        for _micro_batch_size in (1, 2, 4, 8):
            microbatches = list(split_batch_into_microbatch(
                batch,
                _micro_batch_size=_micro_batch_size,
                _global_batch_size=global_batch_size,
            ))
            # print(batch)
            # print(microbatches)
            self.assertEqual(len(microbatches), global_batch_size // _micro_batch_size)
            self.assertEqual(len(microbatches[0][0]), _micro_batch_size)


if __name__ == "__main__":
    unittest.main()
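The sharding in `MegatronPretrainingRandomSampler.__iter__` comes down to a few integer steps: each data-parallel rank owns a disjoint, `bucket_size`-long slice of the index space, shuffles it with an epoch-seeded generator, and yields `micro_batch_size` indices at a time. A small standalone sketch of that arithmetic with made-up sizes (epoch 0, nothing consumed yet):

import torch

# Sketch of the index arithmetic above: 20 samples, micro batch 2, data-parallel size 2.
total_samples, micro_batch_size, data_parallel_size = 20, 2, 2
micro_times_dp = micro_batch_size * data_parallel_size               # 4
bucket_size = (total_samples // micro_times_dp) * micro_batch_size   # 5 * 2 = 10

g = torch.Generator()
g.manual_seed(0)  # seeded by the epoch, so every rank shuffles identically
random_idx = torch.randperm(bucket_size, generator=g).tolist()

for rank in range(data_parallel_size):
    start_idx = rank * bucket_size
    shard = [start_idx + x for x in random_idx]        # disjoint, shuffled shard per rank
    print(rank, shard[:micro_batch_size], "...")        # first micro batch yielded by that rank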
tests/L0/run_transformer/test_mpu.py → tests/L0/run_transformer/test_transformer_module.py
View file @
db92ee13
+from typing import Tuple
 import os
 import subprocess
 import sys
 import unittest


-def run_mpu_tests():
+DENY_TEST = [
+    "megatron_gpt_pipeline",
+]
+MULTIGPU_TEST = [
+    "pipeline_parallel_test",
+    "dynamic_batchsize_test",
+]
+SEVERALGPU_TEST = [
+    "bert_minimal_test",
+]
+
+
+def get_multigpu_launch_option(min_gpu):
+    should_skip = False
+    import torch
+    num_devices = torch.cuda.device_count()
+    if num_devices < min_gpu:
+        should_skip = True
+    distributed_run_options = f"-m torch.distributed.run --nproc_per_node={num_devices}"
+    return should_skip, distributed_run_options
+
+
+def get_launch_option(test_filename) -> Tuple[bool, str]:
+    should_skip = False
+    for multigpu_test in MULTIGPU_TEST:
+        if multigpu_test in test_filename:
+            return get_multigpu_launch_option(2)
+    for severalgpu_test in SEVERALGPU_TEST:
+        if severalgpu_test in test_filename:
+            return get_multigpu_launch_option(3)
+    return should_skip, ""
+
+
+def run_transformer_tests():
     python_executable_path = sys.executable
     # repository_root = os.path.join(os.path.dirname(__file__), "../../../")
     # directory = os.path.abspath(os.path.join(repository_root, "tests/mpu"))
...
@@ -19,7 +51,28 @@ def run_mpu_tests():
     print("#######################################################")
     errors = []
     for i, test_file in enumerate(files, 1):
-        test_run_cmd = f"NVIDIA_TF32_OVERRIDE=0 {python_executable_path} {test_file} --micro-batch-size 2 --num-layers 1 --hidden-size 256 --num-attention-heads 8 --max-position-embeddings 32 --encoder-seq-length 32 --use-cpu-initialization"  # NOQA
+        is_denied = False
+        for deny_file in DENY_TEST:
+            if deny_file in test_file:
+                is_denied = True
+        if is_denied:
+            print(f"### {i} / {len(files)}: {test_file} skipped")
+            continue
+        should_skip, launch_option = get_launch_option(test_file)
+        if should_skip:
+            print(f"### {i} / {len(files)}: {test_file} skipped. Requires multiple GPUs.")
+            continue
+        test_run_cmd = (
+            f"{python_executable_path} {launch_option} {test_file} "
+            "--micro-batch-size 4 --num-layers 16 --hidden-size 768 --num-attention-heads 8 --max-position-embeddings "
+            "512 --seq-length 512 --global-batch-size 256"
+        )
+        if 'bert' in test_file:
+            import torch
+            num_devices = torch.cuda.device_count()
+            test_run_cmd += f" --pipeline-model-parallel-size {num_devices}"
+        else:
+            test_run_cmd += f" --use-cpu-initialization"
         print(f"### {i} / {len(files)}: cmd: {test_run_cmd}")
         try:
             output = subprocess.check_output(
...
@@ -29,7 +82,7 @@ def run_mpu_tests():
             errors.append((test_file, str(e)))
         else:
             if '>> passed the test :-)' not in output:
-                errors.append(test_file, output)
+                errors.append((test_file, output))
     else:
         if not errors:
             print("### PASSED")
...
@@ -42,10 +95,10 @@ def run_mpu_tests():
         raise RuntimeError(short_msg)


-class TestMPU(unittest.TestCase):
-    def test_mpu(self):
-        run_mpu_tests()
+class TestTransformer(unittest.TestCase):
+    def test_transformer(self):
+        run_transformer_tests()


 if __name__ == '__main__':
...
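The renamed wrapper builds each test command from the current interpreter, an optional `torch.distributed.run` launch option sized to the visible GPUs, and the per-test flags, then runs it with `subprocess.check_output`. A hedged sketch of that assembly (the test filename below is only a stand-in, and the command is printed rather than executed):

import shlex
import subprocess
import sys

import torch

# Sketch: assemble a multi-GPU launch command the way the wrapper above does.
num_devices = torch.cuda.device_count()
launch_option = f"-m torch.distributed.run --nproc_per_node={num_devices}" if num_devices > 1 else ""
cmd = (
    f"{sys.executable} {launch_option} run_pipeline_parallel_test.py "
    "--micro-batch-size 4 --num-layers 16 --hidden-size 768 --global-batch-size 256"
)
print(shlex.split(cmd))
# subprocess.check_output(shlex.split(cmd), text=True) would run it and raise
# CalledProcessError on a non-zero exit, which is how the wrapper collects failures.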