OpenDAS / apex / Commits / 0bfb8300

Commit 0bfb8300, authored May 08, 2020 by Thor Johnsen
Merge commit — parents: 2619f1cb, cf50dc7c
Changes: 29

Showing 20 changed files with 2429 additions and 104 deletions (+2429 -104)
apex/amp/_initialize.py                                     +1    -1
apex/amp/handle.py                                          +1    -1
apex/contrib/csrc/multihead_attn/dropout.h                  +16   -0
apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu      +0    -1
apex/contrib/csrc/optimizers/fused_lamb_cuda.cpp            +21   -0
apex/contrib/csrc/optimizers/fused_lamb_cuda_kernel.cu      +294  -0
apex/contrib/csrc/xentropy/xentropy_kernel.cu               +178  -67
apex/contrib/multihead_attn/self_multihead_attn_func.py     +1    -1
apex/contrib/optimizers/__init__.py                         +1    -0
apex/contrib/optimizers/fp16_optimizer.py                   +2    -1
apex/contrib/optimizers/fused_lamb.py                       +208  -0
apex/mlp/__init__.py                                        +1    -0
apex/mlp/mlp.py                                             +79   -0
apex/parallel/LARC.py                                       +12   -1
apex/pyprof/__init__.py                                     +1    -1
apex/pyprof/prof/__init__.py                                +1    -0
apex/pyprof/prof/utility.py                                 +5    -3
csrc/mlp.cpp                                                +164  -0
csrc/mlp_cuda.cu                                            +1377 -0
csrc/multi_tensor_axpby_kernel.cu                           +66   -27
apex/amp/_initialize.py  (+1 -1)

@@ -146,7 +146,7 @@ def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs
     from .amp import init as amp_init

     optimizers_was_list = False
-    if isinstance(optimizers, torch.optim.Optimizer) or ('LARC' in sys.modules and isinstance(optimizers, LARC)):
+    if isinstance(optimizers, torch.optim.Optimizer) or ('LARC' in globals() and isinstance(optimizers, LARC)):
         optimizers = [optimizers]
     elif optimizers is None:
         optimizers = []
...
apex/amp/handle.py  (+1 -1)

@@ -87,7 +87,7 @@ def scale_loss(loss,
         yield loss
         return

-    if isinstance(optimizers, torch.optim.Optimizer) or ('LARC' in sys.modules and isinstance(optimizers, LARC)):
+    if isinstance(optimizers, torch.optim.Optimizer) or ('LARC' in globals() and isinstance(optimizers, LARC)):
         optimizers = [optimizers]

    loss_scaler = _amp_state.loss_scalers[loss_id]
...
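Both hunks swap `'LARC' in sys.modules` for `'LARC' in globals()`. A minimal sketch (my own illustration, not apex code) of why the two checks differ: `sys.modules` answers "has a module named LARC been imported anywhere in the process", while `globals()` answers "does *this* module have the name LARC bound", which is what the following `isinstance(optimizers, LARC)` actually needs.

import sys

try:
    # Optional import, mirroring how amp only defines LARC when it is available.
    from apex.parallel.LARC import LARC
except ImportError:
    pass

def is_larc_instance(opt):
    # Hypothetical helper: the short-circuit keeps isinstance() from touching an
    # undefined name, exactly like the guard in the diff above.
    return 'LARC' in globals() and isinstance(opt, LARC)

print('LARC' in sys.modules, 'LARC' in globals())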
apex/contrib/csrc/multihead_attn/dropout.h  (+16 -0)

 #include <ATen/ATen.h>
+#ifdef OLD_GENERATOR
 #include <ATen/CUDAGenerator.h>
+#else
+#include <ATen/CUDAGeneratorImpl.h>
+#endif
 #include <ATen/cuda/CUDAContext.h>
 #include <curand_kernel.h>
...
@@ -206,8 +212,13 @@ void apex_fused_dropout_cuda(scalar_t const *inputs,
   std::pair<uint64_t,uint64_t> rng_engine_inputs;
   {
     // See Note [Acquire lock when using random generators]
+#ifdef OLD_GENERATOR
     std::lock_guard<std::mutex> lock(gen->mutex_);
     rng_engine_inputs = gen->philox_engine_inputs(counter_offset);
+#else
+    std::lock_guard<std::mutex> lock(gen.mutex());
+    rng_engine_inputs = at::check_generator<at::CUDAGeneratorImpl>(gen)->philox_engine_inputs(counter_offset);
+#endif
   }
   apex_fused_dropout_kernel<scalar_t, accscalar_t, IndexType><<<grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(inputs, outputs, mask, totalElements, p, rng_engine_inputs);
...
@@ -239,8 +250,13 @@ void apex_dropout_add_cuda(scalar_t const *inputs,
   std::pair<uint64_t,uint64_t> rng_engine_inputs;
   {
     // See Note [Acquire lock when using random generators]
+#ifdef OLD_GENERATOR
     std::lock_guard<std::mutex> lock(gen->mutex_);
     rng_engine_inputs = gen->philox_engine_inputs(counter_offset);
+#else
+    std::lock_guard<std::mutex> lock(gen.mutex());
+    rng_engine_inputs = at::check_generator<at::CUDAGeneratorImpl>(gen)->philox_engine_inputs(counter_offset);
+#endif
   }
   apex_dropout_add_kernel<scalar_t, accscalar_t, IndexType><<<grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(inputs, add_inputs, outputs, mask, totalElements, p, rng_engine_inputs);
...
apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu  (+0 -1)

@@ -1033,4 +1033,3 @@ void fused_maybe_adam_undo_cuda(
   }
   THCudaCheck(cudaGetLastError());
 }
apex/contrib/csrc/optimizers/fused_lamb_cuda.cpp  (new file, +21 -0)

#include <torch/extension.h>

void multi_tensor_lamb_cuda(
    int chunk_size,
    at::Tensor noop_flag,
    std::vector<std::vector<at::Tensor>> tensor_lists,
    const float lr,
    const float beta1,
    const float beta2,
    const float epsilon,
    const int step,
    const int bias_correction,
    const float weight_decay,
    const int grad_averaging,
    const int mode,
    const float global_grad_norm,
    const float max_grad_norm);

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("lamb", &multi_tensor_lamb_cuda, "Computes and apply update for LAMB optimizer");
}
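A minimal sketch of how this binding surfaces in Python, assuming apex was built with the fused-LAMB sources so the `fused_lamb_cuda` extension exists (the same module name the new optimizer imports below); the rest of the snippet is my own illustration.

import importlib

try:
    fused_lamb_cuda = importlib.import_module("fused_lamb_cuda")
except ImportError:
    fused_lamb_cuda = None  # extension not built in this environment

if fused_lamb_cuda is not None:
    # The .cpp above exposes multi_tensor_lamb_cuda under the name "lamb".
    print(fused_lamb_cuda.lamb.__doc__)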
apex/contrib/csrc/optimizers/fused_lamb_cuda_kernel.cu  (new file, +294 -0)

#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>

#include <assert.h>

#include "type_shim.h"
#include "multi_tensor_apply.cuh"

#define BLOCK_SIZE 512
#define ILP 4

typedef enum{
  MOMENT_MODE_0   =0, // L2 regularization mode
  MOMENT_MODE_1   =1  // Decoupled weight decay mode
} adamMode_t;

std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  at::optional<bool> per_tensor_python);

using MATH_T = float;

template<typename T>
struct LAMBStage1Functor
{
  __device__ __forceinline__ void operator()(
    int chunk_size,
    volatile int* noop_gmem,
    TensorListMetadata<4>& tl,
    const float beta1, const float beta2, const float beta3,
    const float beta1_correction, const float beta2_correction,
    const float epsilon,
    adamMode_t mode,
    const float decay,
    const float global_grad_norm,
    const float max_global_grad_norm)
  {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
    //   return;

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    float clipped_global_grad_norm = global_grad_norm > max_global_grad_norm ? global_grad_norm / max_global_grad_norm : 1.0f;

    T* g = (T*)tl.addresses[0][tensor_loc];  g += chunk_idx*chunk_size;
    T* p = (T*)tl.addresses[1][tensor_loc];  p += chunk_idx*chunk_size;
    T* m = (T*)tl.addresses[2][tensor_loc];  m += chunk_idx*chunk_size;
    T* v = (T*)tl.addresses[3][tensor_loc];  v += chunk_idx*chunk_size;

    n -= chunk_idx*chunk_size;

    // see note in multi_tensor_scale_kernel.cu
    for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
    {
      MATH_T r_g[ILP];
      MATH_T r_p[ILP];
      MATH_T r_m[ILP];
      MATH_T r_v[ILP];
#pragma unroll
      for(int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii*blockDim.x;
        if(i < n && i < chunk_size) {
          r_g[ii] = g[i];
          // special ?optimization? for lamb stage 1
          if (decay == 0) {
            r_p[ii] = MATH_T(0);
          }
          else {
            r_p[ii] = p[i];
          }
          r_m[ii] = m[i];
          r_v[ii] = v[i];
        } else {
          r_g[ii] = MATH_T(0);
          r_p[ii] = MATH_T(0);
          r_m[ii] = MATH_T(0);
          r_v[ii] = MATH_T(0);
        }
      }
#pragma unroll
      for(int ii = 0; ii < ILP; ii++) {
        if (mode == MOMENT_MODE_0) {
          MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
          // L2 on scaled grad
          scaled_grad = scaled_grad + decay*r_p[ii];
          r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
          r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
          MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
          MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
          MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
          r_p[ii] = next_m_unbiased / denom;
        }
        else {
          MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
          r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
          r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
          MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
          MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
          MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
          r_p[ii] = (next_m_unbiased/denom) + (decay*r_p[ii]);
        }
      }
#pragma unroll
      for(int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii*blockDim.x;
        if(i < n && i < chunk_size) {
          g[i] = r_p[ii];
          m[i] = r_m[ii];
          v[i] = r_v[ii];
        }
      }
    }
  }
};

// Step 2 reads in 'update' value and per-tensor param_norm and update_norm.
// It computes new parameter value.
template<typename T>
struct LAMBStage2Functor
{
  __device__ __forceinline__ void operator()(
    int chunk_size,
    volatile int* noop_gmem,
    TensorListMetadata<2>& tl,
    const float* per_tensor_param_norm,
    const float* per_tensor_update_norm,
    const float learning_rate,
    const float decay)
  {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
    //   return;

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int tensor_num = tl.start_tensor_this_launch + tensor_loc;
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    MATH_T ratio = learning_rate;
    // apply adaptive learning rate to parameters with non-zero weight decay
    if (decay != 0.0)
    {
      float param_norm = per_tensor_param_norm[tensor_num];
      float update_norm = per_tensor_update_norm[tensor_num];
      ratio = (update_norm != 0.0f && param_norm != 0.0f) ? learning_rate * (param_norm / update_norm) : learning_rate;
    }

    T* update = (T*)tl.addresses[0][tensor_loc];  update += chunk_idx*chunk_size;
    T* p = (T*)tl.addresses[1][tensor_loc];  p += chunk_idx*chunk_size;

    n -= chunk_idx*chunk_size;

    for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
    {
      MATH_T r_p[ILP];
      MATH_T r_update[ILP];
#pragma unroll
      for(int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii*blockDim.x;
        if(i < n && i < chunk_size) {
          r_p[ii] = p[i];
          r_update[ii] = update[i];
        }
      }
#pragma unroll
      for(int ii = 0; ii < ILP; ii++) {
        r_p[ii] = r_p[ii] - (ratio * r_update[ii]);
      }
#pragma unroll
      for(int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii*blockDim.x;
        if(i < n && i < chunk_size) {
          p[i] = r_p[ii];
        }
      }
    }
  }
};

void multi_tensor_lamb_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  const float lr,
  const float beta1,
  const float beta2,
  const float epsilon,
  const int step,
  const int bias_correction,
  const float weight_decay,
  const int grad_averaging,
  const int mode,
  const float global_grad_norm,
  const float max_grad_norm)
{
  using namespace at;

  // Master weight and 32bit momentum(potentially changing) is not handled by this
  // So we assume every tensor are all in the same type

  // Handle bias correction mode
  float bias_correction1 = 1.0f, bias_correction2 = 1.0f;
  if (bias_correction == 1) {
    bias_correction1 = 1 - std::pow(beta1, step);
    bias_correction2 = 1 - std::pow(beta2, step);
  }

  // Handle grad averaging mode
  float beta3 = 1.0f;
  if (grad_averaging == 1) beta3 = 1 - beta1;

  std::vector<std::vector<at::Tensor>> grad_list(tensor_lists.begin(), tensor_lists.begin()+1);
  std::vector<std::vector<at::Tensor>> param_list(tensor_lists.begin()+1, tensor_lists.begin()+2);

  // Compute per tensor param norm
  auto param_norm_tuple = multi_tensor_l2norm_cuda(chunk_size, noop_flag, param_list, true);

  // We now in-place modify grad to store update before compute its norm
  // Generally this is not a issue since people modify grad in step() method all the time
  // We can also grab list of empty tensor to avoid this, but I'd like to save space/cpu code
  DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "lamb_stage_1",
      multi_tensor_apply<4>(
        BLOCK_SIZE,
        chunk_size,
        noop_flag,
        tensor_lists,
        LAMBStage1Functor<scalar_t_0>(),
        beta1,
        beta2,
        beta3, // 1-beta1 or 1 depends on averaging mode
        bias_correction1,
        bias_correction2,
        epsilon,
        (adamMode_t) mode,
        weight_decay,
        global_grad_norm,
        max_grad_norm); )

  // Compute update norms
  auto update_norm_tuple = multi_tensor_l2norm_cuda(chunk_size, noop_flag, grad_list, true);

  std::vector<std::vector<at::Tensor>> grad_param_list(tensor_lists.begin(), tensor_lists.begin()+2);

  DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "lamb_stage_2",
      multi_tensor_apply<2>(
        BLOCK_SIZE,
        chunk_size,
        noop_flag,
        grad_param_list,
        LAMBStage2Functor<scalar_t_0>(),
        std::get<1>(param_norm_tuple).DATA_PTR<float>(),
        std::get<1>(update_norm_tuple).DATA_PTR<float>(),
        lr,
        weight_decay); )

  AT_CUDA_CHECK(cudaGetLastError());
}
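For reference, a plain-PyTorch sketch (my own, not apex code) of the math the two fused stages implement for a single tensor in the decoupled-weight-decay mode with bias correction and grad averaging enabled: stage 1 forms the Adam-style update in place of the gradient, stage 2 rescales the learning rate by the trust ratio ||p|| / ||update|| and applies the step.

import torch

def lamb_step_reference(p, g, m, v, lr, beta1, beta2, eps, step,
                        weight_decay, global_grad_norm, max_grad_norm):
    # Global gradient clipping factor, as in LAMBStage1Functor.
    clip = global_grad_norm / max_grad_norm if global_grad_norm > max_grad_norm else 1.0
    scaled_grad = g / clip
    # Stage 1: moments and per-element update (the kernel writes this into g).
    m.mul_(beta1).add_(scaled_grad, alpha=1.0 - beta1)
    v.mul_(beta2).addcmul_(scaled_grad, scaled_grad, value=1.0 - beta2)
    m_hat = m / (1.0 - beta1 ** step)
    v_hat = v / (1.0 - beta2 ** step)
    update = m_hat / (v_hat.sqrt() + eps) + weight_decay * p
    # Stage 2: per-tensor trust ratio rescales the learning rate.
    ratio = lr
    if weight_decay != 0.0:
        p_norm, u_norm = p.norm().item(), update.norm().item()
        if p_norm != 0.0 and u_norm != 0.0:
            ratio = lr * (p_norm / u_norm)
    p.sub_(ratio * update)

p = torch.randn(1000)
g = torch.randn(1000)
m, v = torch.zeros_like(p), torch.zeros_like(p)
lamb_step_reference(p, g, m, v, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-6,
                    step=1, weight_decay=0.01,
                    global_grad_norm=g.norm().item(), max_grad_norm=1.0)
print(p.abs().mean().item())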
apex/contrib/csrc/xentropy/xentropy_kernel.cu  (+178 -67)

@@ -70,7 +70,6 @@
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
...
@@ -84,6 +83,8 @@
 #include "type_shim.h"
 #include "compat.h"

+#define ALIGN_BYTES 16
+
 using Tensor = at::Tensor;
 using TensorList = at::TensorList;
 using ScalarType = at::ScalarType;
...
@@ -123,7 +124,7 @@ const int max_threads = 1024;
 inline dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) {
   uint64_t block_size = 1;
   uint64_t max_block_size = std::min(dim_size / ILP, static_cast<uint64_t>(max_threads));
-  while (block_size < max_block_size) block_size *= 2;
+  while (block_size < (max_block_size/2)) block_size *= 2;
   // Launch at least a single warp - the kernel assumes that.
   block_size = std::max(block_size, static_cast<uint64_t>(32));
   return dim3(block_size);
...
@@ -287,29 +288,40 @@ blockReduce(AccumT* smem,
 template <template<typename, typename> class Reduction, int ILP, typename T, typename AccumT>
 __device__ __forceinline__ AccumT
-ilpReduce(T* data,
+ilpReduce(int shift,
+          T* data,
           int size,
           const Reduction<T, AccumT>& r,
           AccumT defaultVal)
 {
+  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LoadT;
   AccumT threadVal = defaultVal;
   int offset = threadIdx.x;

+  // shift and do 1
+  if(shift > 0){
+    data -= shift;
+    size += shift;
+    if(threadIdx.x >= shift){
+      threadVal = r(threadVal, data[offset]);
+    }
+    size -= blockDim.x;
+    data += blockDim.x;
+  }
   int last = size % (ILP * blockDim.x);

-  // Body (unroll by ILP times)
-  for (; offset < size - last; offset += blockDim.x * ILP) {
-    T tmp[ILP];
-#pragma unroll
-    for (int j = 0; j < ILP; ++j)
-      tmp[j] = data[offset + j * blockDim.x];
-#pragma unroll
-    for (int j = 0; j < ILP; ++j)
-      threadVal = r(threadVal, tmp[j]);
+  T v[ILP];
+  LoadT* value = reinterpret_cast<LoadT*>(&v);
+
+  for (; offset * ILP < (size - last); offset += blockDim.x) {
+    *value = reinterpret_cast<LoadT*>(data)[offset];
+    for (int j = 0; j < ILP; ++j) {
+      threadVal = r(threadVal, v[j]);
+    }
   }

+  offset = size - last + threadIdx.x;
   // Epilogue
   for (; offset < size; offset += blockDim.x)
     threadVal = r(threadVal, data[offset]);
...
@@ -319,7 +331,8 @@ ilpReduce(T* data,
 template <template<typename, typename> class Reduction1, template<typename, typename> class Reduction2, int ILP, typename T, typename AccumT>
 __device__ __forceinline__ void
-ilpReduce(T* data,
+ilpReduce(int shift,
+          T* data,
           int size,
           AccumT* reducVal1,
           const Reduction1<T, AccumT>& r1,
...
@@ -328,27 +341,38 @@ ilpReduce(T* data,
           const Reduction2<T, AccumT>& r2,
           AccumT defaultVal2)
 {
+  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LoadT;
   AccumT threadVal1 = defaultVal1;
   AccumT threadVal2 = defaultVal2;
   int offset = threadIdx.x;

+  // shift and do 1
+  if(shift > 0){
+    data -= shift;
+    size += shift;
+    if(threadIdx.x >= shift){
+      threadVal1 = r1(threadVal1, data[offset]);
+      threadVal2 = r2(threadVal2, data[offset]);
+    }
+    size -= blockDim.x;
+    data += blockDim.x;
+  }
   int last = size % (ILP * blockDim.x);

-  // Body (unroll by ILP times)
-  for (; offset < size - last; offset += blockDim.x * ILP) {
-    T tmp[ILP];
-#pragma unroll
-    for (int j = 0; j < ILP; ++j)
-      tmp[j] = data[offset + j * blockDim.x];
-#pragma unroll
-    for (int j = 0; j < ILP; ++j) {
-      threadVal1 = r1(threadVal1, tmp[j]);
-      threadVal2 = r2(threadVal2, tmp[j]);
+  T v[ILP];
+  LoadT* value = reinterpret_cast<LoadT*>(&v);
+
+  for (; offset * ILP < (size - last); offset += blockDim.x) {
+    *value = reinterpret_cast<LoadT*>(data)[offset];
+    for (int j = 0; j < ILP; ++j) {
+      threadVal1 = r1(threadVal1, v[j]);
+      threadVal2 = r2(threadVal2, v[j]);
     }
   }

+  offset = size - last + threadIdx.x;
   // Epilogue
   for (; offset < size; offset += blockDim.x) {
     threadVal1 = r1(threadVal1, data[offset]);
...
@@ -375,17 +399,19 @@ cunn_SoftMaxXEntropyForward(
   // each block handles a sample in the mini-batch
   input += blockIdx.x * classes;
   //output += blockIdx.x * classes;
+  const int shift = ((uint64_t)input) % ALIGN_BYTES / sizeof(scalar_t);
   int64_t label = labels[blockIdx.x];

   // find the max and sum
   accscalar_t threadMax, threadSum, max_k, sum_k;
   ilpReduce<MaxFloat, AddFloat, ILP, scalar_t, accscalar_t>(
-      input, classes,
+      shift, input, classes,
       &threadMax, MaxFloat<scalar_t, accscalar_t>(),
       -at::numeric_limits<accscalar_t>::max(),
       &threadSum, AddFloat<scalar_t, accscalar_t>(),
       static_cast<accscalar_t>(0));
   blockReduce<Max, Add, accscalar_t>(
       sdata,
       &max_k, threadMax, Max<accscalar_t>(),
...
@@ -393,9 +419,7 @@ cunn_SoftMaxXEntropyForward(
       &sum_k, threadSum, Add<accscalar_t>(),
       static_cast<accscalar_t>(0));

   // reduce all values
   accscalar_t threadExp = ilpReduce<SumExpFloat, ILP, scalar_t, accscalar_t>(
-      input, classes, SumExpFloat<scalar_t, accscalar_t>(max_k), static_cast<accscalar_t>(0));
+      shift, input, classes, SumExpFloat<scalar_t, accscalar_t>(max_k), static_cast<accscalar_t>(0));
   accscalar_t sumAll = blockReduce<Add, accscalar_t>(
       sdata, threadExp, Add<accscalar_t>(), static_cast<accscalar_t>(0));
...
@@ -411,10 +435,9 @@ cunn_SoftMaxXEntropyForward(
   }
 }

-template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t, template<typename, typename, typename> class Epilogue>
-__global__ void
-cunn_SoftMaxXEntropyBackward(
+template<int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t>
+__device__ __forceinline__ void
+apply(
     scalar_t *gradInput,
     scalar_t *logits,
     outscalar_t *max_log_sum_exp,
     outscalar_t *gradOutput,
...
@@ -422,9 +445,6 @@ cunn_SoftMaxXEntropyBackward(
     const float smoothing,
     int classes)
 {
-  gradInput += blockIdx.x * classes;
-  logits += blockIdx.x * classes;
-
   accscalar_t smooth_positives = 1.0 - smoothing;
   accscalar_t smooth_negatives = smoothing / classes;
   accscalar_t tmpGradOutput = gradOutput[blockIdx.x];
...
@@ -433,6 +453,7 @@ cunn_SoftMaxXEntropyBackward(
   int offset = threadIdx.x;
   int last = classes % (ILP * blockDim.x);

   for (; offset < classes - last; offset += blockDim.x * ILP) {
     accscalar_t tmpLogits[ILP];
...
@@ -457,9 +478,99 @@ cunn_SoftMaxXEntropyBackward(
   }
 }

+template<int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t>
+__device__ __forceinline__ void
+aligned_apply(int shift,
+              scalar_t *gradInput,
+              scalar_t *logits,
+              outscalar_t *max_log_sum_exp,
+              outscalar_t *gradOutput,
+              int64_t *labels,
+              const float smoothing,
+              int classes)
+{
+  accscalar_t smooth_positives = 1.0 - smoothing;
+  accscalar_t smooth_negatives = smoothing / classes;
+  accscalar_t tmpGradOutput = gradOutput[blockIdx.x];
+  int64_t label = labels[blockIdx.x];
+  accscalar_t coeff = max_log_sum_exp[blockIdx.x];
+
+  int offset = threadIdx.x;
+
+  // shift and do 1
+  if(shift > 0){
+    logits -= shift;
+    gradInput -= shift;
+    classes += shift;
+    if(threadIdx.x >= shift){
+      gradInput[offset] = tmpGradOutput * (std::exp(static_cast<accscalar_t>(logits[offset]) - coeff) -
+        static_cast<accscalar_t>(((offset - shift) == label) ? 1 : 0) * smooth_positives - smooth_negatives);
+    }
+    classes -= blockDim.x;
+    gradInput += blockDim.x;
+    logits += blockDim.x;
+    shift -= blockDim.x;
+  }
+
+  int last = classes % (ILP * blockDim.x);
+
+  typedef typename std::aligned_storage<ILP*sizeof(scalar_t), ILP*alignof(scalar_t)>::type LoadT;
+  // input
+  scalar_t v[ILP];
+  LoadT* value = reinterpret_cast<LoadT*>(&v);
+  // output
+  scalar_t r[ILP];
+  LoadT* result = reinterpret_cast<LoadT*>(&r);
+
+  for (; offset * ILP < (classes - last); offset += blockDim.x) {
+    *value = reinterpret_cast<LoadT*>(logits)[offset];
+#pragma unroll
+    for (int j = 0; j < ILP; ++j) {
+      r[j] = tmpGradOutput * (std::exp(static_cast<accscalar_t>(v[j]) - coeff) -
+        static_cast<accscalar_t>(((ILP * offset + j - shift) == label) ? 1 : 0) * smooth_positives - smooth_negatives);
+    }
+    reinterpret_cast<LoadT*>(gradInput)[offset] = *result;
+  }
+
+  offset = classes - last + threadIdx.x;
+  for (; offset < classes; offset += blockDim.x)
+    gradInput[offset] = tmpGradOutput * (std::exp(static_cast<accscalar_t>(logits[offset]) - coeff) -
+      static_cast<accscalar_t>(((offset - shift) == label) ? 1 : 0) * smooth_positives - smooth_negatives);
+}
+
+template<int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t, template<typename, typename, typename> class Epilogue>
+__global__ void
+cunn_SoftMaxXEntropyBackward(
+    scalar_t *gradInput,
+    scalar_t *logits,
+    outscalar_t *max_log_sum_exp,
+    outscalar_t *gradOutput,
+    int64_t *labels,
+    const float smoothing,
+    int classes)
+{
+  gradInput += blockIdx.x * classes;
+  logits += blockIdx.x * classes;
+
+  // Do vectorized load/store when input/output have same alignment
+  const int shift = ((uint64_t)logits) % ALIGN_BYTES / sizeof(scalar_t);
+  const int shift_ = ((uint64_t)gradInput) % ALIGN_BYTES / sizeof(scalar_t);
+  if (shift == shift_){
+    aligned_apply<ILP, scalar_t, accscalar_t, outscalar_t>(shift, gradInput, logits, max_log_sum_exp, gradOutput, labels, smoothing, classes);
+  }
+  else {
+    apply<ILP, scalar_t, accscalar_t, outscalar_t>(gradInput, logits, max_log_sum_exp, gradOutput, labels, smoothing, classes);
+  }
+}
+
 template<template<typename, typename, typename> class Epilogue>
 std::vector<Tensor> host_softmax_xentropy(
...
@@ -495,13 +606,13 @@ std::vector<Tensor> host_softmax_xentropy(
   // XXX: it assumes that inner_size == 1
   TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");

-  const int ILP = 2;
   dim3 grid(outer_size);
-  dim3 block = SoftMax_getBlockSize(ILP, dim_size);

   using namespace at;
   DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "host_softmax_xentropy",
     using accscalar_t = at::acc_type<scalar_t_0, true>;
+    const int ILP = sizeof(float4)/sizeof(scalar_t_0);
+    dim3 block = SoftMax_getBlockSize(ILP, dim_size);
     if (!half_to_float) {
       cunn_SoftMaxXEntropyForward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
         <<<grid, block, 2 * block.x * sizeof(accscalar_t), stream>>>(
...
@@ -564,12 +675,12 @@ Tensor host_softmax_xentropy_backward(
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();

   TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");

-  const int ILP = 2;
   dim3 grid(outer_size);
-  dim3 block = SoftMax_getBlockSize(ILP, dim_size);

   DISPATCH_FLOAT_AND_HALF(gI.scalar_type(), 0, "host_softmax_xentropy_backward",
     using accscalar_t = acc_type<scalar_t_0, true>;
+    const int ILP = sizeof(float4)/sizeof(scalar_t_0);
+    dim3 block = SoftMax_getBlockSize(ILP, dim_size);
     if (!half_to_float) {
       cunn_SoftMaxXEntropyBackward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
         <<<grid, block, block.x * sizeof(accscalar_t), stream>>>(
...
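The `shift` introduced throughout this file is the number of elements by which a row's pointer sits past the previous 16-byte boundary; the kernels handle that misaligned prefix element-wise and then read the rest with float4-width loads. A small sketch of the same arithmetic on the host side (my own illustration, not apex code):

import torch

ALIGN_BYTES = 16  # same constant the kernel defines

def leading_misalignment(t: torch.Tensor) -> int:
    # Elements between the previous 16-byte boundary and the tensor's data
    # pointer, i.e. the "shift" the kernels compute per row.
    return (t.data_ptr() % ALIGN_BYTES) // t.element_size()

x = torch.arange(64, dtype=torch.float32)
print(leading_misalignment(x), leading_misalignment(x[1:]))  # typically 0 and 1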
apex/contrib/multihead_attn/self_multihead_attn_func.py  (+1 -1)

@@ -183,7 +183,7 @@ class SelfAttnFunc(torch.autograd.Function):
         values_grads = torch.bmm(dropout_results.transpose(1, 2), output_lin_grads, out=values_grads.transpose(0, 1))

         # Mask and Scaling for Dropout (not a publically documented op)
-        dropout_grads = torch._masked_scale(matmul2_dgrad1, dropout_mask, dropout_prob_t[0])
+        dropout_grads = torch._masked_scale(matmul2_dgrad1, dropout_mask, 1.0/(1.0-dropout_prob_t[0]))

         # Softmax Grad (not a publically documented op)
         softmax_grads = torch._softmax_backward_data(dropout_grads, softmax_results, -1, softmax_results)
...
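The corrected scale is the usual inverted-dropout factor: kept values are multiplied by 1/(1-p) so the expected activation matches eval mode, and the backward pass must apply the same factor. A quick illustration of that convention in plain PyTorch (my own sketch, unrelated to the fused op):

import torch

p = 0.1
x = torch.randn(4, 8)
mask = (torch.rand_like(x) > p).to(x.dtype)
y = x * mask * (1.0 / (1.0 - p))   # same 1/(1-p) factor passed to _masked_scale above
print(x.mean().item(), y.mean().item())  # close in expectation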
apex/contrib/optimizers/__init__.py  (+1 -0)

 from .fp16_optimizer import FP16_Optimizer
 from .fused_adam import FusedAdam
+from .fused_lamb import FusedLAMB
apex/contrib/optimizers/fp16_optimizer.py  (+2 -1)

@@ -239,4 +239,5 @@ class FP16_Optimizer(object):
         # constructed in the same way as the one whose state_dict we are loading, the same master params
         # are guaranteed to exist, so we can just copy_() from the saved master params.
         for current, saved in zip(self.fp32_groups, state_dict['fp32_groups']):
-            current.data.copy_(saved.data)
+            for _current, _saved in zip(current, saved):
+                _current.data.copy_(_saved.data)
apex/contrib/optimizers/fused_lamb.py  (new file, +208 -0)

import torch
import importlib
import math
from apex.multi_tensor_apply import multi_tensor_applier

class FusedLAMB(torch.optim.Optimizer):

    """Implements LAMB algorithm.

    Currently GPU-only. Requires Apex to be installed via
    ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_lamb" ./``.

    This version of fused LAMB implements 2 fusions.

      * Fusion of the LAMB update's elementwise operations
      * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.

    :class:`apex.contrib.optimizers.FusedLAMB`'s usage is identical to any ordinary Pytorch optimizer::

        opt = apex.contrib.optimizers.FusedLAMB(model.parameters(), lr = ....)
        ...
        opt.step()

    :class:`apex.optimizers.FusedLAMB` may be used with or without Amp. If you wish to use :class:`FusedLAMB` with Amp,
    you may choose any ``opt_level``::

        opt = apex.optimizers.FusedLAMB(model.parameters(), lr = ....)
        model, opt = amp.initialize(model, opt, opt_level="O0" or "O1 or "O2")
        ...
        opt.step()

    In general, ``opt_level="O1"`` is recommended.

    LAMB was proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its norm. (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            NOT SUPPORTED now! (default: False)
        adam_w_mode (boolean, optional): Apply L2 regularization or weight decay
            True for decoupled weight decay(also known as AdamW) (default: True)
        grad_averaging (bool, optional): whether apply (1-beta2) to grad when
            calculating running averages of gradient. (default: True)
        set_grad_none (bool, optional): whether set grad to None when zero_grad()
            method is called. (default: True)
        max_grad_norm (float, optional): value used to clip global grad norm
            (default: 1.0)

    .. _Large Batch Optimization for Deep Learning\: Training BERT in 76 minutes:
        https://arxiv.org/abs/1904.00962
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, bias_correction=True,
                 betas=(0.9, 0.999), eps=1e-6, weight_decay=0.01,
                 amsgrad=False, adam_w_mode=True,
                 grad_averaging=True, set_grad_none=True,
                 max_grad_norm=1.0):
        if amsgrad:
            raise RuntimeError('FusedLAMB does not support the AMSGrad variant.')
        defaults = dict(lr=lr, bias_correction=bias_correction,
                        betas=betas, eps=eps, weight_decay=weight_decay,
                        grad_averaging=grad_averaging,
                        max_grad_norm=max_grad_norm)
        super(FusedLAMB, self).__init__(params, defaults)
        if multi_tensor_applier.available:
            import amp_C
            self.multi_tensor_l2norm = amp_C.multi_tensor_l2norm
            self._dummy_overflow_buf = torch.cuda.IntTensor([0])
            fused_lamb_cuda = importlib.import_module("fused_lamb_cuda")
            self.multi_tensor_lamb = fused_lamb_cuda.lamb
        else:
            raise RuntimeError('apex.contrib.optimizers.FusedLAMB requires cuda extensions')

        self.adam_w_mode = 1 if adam_w_mode else 0
        self.set_grad_none = set_grad_none

    def zero_grad(self):
        if self.set_grad_none:
            for group in self.param_groups:
                for p in group['params']:
                    p.grad = None
        else:
            super(FusedLAMB, self).zero_grad()

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        # create separate grad lists for fp32 and fp16 params
        g_all_32, g_all_16 = [], []
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                if p.dtype == torch.float32:
                    g_all_32.append(p.grad.data)
                elif p.dtype == torch.float16:
                    g_all_16.append(p.grad.data)
                else:
                    raise RuntimeError('FusedLAMB only support fp16 and fp32.')

        g_norm_32, g_norm_16 = 0.0, 0.0
        # compute grad norm for two lists
        if len(g_all_32) > 0:
            g_norm_32 = multi_tensor_applier(self.multi_tensor_l2norm,
                                             self._dummy_overflow_buf,
                                             [g_all_32], False)[0].item()
        if len(g_all_16) > 0:
            g_norm_16 = multi_tensor_applier(self.multi_tensor_l2norm,
                                             self._dummy_overflow_buf,
                                             [g_all_16], False)[0].item()

        # blend two grad norms to get global grad norm
        global_grad_norm = math.sqrt(g_norm_32 * g_norm_32 + g_norm_16 * g_norm_16)
        max_grad_norm = self.defaults['max_grad_norm']

        for group in self.param_groups:
            bias_correction = 1 if group['bias_correction'] else 0
            beta1, beta2 = group['betas']
            grad_averaging = 1 if group['grad_averaging'] else 0

            # assume same step across group now to simplify things
            # per parameter step can be easily support by making it tensor, or pass list into kernel
            if 'step' in group:
                group['step'] += 1
            else:
                group['step'] = 1

            # create lists for multi-tensor apply
            g_16, p_16, m_16, v_16 = [], [], [], []
            g_32, p_32, m_32, v_32 = [], [], [], []

            for p in group['params']:
                if p.grad is None:
                    continue
                if p.grad.data.is_sparse:
                    raise RuntimeError('FusedLAMB does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)

                if p.dtype == torch.float16:
                    g_16.append(p.grad.data)
                    p_16.append(p.data)
                    m_16.append(state['exp_avg'])
                    v_16.append(state['exp_avg_sq'])
                elif p.dtype == torch.float32:
                    g_32.append(p.grad.data)
                    p_32.append(p.data)
                    m_32.append(state['exp_avg'])
                    v_32.append(state['exp_avg_sq'])
                else:
                    raise RuntimeError('FusedLAMB only support fp16 and fp32.')

            if(len(g_16) > 0):
                multi_tensor_applier(self.multi_tensor_lamb,
                                     self._dummy_overflow_buf,
                                     [g_16, p_16, m_16, v_16],
                                     group['lr'],
                                     beta1,
                                     beta2,
                                     group['eps'],
                                     group['step'],
                                     bias_correction,
                                     group['weight_decay'],
                                     grad_averaging,
                                     self.adam_w_mode,
                                     global_grad_norm,
                                     max_grad_norm)
            if(len(g_32) > 0):
                multi_tensor_applier(self.multi_tensor_lamb,
                                     self._dummy_overflow_buf,
                                     [g_32, p_32, m_32, v_32],
                                     group['lr'],
                                     beta1,
                                     beta2,
                                     group['eps'],
                                     group['step'],
                                     bias_correction,
                                     group['weight_decay'],
                                     grad_averaging,
                                     self.adam_w_mode,
                                     global_grad_norm,
                                     max_grad_norm)

        return loss
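A hedged end-to-end sketch of using the new optimizer; it assumes apex was built with the deprecated fused-LAMB extension named in the docstring above and that a CUDA device is available. The model and hyperparameters are my own placeholders.

import torch
import torch.nn as nn
from apex.contrib.optimizers import FusedLAMB

model = nn.Linear(1024, 1024).cuda()
opt = FusedLAMB(model.parameters(), lr=1e-3, weight_decay=0.01, max_grad_norm=1.0)

x = torch.randn(32, 1024, device='cuda')
loss = model(x).pow(2).mean()
loss.backward()
opt.step()        # one fused two-stage LAMB update across all parameters
opt.zero_grad()   # sets .grad to None by default (set_grad_none=True)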
apex/mlp/__init__.py  (new file, +1 -0)

from .mlp import *
apex/mlp/mlp.py  (new file, +79 -0)

from copy import copy
import math
import torch
from torch import nn
import mlp_cuda
from .. import amp

class MlpFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, bias, activation, *args):
        output = mlp_cuda.forward(bias, activation, args)
        ctx.save_for_backward(*args)
        ctx.outputs = output
        ctx.bias = bias
        ctx.activation = activation
        return output[0]

    @staticmethod
    def backward(ctx, grad_o):
        grads = mlp_cuda.backward(ctx.bias, ctx.activation, grad_o, ctx.outputs, ctx.saved_tensors)
        del ctx.outputs
        return (None, None, *grads)

mlp_function = amp.half_function(MlpFunction.apply)

class MLP(torch.nn.Module):
    """Launch MLP in C++

    Args:
        mlp_sizes (list of int): MLP sizes. Example: [1024,1024,1024] will create 2 MLP layers with shape 1024x1024
        bias (bool): Default True
        relu (bool): Default True
    """
    def __init__(self, mlp_sizes, bias=True, activation='relu'):
        super(MLP, self).__init__()
        self.num_layers = len(mlp_sizes) - 1
        self.mlp_sizes = copy(mlp_sizes)
        self.bias = 1 if bias else 0

        if activation == 'none':
            self.activation = 0
        elif activation == 'relu':
            self.activation = 1
        elif activation == 'sigmoid':
            self.activation = 2
        else:
            raise TypeError("activation must be relu or none.")

        self.weights = []
        self.biases = []
        for i in range(self.num_layers):
            w = torch.nn.Parameter(torch.empty(mlp_sizes[i + 1], mlp_sizes[i]))
            self.weights.append(w)
            name = 'weight_{}'.format(i)
            setattr(self, name, w)
            if self.bias:
                b = torch.nn.Parameter(torch.empty(mlp_sizes[i + 1]))
                self.biases.append(b)
                name = 'bias_{}'.format(i)
                setattr(self, name, b)

        self.reset_parameters()

    def reset_parameters(self):
        for weight in self.weights:
            dimsum = weight.size(0) + weight.size(1)
            std = math.sqrt(2. / float(dimsum))
            nn.init.normal_(weight, 0., std)
        if self.bias:
            for bias in self.biases:
                std = math.sqrt(1. / float(bias.size(0)))
                nn.init.normal_(bias, 0., std)

    def forward(self, input):
        return mlp_function(self.bias, self.activation, input, *self.weights, *self.biases)

    def extra_repr(self):
        s = F"MLP sizes: {self.mlp_sizes}, Bias={self.bias}, activation={self.activation}"
        return s
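A hypothetical usage sketch of the new module; it assumes apex was built with the MLP CUDA sources so the `mlp_cuda` extension imported above is available, plus a CUDA device. The sizes are arbitrary.

import torch
from apex.mlp import MLP

# [480, 1024, 1024, 512] -> three fused Linear(+ReLU) layers in one C++ call.
model = MLP([480, 1024, 1024, 512], bias=True, activation='relu').cuda().half()
x = torch.randn(128, 480, device='cuda', dtype=torch.float16, requires_grad=True)
y = model(x)
y.sum().backward()
print(y.shape)  # torch.Size([128, 512])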
apex/parallel/LARC.py  (+12 -1)

@@ -37,7 +37,6 @@ class LARC(object):
     """

     def __init__(self, optimizer, trust_coefficient=0.02, clip=True, eps=1e-8):
-        self.param_groups = optimizer.param_groups
         self.optim = optimizer
         self.trust_coefficient = trust_coefficient
         self.eps = eps
...
@@ -49,9 +48,21 @@ class LARC(object):
     def __setstate__(self, state):
         self.optim.__setstate__(state)

+    @property
+    def state(self):
+        return self.optim.state
+
     def __repr__(self):
         return self.optim.__repr__()

+    @property
+    def param_groups(self):
+        return self.optim.param_groups
+
+    @param_groups.setter
+    def param_groups(self, value):
+        self.optim.param_groups = value
+
     def state_dict(self):
         return self.optim.state_dict()
...
apex/pyprof/__init__.py  (+1 -1)

 import warnings

-from . import nvtx
+from . import nvtx, prof
apex/pyprof/prof/__init__.py  (+1 -0)

+from . import data, prof
apex/pyprof/prof/utility.py  (+5 -3)

@@ -9,7 +9,7 @@ class Utility(object):
     @staticmethod
     def typeToBytes(t):
-        if (t in ["uint8", "int8", "byte", "char"]):
+        if (t in ["uint8", "int8", "byte", "char", "bool"]):
             return 1
         elif (t in ["float16", "half", "int16", "short"]):
             return 2
...
@@ -21,7 +21,7 @@ class Utility(object):
     @staticmethod
     def typeToString(t):
-        if (t in ["uint8", "byte", "char"]):
+        if (t in ["uint8", "byte", "char",]):
             return "uint8"
         elif (t in ["int8",]):
             return "int8"
...
@@ -37,6 +37,8 @@ class Utility(object):
             return "int64"
         elif (t in ["float64", "double",]):
             return "fp64"
+        elif (t in ["bool",]):
+            return "bool"
         assert False

     @staticmethod
...
csrc/mlp.cpp  (new file, +164 -0)

#include <torch/extension.h>
#include <torch/torch.h>
#include <vector>

#include <stdio.h>

size_t get_mlp_reserved_space(int batch_size, int num_layers, const int* output_features);

template <typename T>
size_t get_mlp_bp_workspace_in_bytes(int batch_size, int num_layers, const int* output_features);

template <typename T>
int mlp_fp(
    T* X, int input_features, int batch_size,
    T** WPtr, int num_layers, int* output_features, T** BPtr,
    T* Y, T* reserved_space, int use_bias, int activation);

template <typename T>
int mlp_bp(
    T* X, T* Y, int input_features, int batch_size,
    T** WPtr, int num_layers, int* output_features,
    T* dY, T* reserved_space, T* work_space,
    T* dX, T** dwPtr, T** dbPtr,
    bool requires_grad, int use_bias, int activation);

std::vector<at::Tensor> mlp_forward(int use_bias, int activation, std::vector<at::Tensor> inputs) {
  auto num_layers = inputs.size() - 1;
  if (use_bias) {
    // inputs contains (input, weights, biases)
    num_layers /= 2;
  }
  auto batch_size = inputs[0].size(0);
  auto input_features = inputs[0].size(1);

  std::vector<int> output_features;
  for (int i = 0; i < num_layers; i++) {
    output_features.push_back(inputs[i + 1].size(0));
  }

  auto reserved_size = get_mlp_reserved_space(batch_size, num_layers, output_features.data());

  // create output/workspace tensor
  // TODO(deyuf): just get buffer?
  auto out = at::empty({batch_size, output_features.back()}, inputs[0].type());
  auto reserved_space = at::empty({reserved_size}, inputs[0].type());

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(inputs[0].type(), "mlp_forward", [&] {
    std::vector<scalar_t*> w_ptr;
    std::vector<scalar_t*> b_ptr;
    for (int i = 0; i < num_layers; i++) {
      w_ptr.push_back(inputs[i + 1].data_ptr<scalar_t>());
      if (use_bias) {
        b_ptr.push_back(inputs[i + 1 + num_layers].data_ptr<scalar_t>());
      }
    }
    auto result = mlp_fp<scalar_t>(
        inputs[0].data_ptr<scalar_t>(),
        input_features,
        batch_size,
        w_ptr.data(),
        num_layers,
        output_features.data(),
        b_ptr.data(),
        out.data_ptr<scalar_t>(),
        reserved_space.data_ptr<scalar_t>(),
        use_bias,
        activation);
  });

  return {out, reserved_space};
}

std::vector<at::Tensor> mlp_backward(
    int use_bias,
    int activation,
    at::Tensor grad_o,
    std::vector<at::Tensor> fprop_outputs,
    std::vector<at::Tensor> inputs) {
  auto num_layers = inputs.size() - 1;
  if (use_bias) {
    // inputs contains (input, weights, biases)
    num_layers /= 2;
  }

  auto batch_size = inputs[0].size(0);
  auto input_features = inputs[0].size(1);

  // TODO: not creating empty tensor for it?
  bool requires_grad = inputs[0].requires_grad();

  std::vector<int> output_features;
  for (int i = 0; i < num_layers; i++) {
    output_features.push_back(inputs[i + 1].size(0));
  }
  // create outputs, length of inputs
  // TODO: not create bias if not needed
  std::vector<at::Tensor> outputs;
  for (int i = 0; i < inputs.size(); i++) {
    outputs.push_back(at::empty(inputs[i].sizes(), inputs[i].type()));  // clone for testing now
  }

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(inputs[0].type(), "mlp_backward", [&] {
    std::vector<scalar_t*> w_ptr;
    for (int i = 0; i < num_layers; i++) {
      w_ptr.push_back(inputs[i + 1].data_ptr<scalar_t>());
    }
    std::vector<scalar_t*> outputs_ptr;
    for (int i = 0; i < inputs.size(); i++) {
      outputs_ptr.push_back(outputs[i].data_ptr<scalar_t>());
    }

    auto work_size = get_mlp_bp_workspace_in_bytes<scalar_t>(batch_size, num_layers, output_features.data());

    // auto work_space = at::empty({work_size*4}, at::kByte);
    auto work_space = at::empty({work_size / sizeof(scalar_t)}, inputs[0].type());

    auto result = mlp_bp<scalar_t>(
        inputs[0].data_ptr<scalar_t>(),
        fprop_outputs[0].data_ptr<scalar_t>(),
        input_features,
        batch_size,
        w_ptr.data(),
        num_layers,
        output_features.data(),
        grad_o.contiguous().data_ptr<scalar_t>(),
        fprop_outputs[1].data_ptr<scalar_t>(),
        work_space.data_ptr<scalar_t>(),
        outputs_ptr[0],
        outputs_ptr.data() + 1,
        outputs_ptr.data() + 1 + num_layers,
        requires_grad,
        use_bias,
        activation);
  });

  return outputs;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &mlp_forward, "MLP forward");
  m.def("backward", &mlp_backward, "MLP backward");
}
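A small sketch (my own illustration, not part of the extension) of the flat argument layout `mlp_forward` and `mlp_backward` expect when `use_bias` is set: `[input, w_0..w_{L-1}, b_0..b_{L-1}]`, which is why `num_layers = (len(inputs) - 1) // 2` in the C++ above and why `MLP.forward` passes `input, *self.weights, *self.biases`.

def split_mlp_inputs(inputs, use_bias=True):
    # Hypothetical helper mirroring the indexing in mlp.cpp.
    num_layers = len(inputs) - 1
    if use_bias:
        num_layers //= 2
    x = inputs[0]
    weights = inputs[1:1 + num_layers]
    biases = inputs[1 + num_layers:] if use_bias else []
    return x, weights, biases

x, ws, bs = split_mlp_inputs(["x", "w0", "w1", "w2", "b0", "b1", "b2"])
print(len(ws), len(bs))  # 3 3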
csrc/mlp_cuda.cu
0 → 100644
View file @
0bfb8300
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <torch/torch.h>
/* Includes, cuda */
#include <cublas_v2.h>
#include <cuda_runtime.h>
// constants for fused bias+relu kernel
#define BIAS_RELU_FW_NTHREADS 128 // forward number of thread per block
#define BIAS_RELU_BW_NTHREADS_X 32 // backward number of thread in feature dim
#define BIAS_RELU_BW_NTHREADS_Y 16 // backward number of thread in batch dim
#define BIAS_RELU_RED_PER_THREAD 16 // backward minimal reduction length per thread
// move to a header later on
#define ILP 4
template
<
typename
T
>
__host__
__device__
__forceinline__
bool
is_aligned
(
T
*
p
){
return
((
uint64_t
)
p
)
%
(
ILP
*
sizeof
(
T
))
==
0
;
}
template
<
typename
T
>
__device__
__forceinline__
void
load_store
(
T
*
dst
,
T
*
src
,
int
dst_offset
,
int
src_offset
){
typedef
typename
std
::
aligned_storage
<
ILP
*
sizeof
(
T
),
ILP
*
alignof
(
T
)
>::
type
LT
;
((
LT
*
)
dst
)[
dst_offset
]
=
((
LT
*
)
src
)[
src_offset
];
}
template
<
typename
T
>
__device__
__forceinline__
void
load_store
(
T
*
dst
,
volatile
T
*
src
,
int
dst_offset
,
int
src_offset
){
typedef
typename
std
::
aligned_storage
<
ILP
*
sizeof
(
T
),
ILP
*
alignof
(
T
)
>::
type
LT
;
((
LT
*
)
dst
)[
dst_offset
]
=
((
LT
*
)
src
)[
src_offset
];
}
template
<
typename
T
>
__device__
__forceinline__
void
load_store
(
volatile
T
*
dst
,
T
*
src
,
int
dst_offset
,
int
src_offset
){
typedef
typename
std
::
aligned_storage
<
ILP
*
sizeof
(
T
),
ILP
*
alignof
(
T
)
>::
type
LT
;
((
LT
*
)
dst
)[
dst_offset
]
=
((
LT
*
)
src
)[
src_offset
];
}
// Keep ReLU in float only. When using half, cast to float before calling.
__device__
__inline__
float
relu
(
float
a
)
{
float
retf
=
max
(
a
,
0.
f
);
return
(
retf
);
}
// Keep Sigmoid in float only. When using half, cast to float before calling.
__device__
__inline__
float
sigmoid
(
float
a
)
{
float
retf
=
1.
f
/
(
1.
f
+
expf
(
-
a
));
return
(
retf
);
}
// FP64 Wrapper around cublas GEMMEx
cublasStatus_t
mlp_gemm
(
cublasHandle_t
handle
,
cublasOperation_t
transa
,
cublasOperation_t
transb
,
int
m
,
int
n
,
int
k
,
float
*
alpha
,
const
double
*
A
,
int
lda
,
const
double
*
B
,
int
ldb
,
const
float
*
beta
,
double
*
C
,
int
ldc
)
{
return
cublasGemmEx
(
handle
,
transa
,
transb
,
m
,
n
,
k
,
alpha
,
A
,
CUDA_R_64F
,
lda
,
B
,
CUDA_R_64F
,
ldb
,
beta
,
C
,
CUDA_R_64F
,
ldc
,
CUDA_R_64F
,
CUBLAS_GEMM_DEFAULT
);
}
// FP32 Wrapper around cublas GEMMEx
cublasStatus_t
mlp_gemm
(
cublasHandle_t
handle
,
cublasOperation_t
transa
,
cublasOperation_t
transb
,
int
m
,
int
n
,
int
k
,
float
*
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
const
float
*
beta
,
float
*
C
,
int
ldc
)
{
return
cublasGemmEx
(
handle
,
transa
,
transb
,
m
,
n
,
k
,
alpha
,
A
,
CUDA_R_32F
,
lda
,
B
,
CUDA_R_32F
,
ldb
,
beta
,
C
,
CUDA_R_32F
,
ldc
,
CUDA_R_32F
,
CUBLAS_GEMM_DEFAULT
);
}
// FP16 Tensor core wrapper around cublas GEMMEx
cublasStatus_t
mlp_gemm
(
cublasHandle_t
handle
,
cublasOperation_t
transa
,
cublasOperation_t
transb
,
int
m
,
int
n
,
int
k
,
float
*
alpha
,
const
at
::
Half
*
A
,
int
lda
,
const
at
::
Half
*
B
,
int
ldb
,
float
*
beta
,
at
::
Half
*
C
,
int
ldc
)
{
return
cublasGemmEx
(
handle
,
transa
,
transb
,
m
,
n
,
k
,
alpha
,
A
,
CUDA_R_16F
,
lda
,
B
,
CUDA_R_16F
,
ldb
,
beta
,
C
,
CUDA_R_16F
,
ldc
,
CUDA_R_32F
,
CUBLAS_GEMM_DEFAULT_TENSOR_OP
);
}
// Bias ADD. Assume input X is [features x batch size], column major.
// Bias is one 'features' long vector, with implicit broadcast.
template
<
typename
T
>
__global__
void
biasAdd_fprop
(
T
*
X
,
T
*
b
,
uint
batch_size
,
uint
features
)
{
T
r_x
[
ILP
];
T
r_b
[
ILP
];
if
(
is_aligned
(
X
)
&&
is_aligned
(
b
)
&&
features
%
ILP
==
0
)
{
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
for
(;
tid
*
ILP
<
features
*
batch_size
;
tid
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
row
=
tid
%
(
features
/
ILP
);
load_store
(
r_x
,
X
,
0
,
tid
);
load_store
(
r_b
,
b
,
0
,
row
);
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
float
bias_sum
=
static_cast
<
float
>
(
r_x
[
ii
])
+
static_cast
<
float
>
(
r_b
[
ii
]);
r_x
[
ii
]
=
bias_sum
;
}
load_store
(
X
,
r_x
,
tid
,
0
);
}
}
else
{
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
for
(;
tid
<
features
*
batch_size
;
tid
+=
ILP
*
blockDim
.
x
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
int
idx
=
tid
+
ii
*
blockDim
.
x
*
gridDim
.
x
;
if
(
idx
<
features
*
batch_size
)
{
int
row
=
tid
%
features
;
r_x
[
ii
]
=
X
[
idx
];
r_b
[
ii
]
=
b
[
row
];
}
}
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
float
bias_sum
=
static_cast
<
float
>
(
r_x
[
ii
])
+
static_cast
<
float
>
(
r_b
[
ii
]);
r_x
[
ii
]
=
bias_sum
;
}
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
int
idx
=
tid
+
ii
*
blockDim
.
x
*
gridDim
.
x
;
if
(
idx
<
features
*
batch_size
)
{
X
[
idx
]
=
r_x
[
ii
];
}
}
}
}
}
// Bias ADD + ReLU. Assume input X is [features x batch size], column major.
// Activation supports fused ReLU. Safe to call in-place.
template <typename T>
__global__ void biasAddRelu_fprop(T* X, T* b, uint batch_size, uint features) {
  T r_x[ILP];
  T r_b[ILP];
  if (is_aligned(X) && is_aligned(b) && features % ILP == 0) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid * ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
      int row = tid % (features / ILP);
      load_store(r_x, X, 0, tid);
      load_store(r_b, b, 0, row);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        float bias_sum = static_cast<float>(r_x[ii]) + static_cast<float>(r_b[ii]);
        r_x[ii] = relu(bias_sum);
      }
      load_store(X, r_x, tid, 0);
    }
  } else {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          int row = tid % features;
          r_x[ii] = X[idx];
          r_b[ii] = b[row];
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        float bias_sum = static_cast<float>(r_x[ii]) + static_cast<float>(r_b[ii]);
        r_x[ii] = relu(bias_sum);
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          X[idx] = r_x[ii];
        }
      }
    }
  }
}
// ReLU. Assume input X is [features x batch size], column major.
// Safe to call in-place.
template <typename T>
__global__ void Relu_fprop(T* X, uint batch_size, uint features) {
  T r_x[ILP];
  if (is_aligned(X) && features % ILP == 0) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid * ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
      load_store(r_x, X, 0, tid);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        r_x[ii] = relu(static_cast<float>(r_x[ii]));
      }
      load_store(X, r_x, tid, 0);
    }
  } else {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          r_x[ii] = X[idx];
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        r_x[ii] = relu(static_cast<float>(r_x[ii]));
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          X[idx] = r_x[ii];
        }
      }
    }
  }
}
// Sigmoid. Assume input X is [features x batch size], column major.
// Safe to call in-place.
template <typename T>
__global__ void Sigmoid_fprop(T* X, uint batch_size, uint features) {
  T r_x[ILP];
  if (is_aligned(X) && features % ILP == 0) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid * ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
      load_store(r_x, X, 0, tid);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        r_x[ii] = sigmoid(static_cast<float>(r_x[ii]));
      }
      load_store(X, r_x, tid, 0);
    }
  } else {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          r_x[ii] = X[idx];
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        r_x[ii] = sigmoid(static_cast<float>(r_x[ii]));
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          X[idx] = r_x[ii];
        }
      }
    }
  }
}
// ReLU. Assume input X is [features x batch size], column major.
// Safe to call in-place.
template <typename T>
__global__ void Relu_bprop(T* dY, T* Y, uint batch_size, uint features, T* dX) {
  T r_dy[ILP];
  T r_y[ILP];
  if (is_aligned(dY) && is_aligned(Y) && is_aligned(dX) && features % ILP == 0) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid * ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
      load_store(r_dy, dY, 0, tid);
      load_store(r_y, Y, 0, tid);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        if ((float)r_y[ii] <= 0.f)
          r_dy[ii] = 0;
      }
      load_store(dX, r_dy, tid, 0);
    }
  } else {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          r_dy[ii] = dY[idx];
          r_y[ii] = Y[idx];
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        if ((float)r_y[ii] <= 0.f)
          r_dy[ii] = 0;
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          dX[idx] = r_dy[ii];
        }
      }
    }
  }
}
// Sigmoid. Assume input X is [features x batch size], column major.
// Safe to call in-place.
template <typename T>
__global__ void Sigmoid_bprop(T* dY, T* Y, uint batch_size, uint features, T* dX) {
  T r_dy[ILP];
  T r_y[ILP];
  if (is_aligned(dY) && is_aligned(Y) && is_aligned(dX) && features % ILP == 0) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid * ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
      load_store(r_dy, dY, 0, tid);
      load_store(r_y, Y, 0, tid);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        float grad_out = r_dy[ii];
        float out = r_y[ii];
        float grad_i = out * (1.f - out) * grad_out;
        r_dy[ii] = grad_i;
      }
      load_store(dX, r_dy, tid, 0);
    }
  } else {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          r_dy[ii] = dY[idx];
          r_y[ii] = Y[idx];
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        float grad_out = r_dy[ii];
        float out = r_y[ii];
        float grad_i = out * (1.f - out) * grad_out;
        r_dy[ii] = grad_i;
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          dX[idx] = r_dy[ii];
        }
      }
    }
  }
}
// Compute grid size for pointwise backward kernel.
// block_x/y is the total number of elements handled per block, not the number of threads
void get_biasAddRelu_bprop_grid_size(
    int yfeat,
    int batch_size,
    int block_x,
    int block_y,
    int* grid_x,
    int* grid_y) {
  *grid_x = (yfeat + block_x - 1) / block_x;
  // Get number of SMs for efficient reduction.
  int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
  // can switch to occupancy calculation. use 4 below now for sm_70
  int max_blocks_y = num_SMs * 4 / (*grid_x);
  // block_y should be from minimal work per thread
  int nRedSplits = (batch_size + block_y - 1) / block_y;
  // increase number of elements per thread reduction to not launch more than enough;
  // the kernel adjusts its work, so here we just launch the max number of blocks
  *grid_y = std::min(nRedSplits, max_blocks_y);
  return;
}
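// Worked example with hypothetical sizes (illustrative only): yfeat = 1024,
// batch_size = 65536, block_x = 32, block_y = 128 on a device with 80 SMs gives
// grid_x = 1024/32 = 32, max_blocks_y = 80*4/32 = 10 and nRedSplits = 65536/128 = 512,
// so grid_y = min(512, 10) = 10, i.e. 10 partial sums per feature feed the
// deterministic two-pass reduction below.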
// Addition done deterministically via a 2-pass approach. Each CTA writes out partial
// sum, and the last CTA in grid Y dimension accumulates partials serially and writes to result.
template <typename T, int UNROLL_FACTOR>
__global__ void biasAdd_bprop(
    T* dY,
    int features,
    int batch_size,
    volatile float* intermediate,
    int* semaphores,
    T* db) {
  // The feature that this thread is responsible for
  int f = blockIdx.x * blockDim.x + threadIdx.x;

  // Compute the span this thread is responsible for
  // For this block
  int b_chunkSize = (batch_size + gridDim.y - 1) / gridDim.y;
  int b_nStart = blockIdx.y * b_chunkSize;
  int b_nSpan = min(batch_size, b_nStart + b_chunkSize) - b_nStart;

  // For this thread
  int chunkSize = (b_chunkSize + blockDim.y - 1) / blockDim.y;
  int nStart = threadIdx.y * chunkSize + b_nStart;
  int nSpan = min(b_nStart + b_nSpan, nStart + chunkSize) - nStart;
  volatile float* out = intermediate + blockIdx.y * features;

  // Flag to trigger last reduction.
  __shared__ bool isLastBlock;
  // we know block size for now
  __shared__ float smem[BIAS_RELU_BW_NTHREADS_X * BIAS_RELU_BW_NTHREADS_Y];

  // Accumulate db in FP32 always
  float db_local = 0;
  if (f < features) {
    int nidx = 0;
    // Handle non-multiple of UNROLL_FACTOR residue
    for (; nidx < nSpan % UNROLL_FACTOR; nidx++) {
      int row, col, flat_idx;
      row = f;
      col = nStart + nidx;
      flat_idx = col * features + row;
      db_local += (float)dY[flat_idx];
    }

    // Handle meat of work
    for (; (nidx + UNROLL_FACTOR - 1) < nSpan; nidx += UNROLL_FACTOR) {
      int row, col, flat_idx;
      row = f;
      col = nStart + nidx;
      flat_idx = col * features + row;
#pragma unroll 4
      for (int u = 0; u < UNROLL_FACTOR; u++) {
        db_local += (float)dY[flat_idx];
        flat_idx += features;
      }
    }

    // naive block reduction on y-dim
    int linear_idx = threadIdx.y * blockDim.x + threadIdx.x;
    smem[linear_idx] = db_local;
  }
  __syncthreads();
  if (f < features) {
    if (threadIdx.y == 0) {
      for (int yidx = 1; yidx < blockDim.y; yidx++) {
        db_local += smem[yidx * blockDim.x + threadIdx.x];
      }
      // block result is in db_local now for all threadIdx.y == 0
      // Write out partial result
      out[f] = db_local;
    }
  }
  __threadfence();
  __syncthreads();

  // Increment semaphore and check if this is the last CTA in the grid_y dimension.
  // Only thread (0,0) calls this
  if (threadIdx.x == 0 && threadIdx.y == 0 && f < features) {
    unsigned int sum_idx;
    sum_idx = atomicAdd(&(semaphores[blockIdx.x]), 1);
    isLastBlock = (sum_idx == (gridDim.y - 1));
  }
  __syncthreads();

  db_local = 0;
  // No block reduction for now, only thread (*,0) do grid reduction
  if (isLastBlock && f < features) {
    if (threadIdx.y == 0) {
      for (int n = 0; n < gridDim.y; n++) {
        int row, col;
        row = f;
        col = n;
        db_local += (float)(intermediate[col * features + row]);
      }
      db[f] = (T)db_local;
    }
  }
}
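// Determinism note: each CTA along grid y writes its FP32 partial bias gradient to
// `intermediate`, and the semaphore counter identifies the last CTA for each blockIdx.x,
// which then accumulates the gridDim.y partials serially in a fixed order. This avoids
// atomicAdd on the output, so the bias gradient is reproducible for a fixed launch
// configuration.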
// Addition done deterministically via a 2-pass approach. Each CTA writes out partial
// sum, and the last CTA in grid Y dimension accumulates partials serially and writes to result.
template <typename T, int UNROLL_FACTOR>
__global__ void biasAddRelu_bprop(
    T* Y,
    T* dY,
    int features,
    int batch_size,
    T* dX,
    volatile float* intermediate,
    int* semaphores,
    T* db) {
  // The feature that this thread is responsible for
  int f = blockIdx.x * blockDim.x + threadIdx.x;

  // Compute the span this thread is responsible for
  // For this block
  int b_chunkSize = (batch_size + gridDim.y - 1) / gridDim.y;
  int b_nStart = blockIdx.y * b_chunkSize;
  int b_nSpan = min(batch_size, b_nStart + b_chunkSize) - b_nStart;

  // For this thread
  int chunkSize = (b_chunkSize + blockDim.y - 1) / blockDim.y;
  int nStart = threadIdx.y * chunkSize + b_nStart;
  int nSpan = min(b_nStart + b_nSpan, nStart + chunkSize) - nStart;
  volatile float* out = intermediate + blockIdx.y * features;

  // Flag to trigger last reduction.
  __shared__ bool isLastBlock;
  // we know block size for now
  __shared__ float smem[BIAS_RELU_BW_NTHREADS_X * BIAS_RELU_BW_NTHREADS_Y];

  // Accumulate db in FP32 always
  float db_local = 0;
  if (f < features) {
    int nidx = 0;
    // Handle non-multiple of UNROLL_FACTOR residue
    for (; nidx < nSpan % UNROLL_FACTOR; nidx++) {
      int row, col, flat_idx;
      row = f;
      col = nStart + nidx;
      flat_idx = col * features + row;

      T y_val = Y[flat_idx];
      T dy_val = dY[flat_idx];
      T dx_val;
      if ((float)y_val > 0.f)
        dx_val = dy_val;
      else
        dx_val = 0;
      dX[flat_idx] = dx_val;
      db_local += (float)dx_val;
    }

    // Handle meat of work
    for (; (nidx + UNROLL_FACTOR - 1) < nSpan; nidx += UNROLL_FACTOR) {
      int row, col, flat_idx;
      row = f;
      col = nStart + nidx;
      flat_idx = col * features + row;
#pragma unroll 4
      for (int u = 0; u < UNROLL_FACTOR; u++) {
        T y_val = Y[flat_idx];
        T dy_val = dY[flat_idx];
        T dx_val;
        if ((float)y_val > 0.f)
          dx_val = dy_val;
        else
          dx_val = 0;
        dX[flat_idx] = dx_val;
        db_local += (float)dx_val;
        flat_idx += features;
      }
    }

    // naive block reduction on y-dim
    int linear_idx = threadIdx.y * blockDim.x + threadIdx.x;
    smem[linear_idx] = db_local;
  }
  __syncthreads();
  if (f < features) {
    if (threadIdx.y == 0) {
      for (int yidx = 1; yidx < blockDim.y; yidx++) {
        db_local += smem[yidx * blockDim.x + threadIdx.x];
      }
      // block result is in db_local now for all threadIdx.y == 0
      // Write out partial result
      out[f] = db_local;
    }
  }
  __threadfence();
  __syncthreads();

  // Increment semaphore and check if this is the last CTA in the grid_y dimension.
  // Only thread (0,0) calls this
  if (threadIdx.x == 0 && threadIdx.y == 0 && f < features) {
    unsigned int sum_idx;
    sum_idx = atomicAdd(&(semaphores[blockIdx.x]), 1);
    isLastBlock = (sum_idx == (gridDim.y - 1));
  }
  __syncthreads();

  db_local = 0;
  // No block reduction for now, only thread (*,0) do grid reduction
  if (isLastBlock && f < features) {
    if (threadIdx.y == 0) {
      for (int n = 0; n < gridDim.y; n++) {
        int row, col;
        row = f;
        col = n;
        db_local += (float)(intermediate[col * features + row]);
      }
      db[f] = (T)db_local;
    }
  }
}
// Addition done deterministically via a 2-pass approach. Each CTA writes out partial
// sum, and the last CTA in grid Y dimension accumulates partials serially and writes to result.
template <typename T, int UNROLL_FACTOR>
__global__ void biasAddRelu_bprop_aligned(
    T* Y,
    T* dY,
    int features,
    int batch_size,
    T* dX,
    volatile float* intermediate,
    int* semaphores,
    T* db) {
  // The feature that this thread is responsible for
  int f = blockIdx.x * blockDim.x + threadIdx.x;

  // Compute the span this thread is responsible for
  // For this block
  int b_chunkSize = (batch_size + gridDim.y - 1) / gridDim.y;
  int b_nStart = blockIdx.y * b_chunkSize;
  int b_nSpan = min(batch_size, b_nStart + b_chunkSize) - b_nStart;

  // For this thread
  int chunkSize = (b_chunkSize + blockDim.y - 1) / blockDim.y;
  int nStart = threadIdx.y * chunkSize + b_nStart;
  int nSpan = min(b_nStart + b_nSpan, nStart + chunkSize) - nStart;
  volatile float* out = intermediate + blockIdx.y * features;

  // Flag to trigger last reduction.
  __shared__ bool isLastBlock;

  // Accumulate db in FP32 always
  float db_local[ILP];
  T r_y[ILP];
  T r_dy[ILP];
#pragma unroll
  for (int ii = 0; ii < ILP; ii++) {
    db_local[ii] = 0.f;
  }

  // f always <= features in this case
  //if (f < features) {
  int nidx = 0;

  // Handle non-multiple of UNROLL_FACTOR residue
  for (; nidx < nSpan % UNROLL_FACTOR; nidx++) {
    int row, col, flat_idx;
    row = f;
    col = nStart + nidx;
    flat_idx = col * features / ILP + row;

    load_store(r_y, Y, 0, flat_idx);
    load_store(r_dy, dY, 0, flat_idx);
#pragma unroll
    for (int ii = 0; ii < ILP; ii++) {
      if ((float)r_y[ii] <= 0.f)
        r_dy[ii] = 0;
      db_local[ii] += (float)r_dy[ii];
    }
    load_store(dX, r_dy, flat_idx, 0);
  }

  // Handle meat of work
  for (; (nidx + UNROLL_FACTOR - 1) < nSpan; nidx += UNROLL_FACTOR) {
    int row, col, flat_idx;
    row = f;
    col = nStart + nidx;
    flat_idx = col * features / ILP + row; // total threads in x == features/ILP
#pragma unroll
    for (int u = 0; u < UNROLL_FACTOR; u++) {
      load_store(r_y, Y, 0, flat_idx);
      load_store(r_dy, dY, 0, flat_idx);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        if ((float)r_y[ii] <= 0.f)
          r_dy[ii] = 0;
        db_local[ii] += (float)r_dy[ii];
      }
      load_store(dX, r_dy, flat_idx, 0);
      flat_idx += features / ILP;
    }
  }

  // we know block size for now
  __shared__ float smem[BIAS_RELU_BW_NTHREADS_X * BIAS_RELU_BW_NTHREADS_Y * ILP];
  // naive block reduction on y-dim
  int linear_idx = threadIdx.y * blockDim.x + threadIdx.x;
  float* smem_out = smem + ILP * linear_idx;
#pragma unroll
  for (int ii = 0; ii < ILP; ii++) {
    smem_out[ii] = db_local[ii]; // reuse local dy buffer
  }
  __syncthreads();
  if (threadIdx.y == 0) {
    for (int yidx = 1; yidx < blockDim.y; yidx++) {
      float* smem_in = smem + ILP * (yidx * blockDim.x + threadIdx.x);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        db_local[ii] += smem_in[ii]; // reuse local dy buffer
      }
    }

    // block result is in db_local now for all threadIdx.y == 0
    // TODO: maybe not useful early exit here
    if (gridDim.y == 1) {
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        r_dy[ii] = db_local[ii]; // reuse local dy buffer
      }
      load_store(db, r_dy, f, 0);
      return;
    }

    // Write out partial result
    load_store(out, db_local, f, 0);
  }
  __threadfence();
  __syncthreads();

  // Increment semaphore and check if this is the last CTA in the grid_y dimension.
  // Only thread (0,0) calls this
  if (threadIdx.x == 0 && threadIdx.y == 0) {
    unsigned int sum_idx;
    sum_idx = atomicAdd(&(semaphores[blockIdx.x]), 1);
    isLastBlock = (sum_idx == (gridDim.y - 1));
  }
  __syncthreads();

#pragma unroll
  for (int ii = 0; ii < ILP; ii++) {
    db_local[ii] = 0.f;
  }
  float r_db[ILP];

  // No block reduction for now, only thread (*,0) do grid reduction
  if (isLastBlock) {
    if (threadIdx.y == 0) {
      for (int n = 0; n < gridDim.y; n++) {
        int row, col;
        row = f;
        col = n;
        load_store(r_db, intermediate, 0, col * features / ILP + row);
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          db_local[ii] += r_db[ii];
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        r_dy[ii] = db_local[ii]; // reuse local dy buffer
      }
      load_store(db, r_dy, f, 0);
    }
  }
}
// Lists where the num_layers-1 intermediate Y buffers start in reserved space on fprop, starting
// offset 0. The last Y value is, of course, stored in the user provided output buffer.
void get_y_offsets(
    int batch_size,
    int num_layers,
    const int* output_features,
    int* y_start_offsets) {
  y_start_offsets[0] = 0;
  for (int i = 1; i < num_layers; i++) {
    y_start_offsets[i] = y_start_offsets[i - 1] + batch_size * output_features[i - 1];
  }
}
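// Worked example with hypothetical sizes (illustrative only): for batch_size = 128 and
// output_features = {1024, 512, 10}, y_start_offsets becomes {0, 131072, 196608}
// (0; 0 + 128*1024; 131072 + 128*512), i.e. the element offset of each layer's Y buffer
// inside the reserved space.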
// Returns the reserved space (in elements) needed for the MLP
size_t get_mlp_reserved_space(int batch_size, int num_layers, const int* output_features) {
  size_t res_space = 0;
  // Need to store output of every intermediate MLP - size equal to output_features[i] * batch_size
  // for all 'i' in [0, num_layers-1)
  for (int l = 0; l < num_layers; l++) {
    res_space += output_features[l] * batch_size;
  }
  return res_space;
}

// Returns the size of all fprop activations combined
size_t get_all_activations_size(int batch_size, int num_layers, const int* output_features) {
  size_t acts_size = 0;
  for (int l = 0; l < num_layers; l++) {
    acts_size += output_features[l] * batch_size;
  }
  return acts_size;
}
#if 0
// Returns the work space (in elements) needed for the MLP bprop.
size_t get_mlp_bp_workspace (int batch_size, int num_layers, const int* output_features) {
/*
Workspace is partitioned as
DY_GEMMs : DX_GEMMs
*/
size_t work_space = 0;
// Store each intermediate dY explicitly. Need 2 dYs per MLP layer (one for o/p
// of biasReLU_bp and one for o/p of dgrad GEMM).
work_space += 2*get_all_activations_size(batch_size, num_layers, output_features);
return work_space;
}
#endif
// Scratch space needed for reductions in number of elements
size_t get_reduction_scratch_space(int batch_size, int num_layers, const int* output_features) {
  size_t max_scratch_space = 0;
  // Loop over all layers to see which one needs the max scratch space
  for (int l = 0; l < num_layers; l++) {
    // need to find max(aligned, not_aligned)
    int tmp, res0, res1;

    int block_x = BIAS_RELU_BW_NTHREADS_X;
    int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
    get_biasAddRelu_bprop_grid_size(output_features[l], batch_size, block_x, block_y, &tmp, &res0);

    block_x = ILP * BIAS_RELU_BW_NTHREADS_X;
    get_biasAddRelu_bprop_grid_size(output_features[l], batch_size, block_x, block_y, &tmp, &res1);

    max_scratch_space = std::max(max_scratch_space, (size_t)(output_features[l] * res0));
    max_scratch_space = std::max(max_scratch_space, (size_t)(output_features[l] * res1));
  }

  return max_scratch_space;
}
// Buffer for semaphores
size_t get_semaphores_size(int num_layers, const int* output_features) {
  // Upper bound on semaphores is one per feature for the layer
  // with the most features.
  int max_features = 0;
  for (int l = 0; l < num_layers; l++) {
    max_features = std::max(max_features, output_features[l]);
  }
  return (size_t)max_features;
}
// Returns the work space (in bytes) needed for the MLP bprop.
template <typename T>
size_t get_mlp_bp_workspace_in_bytes(int batch_size, int num_layers, const int* output_features) {
  size_t work_space = 0;

  // Store each intermediate dY explicitly. Need 2 dYs per MLP layer (one for o/p
  // of biasReLU_bp and one for o/p of dgrad GEMM).
  work_space += 2 * get_all_activations_size(batch_size, num_layers, output_features) * sizeof(T);
  work_space += get_reduction_scratch_space(batch_size, num_layers, output_features) * sizeof(float);
  work_space += get_semaphores_size(num_layers, output_features) * sizeof(int);

  return work_space;
}
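// Worked example with hypothetical sizes (illustrative only): for T = at::Half,
// batch_size = 128 and output_features = {1024, 512, 10}, the activation term is
// 2 * 128*(1024+512+10) * sizeof(at::Half) = 2 * 197888 * 2 = 791552 bytes; the reduction
// scratch term depends on the grid_y chosen for the current device, and the semaphore
// term adds max_features * sizeof(int) = 1024 * 4 = 4096 bytes.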
// Returns pointers to each segment of the workspace
template <typename T>
void partition_mlp_bp_workspace(
    int batch_size,
    int num_layers,
    const int* output_features,
    void* work_space,
    T** dy_gemms,
    T** dx_gemms,
    float** db_scratch,
    int** semaphores) {
  /*
     Workspace is partitioned as
     DY_GEMMs : DX_GEMMs : DB_SCRATCH : SEMAPHORES
  */
  // Start address where dy_gemm tensors are stored
  *dy_gemms = reinterpret_cast<T*>(work_space);
  // Start address where dx_gemm tensors are stored
  *dx_gemms = *dy_gemms + get_all_activations_size(batch_size, num_layers, output_features);
  // Start address where db intermediate tensors are stored
  *db_scratch = reinterpret_cast<float*>(
      *dx_gemms + get_all_activations_size(batch_size, num_layers, output_features));
  // Start address of semaphores
  *semaphores = reinterpret_cast<int*>(
      *db_scratch + get_reduction_scratch_space(batch_size, num_layers, output_features));

  return;
}
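// Layout sketch for the partition above (element counts, not bytes):
//   [ DY_GEMM buffers | DX_GEMM buffers | DB_SCRATCH (float) | SEMAPHORES (int) ]
//       A elements        A elements        R elements           S elements
// where A = get_all_activations_size(...), R = get_reduction_scratch_space(...) and
// S = get_semaphores_size(...); the byte total matches get_mlp_bp_workspace_in_bytes<T>.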
// Does a simple MLP fprop (GEMM+bias+ReLU).
// Can handle num_layers number of layers, each with its own shape. Output of layer i is assumed
// to be input of layer i+1. output_features, WPtr and BPtr are arrays of length num_layers, and
// must be in the same order i.e. WPtr[i] and BPtr[i] are respectively the weight and bias of layer
// 'i'.
template <typename T>
int mlp_fp(
    T* X,
    int input_features,
    int batch_size,
    T** WPtr,
    int num_layers,
    int* output_features,
    T** BPtr,
    T* Y,
    T* reserved_space,
    int use_bias,
    int activation) {
  T *weight, *input, *output, *bias;
  T *reserved_space_x, *reserved_space_y;
  reserved_space_x = NULL;
  reserved_space_y = reserved_space;

  // Get cublas handle from Pytorch
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  // Get the stream from cublas handle to reuse for biasReLU kernel.
  cudaStream_t stream;
  cublasGetStream(handle, &stream);

  for (int layer = 0; layer < num_layers; layer++) {
    weight = WPtr[layer];
    input = (layer == 0) ? X : reserved_space_x;
    output = (layer == num_layers - 1) ? Y : reserved_space_y;
    if (use_bias) {
      bias = BPtr[layer];
    }
    int ifeat = (layer == 0) ? input_features : output_features[layer - 1];
    int ofeat = output_features[layer];

    float one = 1.f;
    float zero = 0.f;

    cublasStatus_t cublas_status;
    // Call GEMM: fprop is Y = W'X
    cublas_status = mlp_gemm(
        handle, CUBLAS_OP_T, CUBLAS_OP_N,
        ofeat, batch_size, ifeat,
        &one, weight, ifeat, input, ifeat,
        &zero, output, ofeat);

    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
      printf("GEMM fprop failed with %d\n", cublas_status);
      return 1;
    }

    const uint& input_size = ofeat;
    int num_blocks = 0;
    int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
    // Call biasReLU
    if (use_bias == 1) {
      if (activation == 0) { // no activation
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &num_blocks, biasAdd_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
        biasAdd_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(
            output, bias, batch_size, input_size);
      } else if (activation == 1) { // relu
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &num_blocks, biasAddRelu_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
        biasAddRelu_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(
            output, bias, batch_size, input_size);
      } else if (activation == 2) { // sigmoid
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &num_blocks, biasAdd_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
        biasAdd_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(
            output, bias, batch_size, input_size);
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &num_blocks, Sigmoid_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
        Sigmoid_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(
            output, batch_size, input_size);
      }
    } else {
      // don't need to do anything in case of no activation and no bias
      if (activation == 1) { // relu
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &num_blocks, Relu_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
        Relu_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(
            output, batch_size, input_size);
      } else if (activation == 2) { // sigmoid
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &num_blocks, Sigmoid_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
        Sigmoid_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(
            output, batch_size, input_size);
      }
    }

    // Set current output as next layer input
    reserved_space_x = reserved_space_y;
    // Set next layer output
    reserved_space_y += ofeat * batch_size;
  }

  return 0;
}
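// Illustrative call sketch (hypothetical names and sizes, host side; W and B are host
// arrays of device pointers, all other pointers are device buffers of matching type):
//   int out_feat[2] = {512, 10};                       // two layers
//   size_t res_elems = get_mlp_reserved_space(batch, 2, out_feat);
//   mlp_fp<at::Half>(X, in_feat, batch, W, 2, out_feat, B, Y, reserved,
//                    /*use_bias=*/1, /*activation=*/1 /* 0=none, 1=relu, 2=sigmoid */);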
// Does a simple MLP bprop (GEMM+bias+ReLU).
// Needs reserved space to come back exactly as it was populated in fprop.
// Does dgrad and wgrad sequentially.
template <typename T>
int mlp_bp(
    T* X,
    T* Y,
    int input_features,
    int batch_size,
    T** WPtr,
    int num_layers,
    int* output_features,
    T* dY,
    T* reserved_space,
    T* work_space,
    T* dX,
    T** dwPtr,
    T** dbPtr,
    bool requires_grad,
    int use_bias,
    int activation) {
  T* weight;
  T *dweight, *dx, *dy, *dbias;
  T *x, *y;

  // Where the dx of the biasReLU (== dy of gemm) is stored. Can be thrown away
  // after bp call.
  T* dy_gemm_base;
  // Where the dx after GEMM is stored.
  T* dx_gemm_base;
  // Where partial reduction results are stored.
  float* db_scratch;
  // Semaphores for reduction.
  int* semaphores;

  partition_mlp_bp_workspace<T>(
      batch_size,
      num_layers,
      output_features,
      work_space,
      &dy_gemm_base,
      &dx_gemm_base,
      &db_scratch,
      &semaphores);

  size_t semaphore_size = get_semaphores_size(num_layers, output_features) * sizeof(int);

  // Get cublas handle from Pytorch
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  // Get the stream from cublas handle to reuse for biasReLU kernel.
  cudaStream_t stream;
  cublasGetStream(handle, &stream);

  int* y_offsets = (int*)malloc(num_layers * sizeof(int));
  get_y_offsets(batch_size, num_layers, output_features, y_offsets);

  for (int layer = num_layers - 1; layer >= 0; layer--) {
    weight = WPtr[layer];
    dweight = dwPtr[layer];

    // x is read from reserved space
    x = (layer == 0) ? X : reserved_space + y_offsets[layer - 1];
    // dx is written in workspace for all but layer==0
    dx = (layer == 0) ? dX : dx_gemm_base + y_offsets[layer - 1];

    // y is read from reserved space
    y = (layer == num_layers - 1) ? Y : reserved_space + y_offsets[layer];
    // dx from layer+1
    dy = (layer == num_layers - 1) ? dY : dx_gemm_base + y_offsets[layer];
    // dy_gemm is written to and read immediately
    T* dy_gemm = dy_gemm_base + y_offsets[layer];

    dbias = dbPtr[layer];
    int xfeat = (layer == 0) ? input_features : output_features[layer - 1];
    int yfeat = output_features[layer];

    float one = 1.f;
    float zero = 0.f;

    if (use_bias == 1) {
      if (activation == 0) { // no activation
        // bgrad
        dim3 block(BIAS_RELU_BW_NTHREADS_X, BIAS_RELU_BW_NTHREADS_Y);
        int grid_x, grid_y;
        cudaMemsetAsync(semaphores, 0, semaphore_size, stream);

        int block_x = BIAS_RELU_BW_NTHREADS_X;
        int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
        get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
        dim3 grid(grid_x, grid_y);
        biasAdd_bprop<T, 4><<<grid, block, 0, stream>>>(
            dy, yfeat, batch_size, db_scratch, semaphores, dbias);
        // bypass dgrad through reset pointer
        dy_gemm = dy;
      } else if (activation == 1) { // relu
        dim3 block(BIAS_RELU_BW_NTHREADS_X, BIAS_RELU_BW_NTHREADS_Y);
        int grid_x, grid_y;
        cudaMemsetAsync(semaphores, 0, semaphore_size, stream);

        if (yfeat % (ILP * BIAS_RELU_BW_NTHREADS_X) == 0 &&
            is_aligned(y) &&
            is_aligned(dy) &&
            is_aligned(dy_gemm) &&
            is_aligned(dbias)) {
          int block_x = ILP * BIAS_RELU_BW_NTHREADS_X;
          int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
          get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
          dim3 grid(grid_x, grid_y);
          biasAddRelu_bprop_aligned<T, 4><<<grid, block, 0, stream>>>(
              y, dy, yfeat, batch_size, dy_gemm, db_scratch, semaphores, dbias);
        } else {
          int block_x = BIAS_RELU_BW_NTHREADS_X;
          int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
          get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
          dim3 grid(grid_x, grid_y);
          biasAddRelu_bprop<T, 4><<<grid, block, 0, stream>>>(
              y, dy, yfeat, batch_size, dy_gemm, db_scratch, semaphores, dbias);
        }
      } else if (activation == 2) { // sigmoid
        // activation backward
        int num_blocks = 0;
        int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &num_blocks, Sigmoid_bprop<T>, BIAS_RELU_FW_NTHREADS, 0);
        Sigmoid_bprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(
            dy, y, batch_size, yfeat, dy_gemm);

        // bgrad, from dy_gemm
        dim3 block(BIAS_RELU_BW_NTHREADS_X, BIAS_RELU_BW_NTHREADS_Y);
        int grid_x, grid_y;
        cudaMemsetAsync(semaphores, 0, semaphore_size, stream);

        int block_x = BIAS_RELU_BW_NTHREADS_X;
        int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
        get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
        dim3 grid(grid_x, grid_y);
        biasAdd_bprop<T, 4><<<grid, block, 0, stream>>>(
            dy_gemm, yfeat, batch_size, db_scratch, semaphores, dbias);
      }
    } else { // no bias below
      if (activation == 0) {
        // bypass dgrad through reset pointer
        dy_gemm = dy;
      } else if (activation == 1) { // relu
        int num_blocks = 0;
        int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &num_blocks, Relu_bprop<T>, BIAS_RELU_FW_NTHREADS, 0);
        Relu_bprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(
            dy, y, batch_size, yfeat, dy_gemm);
      } else if (activation == 2) { // sigmoid
        int num_blocks = 0;
        int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &num_blocks, Sigmoid_bprop<T>, BIAS_RELU_FW_NTHREADS, 0);
        Sigmoid_bprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(
            dy, y, batch_size, yfeat, dy_gemm);
      }
    }

    cublasStatus_t cublas_status;
    // Call GEMM dgrad
    if (layer > 0 || requires_grad == 1) {
      cublas_status = mlp_gemm(
          handle, CUBLAS_OP_N, CUBLAS_OP_N,
          xfeat, batch_size, yfeat,
          &one, weight, xfeat, dy_gemm, yfeat,
          &zero, dx, xfeat);

      if (cublas_status != CUBLAS_STATUS_SUCCESS) {
        printf("GEMM dgrad failed with %d\n", cublas_status);
        return 1;
      }
    }

    // Call GEMM wgrad
    cublas_status = mlp_gemm(
        handle, CUBLAS_OP_N, CUBLAS_OP_T,
        xfeat, yfeat, batch_size,
        &one, x, xfeat, dy_gemm, yfeat,
        &zero, dweight, xfeat);

    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
      printf("GEMM wgrad failed with %d\n", cublas_status);
      return 1;
    }
  }

  return 0;
}
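// Per-layer backward data flow above, walking layers from last to first:
//   dy (grad of layer output) -> fused bias/activation backward kernel -> dy_gemm (+ dbias)
//   dy_gemm -> dgrad GEMM (weight * dy_gemm)  -> dx      (skipped for layer 0 when
//                                                          requires_grad is false)
//   dy_gemm -> wgrad GEMM (x * dy_gemm^T)     -> dweight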
// Instantiate for floating point types
template int mlp_fp<float>(float* X, int input_features, int batch_size, float** WPtr,
    int num_layers, int* output_features, float** BPtr, float* Y, float* reserved_space,
    int use_bias, int activation);

template int mlp_bp<float>(float* X, float* Y, int input_features, int batch_size,
    float** WPtr, int num_layers, int* output_features, float* dY, float* reserved_space,
    float* work_space, float* dX, float** dwPtr, float** dbPtr, bool requires_grad,
    int use_bias, int activation);

template int mlp_fp<at::Half>(at::Half* X, int input_features, int batch_size,
    at::Half** WPtr, int num_layers, int* output_features, at::Half** BPtr, at::Half* Y,
    at::Half* reserved_space, int use_bias, int activation);

template int mlp_bp<at::Half>(at::Half* X, at::Half* Y, int input_features, int batch_size,
    at::Half** WPtr, int num_layers, int* output_features, at::Half* dY,
    at::Half* reserved_space, at::Half* work_space, at::Half* dX, at::Half** dwPtr,
    at::Half** dbPtr, bool requires_grad, int use_bias, int activation);

template int mlp_fp<double>(double* X, int input_features, int batch_size, double** WPtr,
    int num_layers, int* output_features, double** BPtr, double* Y, double* reserved_space,
    int use_bias, int activation);

template int mlp_bp<double>(double* X, double* Y, int input_features, int batch_size,
    double** WPtr, int num_layers, int* output_features, double* dY, double* reserved_space,
    double* work_space, double* dX, double** dwPtr, double** dbPtr, bool requires_grad,
    int use_bias, int activation);

template size_t get_mlp_bp_workspace_in_bytes<float>(
    int batch_size, int num_layers, const int* output_features);
template size_t get_mlp_bp_workspace_in_bytes<at::Half>(
    int batch_size, int num_layers, const int* output_features);
template size_t get_mlp_bp_workspace_in_bytes<double>(
    int batch_size, int num_layers, const int* output_features);
csrc/multi_tensor_axpby_kernel.cu
View file @
0bfb8300
@@ -13,6 +13,17 @@
 #define BLOCK_SIZE 512
 #define ILP 4
 
+template<typename T>
+__device__ __forceinline__ bool is_aligned(T* p){
+  return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
+}
+
+template<typename T>
+__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
+  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
+  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
+}
+
 template<typename x_t, typename y_t, typename out_t>
 struct AxpbyFunctor
 {
@@ -43,46 +54,74 @@ struct AxpbyFunctor
     n -= chunk_idx*chunk_size;
 
+    bool finite = true;
+    x_t r_x[ILP];
+    y_t r_y[ILP];
+    out_t r_out[ILP];
+
+    // to make things simple, we put aligned case in a different code path
+    if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x) && is_aligned(y) && is_aligned(out))
+    {
+      for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
+      {
+        // load
+        load_store(r_x, x, 0, i_start);
+        load_store(r_y, y, 0, i_start);
+#pragma unroll
+        for(int ii = 0; ii < ILP; ii++)
+        {
+          r_out[ii] = a*static_cast<float>(r_x[ii]) + b*static_cast<float>(r_y[ii]);
+          if(arg_to_check == -1)
+            finite = finite && (isfinite(r_x[ii]) && isfinite(r_y[ii]));
+          if(arg_to_check == 0)
+            finite = finite && isfinite(r_x[ii]);
+          if(arg_to_check == 1)
+            finite = finite && isfinite(r_y[ii]);
+        }
+        // store
+        load_store(out, r_out, i_start, 0);
+      }
+    }
+    else
+    {
       // Non-divergent exit condition for __syncthreads, not necessary here
-      float xs[ILP];
-      float ys[ILP];
       for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
       {
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
-          xs[ii] = 0;
-          ys[ii] = 0;
+          r_x[ii] = 0;
+          r_y[ii] = 0;
           int i = i_start + threadIdx.x + ii*blockDim.x;
           if(i < n && i < chunk_size)
           {
-            xs[ii] = static_cast<float>(x[i]);
-            ys[ii] = static_cast<float>(y[i]);
+            r_x[ii] = x[i];
+            r_y[ii] = y[i];
           }
         }
+
+        // see note in multi_tensor_scale_kernel.cu
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
-          int i = i_start + threadIdx.x + ii*blockDim.x;
-          if(i < n && i < chunk_size)
-          {
-            out[i] = static_cast<out_t>(a*xs[ii] + b*ys[ii]);
-            bool finite = true;
-            if(arg_to_check == -1)
-              finite = (isfinite(xs[ii]) && isfinite(ys[ii]));
-            if(arg_to_check == 0)
-              finite = isfinite(xs[ii]);
-            if(arg_to_check == 1)
-              finite = isfinite(ys[ii]);
-            if(!finite)
-              *noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
-          }
+          r_out[ii] = a*static_cast<float>(r_x[ii]) + b*static_cast<float>(r_y[ii]);
+          if(arg_to_check == -1)
+            finite = finite && (isfinite(r_x[ii]) && isfinite(r_y[ii]));
+          if(arg_to_check == 0)
+            finite = finite && isfinite(r_x[ii]);
+          if(arg_to_check == 1)
+            finite = finite && isfinite(r_y[ii]);
+        }
+
+        // see note in multi_tensor_scale_kernel.cu
+#pragma unroll
+        for(int ii = 0; ii < ILP; ii++)
+        {
+          int i = i_start + threadIdx.x + ii*blockDim.x;
+          if(i < n && i < chunk_size)
+            out[i] = r_out[ii];
         }
       }
     }
+    if(!finite)
+      *noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
   }
 };