Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
872aa413
Commit
872aa413
authored
May 13, 2022
by
BoxiangW
Committed by
binmakeswell
May 17, 2022
Browse files
[NFC] Polish colossalai/kernel/cuda_native/csrc/multi_tensor_lamb.cu code style. (#937)
parent
58580b50
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
25 additions
and
25 deletions
+25
-25
colossalai/kernel/cuda_native/csrc/multi_tensor_lamb.cu
colossalai/kernel/cuda_native/csrc/multi_tensor_lamb.cu
+25
-25
No files found.
colossalai/kernel/cuda_native/csrc/multi_tensor_lamb.cu
View file @
872aa413
...
...
@@ -15,7 +15,8 @@
#define BLOCK_SIZE 512
#define ILP 4
template
<
typename
T
>
__device__
__forceinline__
bool
is_aligned
(
T
*
p
)
{
template
<
typename
T
>
__device__
__forceinline__
bool
is_aligned
(
T
*
p
)
{
return
((
uint64_t
)
p
)
%
(
ILP
*
sizeof
(
T
))
==
0
;
}
...
...
@@ -28,24 +29,25 @@ __device__ __forceinline__ void load_store(T *dst, T *src, int dst_offset,
}
typedef
enum
{
MOMENT_MODE_0
=
0
,
// L2 regularization mode
MOMENT_MODE_1
=
1
// Decoupled weight decay mode
MOMENT_MODE_0
=
0
,
// L2 regularization mode
MOMENT_MODE_1
=
1
// Decoupled weight decay mode
}
adamMode_t
;
std
::
tuple
<
at
::
Tensor
,
at
::
Tensor
>
multi_tensor_l2norm_cuda
(
int
chunk_size
,
at
::
Tensor
noop_flag
,
std
::
vector
<
std
::
vector
<
at
::
Tensor
>>
tensor_lists
,
at
::
optional
<
bool
>
per_tensor_python
);
std
::
tuple
<
at
::
Tensor
,
at
::
Tensor
>
multi_tensor_l2norm_cuda
(
int
chunk_size
,
at
::
Tensor
noop_flag
,
std
::
vector
<
std
::
vector
<
at
::
Tensor
>>
tensor_lists
,
at
::
optional
<
bool
>
per_tensor_python
);
using
MATH_T
=
float
;
template
<
typename
T
>
struct
LAMBStage1Functor
{
__device__
__forceinline__
void
operator
()(
int
chunk_size
,
volatile
int
*
noop_gmem
,
TensorListMetadata
<
4
>
&
tl
,
const
float
beta1
,
const
float
beta2
,
const
float
beta3
,
const
float
beta1_correction
,
const
float
beta2_correction
,
const
float
epsilon
,
adamMode_t
mode
,
const
float
decay
,
const
float
*
global_grad_norm
,
const
float
max_global_grad_norm
)
{
template
<
typename
T
>
struct
LAMBStage1Functor
{
__device__
__forceinline__
void
operator
()(
int
chunk_size
,
volatile
int
*
noop_gmem
,
TensorListMetadata
<
4
>
&
tl
,
const
float
beta1
,
const
float
beta2
,
const
float
beta3
,
const
float
beta1_correction
,
const
float
beta2_correction
,
const
float
epsilon
,
adamMode_t
mode
,
const
float
decay
,
const
float
*
global_grad_norm
,
const
float
max_global_grad_norm
)
{
// I'd like this kernel to propagate infs/nans.
// if(*noop_gmem == 1)
// return;
...
...
@@ -89,8 +91,7 @@ template <typename T> struct LAMBStage1Functor {
i_start
+=
blockDim
.
x
)
{
// load
load_store
(
l_g
,
g
,
0
,
i_start
);
if
(
decay
!=
0
)
load_store
(
l_p
,
p
,
0
,
i_start
);
if
(
decay
!=
0
)
load_store
(
l_p
,
p
,
0
,
i_start
);
load_store
(
l_m
,
m
,
0
,
i_start
);
load_store
(
l_v
,
v
,
0
,
i_start
);
// unpack
...
...
@@ -204,12 +205,12 @@ template <typename T> struct LAMBStage1Functor {
// Step 2 reads in 'update' value and per-tensor param_norm and update_norm.
// It computes new parameter value.
template
<
typename
T
>
struct
LAMBStage2Functor
{
__device__
__forceinline__
void
operator
()(
int
chunk_size
,
volatile
int
*
noop_gmem
,
TensorListMetadata
<
2
>
&
tl
,
const
float
*
per_tensor_param_norm
,
const
float
*
per_tensor_update_norm
,
const
float
learning_rate
,
const
float
decay
,
bool
use_nvlamb
)
{
template
<
typename
T
>
struct
LAMBStage2Functor
{
__device__
__forceinline__
void
operator
()(
int
chunk_size
,
volatile
int
*
noop_gmem
,
TensorListMetadata
<
2
>
&
tl
,
const
float
*
per_tensor_param_norm
,
const
float
*
per_tensor_update_norm
,
const
float
learning_rate
,
const
float
decay
,
bool
use_nvlamb
)
{
// I'd like this kernel to propagate infs/nans.
// if(*noop_gmem == 1)
// return;
...
...
@@ -310,8 +311,7 @@ void multi_tensor_lamb_cuda(int chunk_size, at::Tensor noop_flag,
// Handle grad averaging mode
float
beta3
=
1.0
f
;
if
(
grad_averaging
==
1
)
beta3
=
1
-
beta1
;
if
(
grad_averaging
==
1
)
beta3
=
1
-
beta1
;
std
::
vector
<
std
::
vector
<
at
::
Tensor
>>
grad_list
(
tensor_lists
.
begin
(),
tensor_lists
.
begin
()
+
1
);
...
...
@@ -330,7 +330,7 @@ void multi_tensor_lamb_cuda(int chunk_size, at::Tensor noop_flag,
tensor_lists
[
0
][
0
].
scalar_type
(),
0
,
"lamb_stage_1"
,
multi_tensor_apply
<
4
>
(
BLOCK_SIZE
,
chunk_size
,
noop_flag
,
tensor_lists
,
LAMBStage1Functor
<
scalar_t_0
>
(),
beta1
,
beta2
,
beta3
,
// 1-beta1 or 1 depends on averaging mode
beta3
,
// 1-beta1 or 1 depends on averaging mode
bias_correction1
,
bias_correction2
,
epsilon
,
(
adamMode_t
)
mode
,
weight_decay
,
global_grad_norm
.
DATA_PTR
<
float
>
(),
max_grad_norm
);)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment