OpenDAS / apex / Commits / 6e14df49

Commit 6e14df49, authored May 08, 2020 by rohithkrn

    Merge remote-tracking branch 'rocm_up/master' into apex_amp_bfp16

Parents: c7fd532c, 2d0f9cf2

Showing 14 changed files with 1438 additions and 376 deletions (+1438, -376)
apex/amp/scaler.py                                         +12   -3
apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu     +82   -31
apex/contrib/csrc/xentropy/xentropy_kernel.cu              +178  -67
apex/contrib/multihead_attn/self_multihead_attn_func.py    +1    -1
apex/mlp/mlp.py                                            +28   -19
csrc/layer_norm_cuda_kernel.cu                             +10   -4
csrc/mlp.cpp                                               +41   -16
csrc/mlp_cuda.cu                                           +593  -100
csrc/multi_tensor_axpby_kernel.cu                          +66   -27
csrc/multi_tensor_l2norm_kernel.cu                         +67   -12
csrc/multi_tensor_lamb.cu                                  +183  -70
csrc/multi_tensor_scale_kernel.cu                          +60   -25
setup.py                                                   +7    -1
tests/L0/run_mlp/test_mlp.py                               +110  -0
apex/amp/scaler.py (+12 -3)

@@ -6,12 +6,18 @@ from itertools import product

 def scale_check_overflow_python(model_grad, master_grad, scale, check_overflow=False):
     # Exception handling for 18.04 compatibility
     if check_overflow:
-        cpu_sum = float(model_grad.float().sum())
+        if model_grad.is_sparse:
+            cpu_sum = float(model_grad.float()._values().sum())
+        else:
+            cpu_sum = float(model_grad.float().sum())
         if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
             return True
     if master_grad is not model_grad: # copy_ probably internally short-circuits this
-        master_grad.copy_(model_grad)
+        if model_grad.is_sparse:
+            master_grad.copy_(model_grad.to_dense())
+        else:
+            master_grad.copy_(model_grad)
     if scale != 1.0:
         master_grad.mul_(scale)
     return False

@@ -19,7 +25,10 @@ def scale_check_overflow_python(model_grad, master_grad, scale, check_overflow=F

 def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, a, b, check_overflow=False):
     # Exception handling for 18.04 compatibility
     if check_overflow:
-        cpu_sum = float(model_grad.float().sum())
+        if model_grad.is_sparse:
+            cpu_sum = float(model_grad.float()._values().sum())
+        else:
+            cpu_sum = float(model_grad.float().sum())
         if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
             return True
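Note: both hunks apply the same two tricks. Sparse gradients are summed through their backing _values() tensor rather than a dense view, and NaN is caught by the self-inequality test cpu_sum != cpu_sum. A minimal standalone sketch of that check (tensor values invented for illustration):

    import torch

    def has_overflow(grad):
        # Sparse grads expose their nonzeros via _values(); materializing
        # to_dense() just to take a sum would be wasteful.
        if grad.is_sparse:
            cpu_sum = float(grad.float()._values().sum())
        else:
            cpu_sum = float(grad.float().sum())
        # NaN is the only float that compares unequal to itself.
        return cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum

    print(has_overflow(torch.tensor([1.0, float('inf')])))        # True
    print(has_overflow(torch.tensor([[0.0, 2.0]]).to_sparse()))   # False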
apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu (+82 -31)

@@ -14,6 +14,17 @@
 #define BLOCK_SIZE 512
 #define ILP 4

+template<typename T>
+__device__ __forceinline__ bool is_aligned(T* p){
+  return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
+}
+
+template<typename T>
+__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
+  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
+  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
+}
+
 #include "type_shim.h"

 typedef enum{
@@ -99,24 +110,64 @@ struct AdamFunctor
         T incoming_v[ILP];
         T incoming_g[ILP];

-        for(int i_start = 0;
-            i_start < n && i_start < chunk_size;
-            i_start += blockDim.x*ILP) {
+        // to make things simple, we put aligned case in a different code path
+        if(n % ILP == 0 && chunk_size % ILP == 0 &&
+           is_aligned(p) && is_aligned(m) && is_aligned(v) && is_aligned(g) && is_aligned(p_copy)) {
+          for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x) {
+            // load
+            GRAD_T tmp_g[ILP];
+            load_store(incoming_p, p, 0, i_start);
+            load_store(incoming_m, m, 0, i_start);
+            load_store(incoming_v, v, 0, i_start);
+            load_store(tmp_g, g, 0, i_start);
+#pragma unroll
+            for(int ii = 0; ii < ILP; ii++) {
+              incoming_g[ii] = static_cast<T>(tmp_g[ii]);
+              T scaled_grad = incoming_g[ii]/grad_scale;
+              incoming_m[ii] = b1*incoming_m[ii] + (1-b1)*scaled_grad;
+              incoming_v[ii] = b2*incoming_v[ii] + (1-b2)*scaled_grad*scaled_grad;
+              float denom;
+              if (mode == ADAM_MODE_0)
+                denom = sqrtf(incoming_v[ii] + eps);
+              else // Mode 1
+                denom = sqrtf(incoming_v[ii]) + eps;
+              float update = (incoming_m[ii]/denom) + (decay*incoming_p[ii]);
+              incoming_p[ii] = incoming_p[ii] - (step_size*update);
+              if (DEPTH == 5)  tmp_g[ii] = static_cast<GRAD_T>(incoming_p[ii]);
+            }
+            load_store(p, incoming_p, i_start, 0);
+            load_store(m, incoming_m, i_start, 0);
+            load_store(v, incoming_v, i_start, 0);
+            if (DEPTH == 5) load_store(p_copy, tmp_g, i_start, 0);
+          }
+        }
+        else {
+          for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP) {
 #pragma unroll
             for(int ii = 0; ii < ILP; ii++) {
                 incoming_p[ii] = 0;
                 incoming_m[ii] = 0;
                 incoming_v[ii] = 0;
                 incoming_g[ii] = 0;

                 int i = i_start + threadIdx.x + ii*blockDim.x;
                 if (i < n && i < chunk_size) {
                     incoming_p[ii] = p[i];
                     incoming_m[ii] = m[i];
                     incoming_v[ii] = v[i];
                     incoming_g[ii] = static_cast<T>(g[i]);
                 }
             }

             // note for clarification to future michael:
@@ -124,24 +175,25 @@ struct AdamFunctor
             // the write loop, since writes just fire off once their LDGs arrive.
             // Put another way, the STGs are dependent on the LDGs, but not on each other.
             // There is still compute ILP benefit from unrolling the loop though.
 #pragma unroll
             for(int ii = 0; ii < ILP; ii++) {
                 int j = i_start + threadIdx.x + ii*blockDim.x;
                 if (j < n && j < chunk_size) {
                     T scaled_grad = incoming_g[ii]/grad_scale;
                     m[j] = b1*incoming_m[ii] + (1-b1)*scaled_grad;
                     v[j] = b2*incoming_v[ii] + (1-b2)*scaled_grad*scaled_grad;
                     float denom;
                     if (mode == ADAM_MODE_0)
                         denom = sqrtf(v[j] + eps);
                     else // Mode 1
                         denom = sqrtf(v[j]) + eps;
                     float update = (m[j]/denom) + (decay*incoming_p[ii]);
                     p[j] = incoming_p[ii] - (step_size*update);
                     if (DEPTH == 5)  p_copy[j] = (GRAD_T) p[j];
                 }
             }
+          }
         }
     }
 };
@@ -332,4 +384,3 @@ void fused_adam_cuda_mt(
   }
   THCudaCheck(cudaGetLastError());
 }
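Note: the two ADAM_MODE branches above differ only in where eps enters the denominator. A plain-Python restatement of one element's update (all values invented), useful as a reference when checking the kernel:

    import math

    def adam_update(p, m, v, g, grad_scale, b1, b2, eps, step_size, decay, mode):
        # Mirrors one lane of the AdamFunctor inner loop.
        scaled_grad = g / grad_scale
        m = b1 * m + (1 - b1) * scaled_grad
        v = b2 * v + (1 - b2) * scaled_grad * scaled_grad
        # Mode 0 puts eps under the square root, mode 1 outside it.
        denom = math.sqrt(v + eps) if mode == 0 else math.sqrt(v) + eps
        update = m / denom + decay * p
        return p - step_size * update, m, v

    print(adam_update(1.0, 0.0, 0.0, 0.5, 1.0, 0.9, 0.999, 1e-8, 1e-3, 0.0, mode=0))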
apex/contrib/csrc/xentropy/xentropy_kernel.cu (+178 -67)

 /**
  * From PyTorch:
  *
  * Copyright (c) 2016-     Facebook, Inc            (Adam Paszke)
  * Copyright (c) 2014-     Facebook, Inc            (Soumith Chintala)
  * Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
@@ -10,54 +10,54 @@
  * Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
  * Copyright (c) 2006      Idiap Research Institute (Samy Bengio)
  * Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
  *
  * From Caffe2:
  *
  * Copyright (c) 2016-present, Facebook Inc. All rights reserved.
  *
  * All contributions by Facebook:
  * Copyright (c) 2016 Facebook Inc.
  *
  * All contributions by Google:
  * Copyright (c) 2015 Google Inc.
  * All rights reserved.
  *
  * All contributions by Yangqing Jia:
  * Copyright (c) 2015 Yangqing Jia
  * All rights reserved.
  *
  * All contributions from Caffe:
  * Copyright(c) 2013, 2014, 2015, the respective contributors
  * All rights reserved.
  *
  * All other contributions:
  * Copyright(c) 2015, 2016 the respective contributors
  * All rights reserved.
  *
  * Caffe2 uses a copyright model similar to Caffe: each contributor holds
  * copyright over their contributions to Caffe2. The project versioning records
  * all such contribution and copyright details. If a contributor wants to further
  * mark their specific copyright on a particular contribution, they should
  * indicate their copyright solely in the commit message of the change when it is
  * committed.
  *
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
  *    and IDIAP Research Institute nor the names of its contributors may be
  *    used to endorse or promote products derived from this software without
  *    specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@@ -70,7 +70,6 @@
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
@@ -84,6 +83,8 @@
 #include "type_shim.h"
 #include "compat.h"

+#define ALIGN_BYTES 16
+
 using Tensor = at::Tensor;
 using TensorList = at::TensorList;
 using ScalarType = at::ScalarType;
@@ -123,7 +124,7 @@ const int max_threads = 1024;
 inline dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) {
   uint64_t block_size = 1;
   uint64_t max_block_size = std::min(dim_size / ILP, static_cast<uint64_t>(max_threads));
-  while (block_size < max_block_size) block_size *= 2;
+  while (block_size < (max_block_size/2)) block_size *= 2;
   // Launch at least a single warp - the kernel assumes that.
   block_size = std::max(block_size, static_cast<uint64_t>(32));
   return dim3(block_size);
@@ -287,29 +288,40 @@ blockReduce(AccumT* smem,
 template <template<typename, typename> class Reduction, int ILP, typename T, typename AccumT>
 __device__ __forceinline__ AccumT
-ilpReduce(T* data,
+ilpReduce(int shift,
+          T* data,
           int size,
           const Reduction<T, AccumT>& r,
           AccumT defaultVal)
 {
+  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LoadT;
   AccumT threadVal = defaultVal;
   int offset = threadIdx.x;

+  // shift and do 1
+  if(shift > 0){
+    data -= shift;
+    size += shift;
+    if(threadIdx.x >= shift){
+      threadVal = r(threadVal, data[offset]);
+    }
+    size -= blockDim.x;
+    data += blockDim.x;
+  }
+
   int last = size % (ILP * blockDim.x);

-  // Body (unroll by ILP times)
-  T tmp[ILP];
-  for (; offset < size - last; offset += blockDim.x * ILP) {
-#pragma unroll
-    for (int j = 0; j < ILP; ++j)
-      tmp[j] = data[offset + j * blockDim.x];
-#pragma unroll
-    for (int j = 0; j < ILP; ++j)
-      threadVal = r(threadVal, tmp[j]);
-  }
+  T v[ILP];
+  LoadT* value = reinterpret_cast<LoadT*>(&v);
+  for (; offset * ILP < (size - last); offset += blockDim.x) {
+    *value = reinterpret_cast<LoadT*>(data)[offset];
+#pragma unroll
+    for (int j = 0; j < ILP; ++j) {
+      threadVal = r(threadVal, v[j]);
+    }
+  }
+
+  offset = size - last + threadIdx.x;
   // Epilogue
   for (; offset < size; offset += blockDim.x)
     threadVal = r(threadVal, data[offset]);
@@ -319,7 +331,8 @@ ilpReduce(T* data,
 template <template<typename, typename> class Reduction1,
           template<typename, typename> class Reduction2,
           int ILP, typename T, typename AccumT>
 __device__ __forceinline__ void
-ilpReduce(T* data,
+ilpReduce(int shift,
+          T* data,
           int size,
           AccumT* reducVal1,
           const Reduction1<T, AccumT>& r1,
@@ -328,27 +341,38 @@ ilpReduce(T* data,
           const Reduction2<T, AccumT>& r2,
           AccumT defaultVal2)
 {
+  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LoadT;
   AccumT threadVal1 = defaultVal1;
   AccumT threadVal2 = defaultVal2;
   int offset = threadIdx.x;

+  // shift and do 1
+  if(shift > 0){
+    data -= shift;
+    size += shift;
+    if(threadIdx.x >= shift){
+      threadVal1 = r1(threadVal1, data[offset]);
+      threadVal2 = r2(threadVal2, data[offset]);
+    }
+    size -= blockDim.x;
+    data += blockDim.x;
+  }
+
   int last = size % (ILP * blockDim.x);

-  // Body (unroll by ILP times)
-  T tmp[ILP];
-  for (; offset < size - last; offset += blockDim.x * ILP) {
-#pragma unroll
-    for (int j = 0; j < ILP; ++j)
-      tmp[j] = data[offset + j * blockDim.x];
-#pragma unroll
-    for (int j = 0; j < ILP; ++j) {
-      threadVal1 = r1(threadVal1, tmp[j]);
-      threadVal2 = r2(threadVal2, tmp[j]);
-    }
-  }
+  T v[ILP];
+  LoadT* value = reinterpret_cast<LoadT*>(&v);
+  for (; offset * ILP < (size - last); offset += blockDim.x) {
+    *value = reinterpret_cast<LoadT*>(data)[offset];
+#pragma unroll
+    for (int j = 0; j < ILP; ++j) {
+      threadVal1 = r1(threadVal1, v[j]);
+      threadVal2 = r2(threadVal2, v[j]);
+    }
+  }
+
+  offset = size - last + threadIdx.x;
   // Epilogue
   for (; offset < size; offset += blockDim.x) {
     threadVal1 = r1(threadVal1, data[offset]);
@@ -375,17 +399,19 @@ cunn_SoftMaxXEntropyForward(
   // each block handles a sample in the mini-batch
   input += blockIdx.x * classes;
   //output += blockIdx.x * classes;
+  const int shift = ((uint64_t)input) % ALIGN_BYTES / sizeof(scalar_t);
   int64_t label = labels[blockIdx.x];

   // find the max and sum
   accscalar_t threadMax, threadSum, max_k, sum_k;
   ilpReduce<MaxFloat, AddFloat, ILP, scalar_t, accscalar_t>(
-      input, classes,
+      shift, input, classes,
       &threadMax, MaxFloat<scalar_t, accscalar_t>(),
       -at::numeric_limits<accscalar_t>::max(),
       &threadSum, AddFloat<scalar_t, accscalar_t>(),
       static_cast<accscalar_t>(0));
   blockReduce<Max, Add, accscalar_t>(
       sdata,
       &max_k, threadMax, Max<accscalar_t>(),
@@ -393,9 +419,7 @@ cunn_SoftMaxXEntropyForward(
       &sum_k, threadSum, Add<accscalar_t>(),
       static_cast<accscalar_t>(0));

-  // reduce all values
   accscalar_t threadExp = ilpReduce<SumExpFloat, ILP, scalar_t, accscalar_t>(
-      input, classes,
+      shift, input, classes,
       SumExpFloat<scalar_t, accscalar_t>(max_k),
       static_cast<accscalar_t>(0));
   accscalar_t sumAll = blockReduce<Add, accscalar_t>(
       sdata, threadExp, Add<accscalar_t>(), static_cast<accscalar_t>(0));
@@ -411,20 +435,16 @@ cunn_SoftMaxXEntropyForward(
   }
 }

-template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t, template<typename, typename, typename> class Epilogue>
-__global__ void
-cunn_SoftMaxXEntropyBackward(
+template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t>
+__device__ __forceinline__ void
+apply(
     scalar_t *gradInput,
     scalar_t *logits,
     outscalar_t *max_log_sum_exp,
     outscalar_t *gradOutput,
     int64_t *labels,
     const float smoothing,
     int classes)
 {
-  gradInput += blockIdx.x * classes;
-  logits += blockIdx.x * classes;
   accscalar_t smooth_positives = 1.0 - smoothing;
   accscalar_t smooth_negatives = smoothing / classes;
   accscalar_t tmpGradOutput = gradOutput[blockIdx.x];
@@ -433,6 +453,7 @@ cunn_SoftMaxXEntropyBackward(
   int offset = threadIdx.x;
   int last = classes % (ILP * blockDim.x);

   for (; offset < classes - last; offset += blockDim.x * ILP) {
     accscalar_t tmpLogits[ILP];
@@ -444,22 +465,112 @@ cunn_SoftMaxXEntropyBackward(
 #pragma unroll
     for (int j = 0; j < ILP; ++j)
       gradInput[offset + j * blockDim.x] = tmpGradOutput * (
          std::exp(tmpLogits[j] - coeff) - static_cast<accscalar_t>(
          (offset + j * blockDim.x == label) ? 1 : 0) *
          smooth_positives - smooth_negatives);
   }

   for (; offset < classes; offset += blockDim.x)
     gradInput[offset] = tmpGradOutput * (std::exp(
         static_cast<accscalar_t>(logits[offset]) - coeff) -
         static_cast<accscalar_t>((offset == label) ? 1 : 0) *
         smooth_positives - smooth_negatives);
 }

+template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t>
+__device__ __forceinline__ void
+aligned_apply(int shift,
+              scalar_t *gradInput,
+              scalar_t *logits,
+              outscalar_t *max_log_sum_exp,
+              outscalar_t *gradOutput,
+              int64_t *labels,
+              const float smoothing,
+              int classes)
+{
+  accscalar_t smooth_positives = 1.0 - smoothing;
+  accscalar_t smooth_negatives = smoothing / classes;
+  accscalar_t tmpGradOutput = gradOutput[blockIdx.x];
+  int64_t label = labels[blockIdx.x];
+  accscalar_t coeff = max_log_sum_exp[blockIdx.x];
+
+  int offset = threadIdx.x;
+
+  // shift and do 1
+  if(shift > 0){
+    logits -= shift;
+    gradInput -= shift;
+    classes += shift;
+    if(threadIdx.x >= shift){
+      gradInput[offset] = tmpGradOutput * (std::exp(
+          static_cast<accscalar_t>(logits[offset]) - coeff) -
+          static_cast<accscalar_t>(((offset - shift) == label) ? 1 : 0) *
+          smooth_positives - smooth_negatives);
+    }
+    classes -= blockDim.x;
+    gradInput += blockDim.x;
+    logits += blockDim.x;
+    shift -= blockDim.x;
+  }
+
+  int last = classes % (ILP * blockDim.x);
+
+  typedef typename std::aligned_storage<ILP*sizeof(scalar_t), ILP*alignof(scalar_t)>::type LoadT;
+  // input
+  scalar_t v[ILP];
+  LoadT* value = reinterpret_cast<LoadT*>(&v);
+  // output
+  scalar_t r[ILP];
+  LoadT* result = reinterpret_cast<LoadT*>(&r);
+
+  for (; offset * ILP < (classes - last); offset += blockDim.x) {
+    *value = reinterpret_cast<LoadT*>(logits)[offset];
+#pragma unroll
+    for (int j = 0; j < ILP; ++j) {
+      r[j] = tmpGradOutput * (std::exp(
+          static_cast<accscalar_t>(v[j]) - coeff) -
+          static_cast<accscalar_t>(((ILP * offset + j - shift) == label) ? 1 : 0) *
+          smooth_positives - smooth_negatives);
+    }
+    reinterpret_cast<LoadT*>(gradInput)[offset] = *result;
+  }
+
+  offset = classes - last + threadIdx.x;
+  for (; offset < classes; offset += blockDim.x)
+    gradInput[offset] = tmpGradOutput * (std::exp(
+        static_cast<accscalar_t>(logits[offset]) - coeff) -
+        static_cast<accscalar_t>(((offset - shift) == label) ? 1 : 0) *
+        smooth_positives - smooth_negatives);
+}
+
+template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t, template<typename, typename, typename> class Epilogue>
+__global__ void
+cunn_SoftMaxXEntropyBackward(
+    scalar_t *gradInput,
+    scalar_t *logits,
+    outscalar_t *max_log_sum_exp,
+    outscalar_t *gradOutput,
+    int64_t *labels,
+    const float smoothing,
+    int classes)
+{
+  gradInput += blockIdx.x * classes;
+  logits += blockIdx.x * classes;
+
+  // Do vectorized load/store when input/output have same alignment
+  const int shift = ((uint64_t)logits) % ALIGN_BYTES / sizeof(scalar_t);
+  const int shift_ = ((uint64_t)gradInput) % ALIGN_BYTES / sizeof(scalar_t);
+  if (shift == shift_){
+    aligned_apply<ILP, scalar_t, accscalar_t, outscalar_t>(shift, gradInput, logits, max_log_sum_exp, gradOutput, labels, smoothing, classes);
+  }
+  else {
+    apply<ILP, scalar_t, accscalar_t, outscalar_t>(gradInput, logits, max_log_sum_exp, gradOutput, labels, smoothing, classes);
+  }
+}
+
 template <template<typename, typename, typename> class Epilogue>
 std::vector<Tensor> host_softmax_xentropy(
@@ -495,13 +606,13 @@ std::vector<Tensor> host_softmax_xentropy(
   // XXX: it assumes that inner_size == 1
   TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");

-  const int ILP = 2;
   dim3 grid(outer_size);
-  dim3 block = SoftMax_getBlockSize(ILP, dim_size);

   using namespace at;
   DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "host_softmax_xentropy",
     using accscalar_t = at::acc_type<scalar_t_0, true>;
+    const int ILP = sizeof(float4)/sizeof(scalar_t_0);
+    dim3 block = SoftMax_getBlockSize(ILP, dim_size);
     if (!half_to_float) {
       cunn_SoftMaxXEntropyForward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
         <<<grid, block, 2 * block.x * sizeof(accscalar_t), stream>>>(
@@ -564,12 +675,12 @@ Tensor host_softmax_xentropy_backward(
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();

   TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");

-  const int ILP = 2;
   dim3 grid(outer_size);
-  dim3 block = SoftMax_getBlockSize(ILP, dim_size);

   DISPATCH_FLOAT_AND_HALF(gI.scalar_type(), 0, "host_softmax_xentropy_backward",
     using accscalar_t = acc_type<scalar_t_0, true>;
+    const int ILP = sizeof(float4)/sizeof(scalar_t_0);
+    dim3 block = SoftMax_getBlockSize(ILP, dim_size);
     if (!half_to_float) {
       cunn_SoftMaxXEntropyBackward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
         <<<grid, block, block.x * sizeof(accscalar_t), stream>>>(
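Note: apply and aligned_apply above compute the same per-class gradient, softmax minus the label-smoothed target, scaled by the incoming gradient. Restated for one row in plain PyTorch, under the assumption that max_log_sum_exp holds the log-sum-exp of the row's logits:

    import torch

    def xent_smoothing_grad(logits, label, grad_output, smoothing):
        coeff = torch.logsumexp(logits, dim=-1)   # role of max_log_sum_exp
        classes = logits.numel()
        smooth_positives = 1.0 - smoothing
        smooth_negatives = smoothing / classes
        one_hot = torch.nn.functional.one_hot(torch.tensor(label), classes).float()
        # exp(logits - logsumexp) is the softmax of the row.
        return grad_output * (torch.exp(logits - coeff)
                              - one_hot * smooth_positives - smooth_negatives)

    print(xent_smoothing_grad(torch.tensor([2.0, 0.5, -1.0]), 0, 1.0, 0.1))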
apex/contrib/multihead_attn/self_multihead_attn_func.py (+1 -1)

@@ -183,7 +183,7 @@ class SelfAttnFunc(torch.autograd.Function):
         values_grads = torch.bmm(dropout_results.transpose(1, 2), output_lin_grads, out=values_grads.transpose(0, 1))
         # Mask and Scaling for Dropout (not a publically documented op)
-        dropout_grads = torch._masked_scale(matmul2_dgrad1, dropout_mask, dropout_prob_t[0])
+        dropout_grads = torch._masked_scale(matmul2_dgrad1, dropout_mask, 1.0/(1.0-dropout_prob_t[0]))
         # Softmax Grad (not a publically documented op)
         softmax_grads = torch._softmax_backward_data(dropout_grads, softmax_results, -1, softmax_results)
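Note: the change swaps the scale passed to the private torch._masked_scale op from the keep probability to its reciprocal, matching inverted dropout. Its presumed effect can be approximated with public ops; a sketch (masked_scale here is a hypothetical stand-in, not the real op):

    import torch

    def masked_scale(x, mask, scale):
        # Presumed behavior of torch._masked_scale: zero the dropped
        # positions, rescale the survivors.
        return x * mask.to(x.dtype) * scale

    p = 0.1
    x = torch.randn(4)
    mask = (torch.rand(4) > p).to(torch.uint8)
    print(masked_scale(x, mask, 1.0 / (1.0 - p)))  # survivors scaled by 1/(1-p)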
apex/mlp/mlp.py (+28 -19)

@@ -7,17 +7,19 @@ from .. import amp

 class MlpFunction(torch.autograd.Function):
     @staticmethod
-    def forward(ctx, *args):
-        output = mlp_cuda.forward(args)
+    def forward(ctx, bias, activation, *args):
+        output = mlp_cuda.forward(bias, activation, args)
         ctx.save_for_backward(*args)
         ctx.outputs = output
+        ctx.bias = bias
+        ctx.activation = activation
         return output[0]

     @staticmethod
     def backward(ctx, grad_o):
-        grads = mlp_cuda.backward(grad_o, ctx.outputs, ctx.saved_tensors)
+        grads = mlp_cuda.backward(ctx.bias, ctx.activation, grad_o, ctx.outputs, ctx.saved_tensors)
         del ctx.outputs
-        return tuple(grads)
+        return (None, None, *grads)

 mlp_function = amp.half_function(MlpFunction.apply)

@@ -29,16 +31,21 @@ class MLP(torch.nn.Module):
         bias (bool): Default True:
         relu (bool): Default True
     """
-    def __init__(self, mlp_sizes, bias=True, relu=True):
-        if not (bias and relu):
-            raise TypeError("bias and relu must be both true.")
+    def __init__(self, mlp_sizes, bias=True, activation='relu'):
         super(MLP, self).__init__()
         self.num_layers = len(mlp_sizes) - 1
         self.mlp_sizes = copy(mlp_sizes)
-        self.bias = bias
-        self.relu = relu
+        self.bias = 1 if bias else 0
+
+        if activation is 'none':
+            self.activation = 0
+        elif activation is 'relu':
+            self.activation = 1
+        elif activation is 'sigmoid':
+            self.activation = 2
+        else:
+            raise TypeError("activation must be relu or none.")

-        # ignoring bias = False now
         self.weights = []
         self.biases = []
         for i in range(self.num_layers):
@@ -46,10 +53,11 @@ class MLP(torch.nn.Module):
             self.weights.append(w)
             name = 'weight_{}'.format(i)
             setattr(self, name, w)
-            b = torch.nn.Parameter(torch.empty(mlp_sizes[i+1]))
-            self.biases.append(b)
-            name = 'bias_{}'.format(i)
-            setattr(self, name, b)
+            if self.bias:
+                b = torch.nn.Parameter(torch.empty(mlp_sizes[i+1]))
+                self.biases.append(b)
+                name = 'bias_{}'.format(i)
+                setattr(self, name, b)

         self.reset_parameters()

@@ -58,13 +66,14 @@ class MLP(torch.nn.Module):
             dimsum = weight.size(0) + weight.size(1)
             std = math.sqrt(2. / float(dimsum))
             nn.init.normal_(weight, 0., std)
-        for bias in self.biases:
-            std = math.sqrt(1. / float(bias.size(0)))
-            nn.init.normal_(bias, 0., std)
+        if self.bias:
+            for bias in self.biases:
+                std = math.sqrt(1. / float(bias.size(0)))
+                nn.init.normal_(bias, 0., std)

     def forward(self, input):
-        return mlp_function(input, *self.weights, *self.biases)
+        return mlp_function(self.bias, self.activation, input, *self.weights, *self.biases)

     def extra_repr(self):
-        s = F"MLP sizes: {self.mlp_sizes}, Bias={self.bias}, ReLU={self.relu}"
+        s = F"MLP sizes: {self.mlp_sizes}, Bias={self.bias}, activation={self.activation}"
         return s
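Note: with this change MLP takes an activation string ('none', 'relu', 'sigmoid') instead of a relu flag, and bias=False is now usable. A usage sketch of the new interface (layer sizes invented; assumes apex was built with its CUDA extensions so apex.mlp imports):

    import torch
    from apex.mlp import MLP

    # 784 -> 256 -> 10 fused MLP with the new activation keyword.
    model = MLP([784, 256, 10], bias=True, activation='relu').cuda().half()
    out = model(torch.randn(32, 784, device='cuda', dtype=torch.half))
    print(out.shape)  # torch.Size([32, 10])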
csrc/layer_norm_cuda_kernel.cu (+10 -4)

@@ -172,8 +172,8 @@ void cuWelfordMuSigma2(
       for (; l+7 < n2; l+=8*numx) {
         for (int k = 0; k < 8; k+=2) {
           float2 curr = __half22float2(*((__half2*)(lvals+l+k)));
-          cuWelfordOnlineSum(curr.x, mu, sigma2, count);
-          cuWelfordOnlineSum(curr.y, mu, sigma2, count);
+          cuWelfordOnlineSum<float>(curr.x, mu, sigma2, count);
+          cuWelfordOnlineSum<float>(curr.y, mu, sigma2, count);
         }
       }
       for (; l < n2; ++l) {
@@ -230,9 +230,15 @@ void cuWelfordMuSigma2(
 template<typename U> U rsqrt(U v) {
   return U(1) / sqrt(v);
 }
+#if defined __HIP_PLATFORM_HCC__
+__device__ float rsqrt(float v) {
+  return rsqrtf(v);
+}
+#else
 template<> float rsqrt(float v) {
   return rsqrtf(v);
 }
+#endif
 template<> double rsqrt(double v) {
   return rsqrt(v);
 }
@@ -293,7 +299,7 @@ void cuApplyLayerNorm(
   // 1) blockDim.x == warpSize
   // 2) Tensors are contiguous
   //
-  for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
+  for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
     SharedMemory<U> shared;
     U* buf = shared.getPointer();
     U mu,sigma2;
@@ -531,7 +537,7 @@ void cuComputeGradInput(
     const T* gamma,
     T* grad_input)
 {
-  for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
+  for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
     U sum_loss1 = U(0);
     U sum_loss2 = U(0);
     const U c_mean = mean[i1];
csrc/mlp.cpp (+41 -16)

@@ -19,7 +19,9 @@ int mlp_fp(
     int* output_features,
     T** BPtr,
     T* Y,
-    T* reserved_space);
+    T* reserved_space,
+    int use_bias,
+    int activation);

 template <typename T>
 int mlp_bp(
@@ -35,11 +37,18 @@ int mlp_bp(
     T* work_space,
     T* dX,
     T** dwPtr,
-    T** dbPtr);
+    T** dbPtr,
+    bool requires_grad,
+    int use_bias,
+    int activation);

-std::vector<at::Tensor> mlp_forward(std::vector<at::Tensor> inputs) {
-  auto num_layers = (inputs.size() - 1) / 2;  // inputs contains (input, weights, biases)
+std::vector<at::Tensor> mlp_forward(int use_bias, int activation, std::vector<at::Tensor> inputs) {
+  auto num_layers = inputs.size() - 1;
+  // inputs contains (input, weights, biases)
+  if (use_bias) {
+    num_layers /= 2;
+  }
   auto batch_size = inputs[0].size(0);
   auto input_features = inputs[0].size(1);
@@ -60,7 +69,9 @@ std::vector<at::Tensor> mlp_forward(std::vector<at::Tensor> inputs) {
     std::vector<scalar_t*> b_ptr;
     for (int i = 0; i < num_layers; i++) {
       w_ptr.push_back(inputs[i + 1].data_ptr<scalar_t>());
-      b_ptr.push_back(inputs[i + 1 + num_layers].data_ptr<scalar_t>());
+      if (use_bias) {
+        b_ptr.push_back(inputs[i + 1 + num_layers].data_ptr<scalar_t>());
+      }
     }
     auto result = mlp_fp<scalar_t>(
         inputs[0].data_ptr<scalar_t>(),
@@ -71,37 +82,48 @@ std::vector<at::Tensor> mlp_forward(std::vector<at::Tensor> inputs) {
         output_features.data(),
         b_ptr.data(),
         out.data_ptr<scalar_t>(),
-        reserved_space.data_ptr<scalar_t>());
+        reserved_space.data_ptr<scalar_t>(),
+        use_bias,
+        activation);
   });

   return {out, reserved_space};
 }

 std::vector<at::Tensor> mlp_backward(
+    int use_bias,
+    int activation,
     at::Tensor grad_o,
     std::vector<at::Tensor> fprop_outputs,
     std::vector<at::Tensor> inputs) {
-  // same code to get sizes and W pointers
-  auto num_layers = (inputs.size() - 1) / 2;
+  auto num_layers = inputs.size() - 1;
+  if (use_bias) {
+    // inputs contains (input, weights, biases)
+    num_layers /= 2;
+  }
   auto batch_size = inputs[0].size(0);
   auto input_features = inputs[0].size(1);

+  // TODO: not creating empty tensor for it?
+  bool requires_grad = inputs[0].requires_grad();
+
   std::vector<int> output_features;
   for (int i = 0; i < num_layers; i++) {
     output_features.push_back(inputs[i + 1].size(0));
   }
   // create outputs, length of inputs
+  // TODO: not create bias if not needed
   std::vector<at::Tensor> outputs;
   for (int i = 0; i < inputs.size(); i++) {
     outputs.push_back(at::empty(inputs[i].sizes(), inputs[i].type()));  // clone for testing now
   }

-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(inputs[0].type(), "mlp_forward", [&] {
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(inputs[0].type(), "mlp_backward", [&] {
     std::vector<scalar_t*> w_ptr;
-    std::vector<scalar_t*> b_ptr;
     for (int i = 0; i < num_layers; i++) {
       w_ptr.push_back(inputs[i + 1].data_ptr<scalar_t>());
-      b_ptr.push_back(inputs[i + 1 + num_layers].data_ptr<scalar_t>());
     }
     std::vector<scalar_t*> outputs_ptr;
     for (int i = 0; i < inputs.size(); i++) {
@@ -127,7 +149,10 @@ std::vector<at::Tensor> mlp_backward(
         work_space.data_ptr<scalar_t>(),
         outputs_ptr[0],
         outputs_ptr.data() + 1,
-        outputs_ptr.data() + 1 + num_layers);
+        outputs_ptr.data() + 1 + num_layers,
+        requires_grad,
+        use_bias,
+        activation);
   });

   return outputs;
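Note: the recurring num_layers arithmetic encodes the packing of the flat inputs vector. Restated in Python for clarity:

    def count_layers(inputs, use_bias):
        # inputs is packed as (input, w_0..w_{L-1}, b_0..b_{L-1}) when biases
        # are present, and (input, w_0..w_{L-1}) otherwise.
        num_layers = len(inputs) - 1
        if use_bias:
            num_layers //= 2
        return num_layers

    print(count_layers(['x', 'w0', 'w1', 'b0', 'b1'], use_bias=True))   # 2
    print(count_layers(['x', 'w0', 'w1'], use_bias=False))              # 2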
csrc/mlp_cuda.cu (+593 -100)

@@ -10,8 +10,11 @@
 #include <cublas_v2.h>
 #include <cuda_runtime.h>

-#define BIASADDRELU_FPROP_NUM_THREADS 128
-#define BIASADDRELU_BPROP_NUM_THREADS 128
+// constants for fused bias+relu kernel
+#define BIAS_RELU_FW_NTHREADS 128 // forward number of thread per block
+#define BIAS_RELU_BW_NTHREADS_X 32 // backward number of thread in feature dim
+#define BIAS_RELU_BW_NTHREADS_Y 16 // backward number of thread in batch dim
+#define BIAS_RELU_RED_PER_THREAD 16 // backward minimal reduction length per thread

 // move to a header later on
 #define ILP 4
@@ -42,6 +45,12 @@ __device__ __inline__ float relu(float a) {
   return (retf);
 }

+// Keep Sigmoid in float only. When using half, cast to float before calling.
+__device__ __inline__ float sigmoid(float a) {
+  float retf = 1.f / (1.f + expf(-a));
+  return (retf);
+}
+
 // FP64 Wrapper around cublas GEMMEx
 cublasStatus_t mlp_gemm(
     cublasHandle_t handle,
@@ -156,9 +165,55 @@ cublasStatus_t mlp_gemm(
       CUBLAS_GEMM_DEFAULT_TENSOR_OP);
 }

-// Bias ADD + ReLU. Assume input X is [features x batch size], assume column major.
+// Bias ADD. Assume input X is [features x batch size], column major.
 // Bias is one 'features' long vector, with implicit broadcast.
+// Currently, activation support fuesed ReLU. Safe to call in-place.
+template <typename T>
+__global__ void biasAdd_fprop(T *X, T *b, uint batch_size, uint features) {
+  T r_x[ILP];
+  T r_b[ILP];
+  if(is_aligned(X) && is_aligned(b) && features % ILP == 0) {
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    for (; tid*ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
+      int row = tid % (features / ILP);
+      load_store(r_x, X, 0, tid);
+      load_store(r_b, b, 0, row);
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++) {
+        float bias_sum = static_cast<float>(r_x[ii]) + static_cast<float>(r_b[ii]);
+        r_x[ii] = bias_sum;
+      }
+      load_store(X, r_x, tid, 0);
+    }
+  } else {
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++) {
+        int idx = tid + ii * blockDim.x * gridDim.x;
+        if(idx < features * batch_size) {
+          int row = tid % features;
+          r_x[ii] = X[idx];
+          r_b[ii] = b[row];
+        }
+      }
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++) {
+        float bias_sum = static_cast<float>(r_x[ii]) + static_cast<float>(r_b[ii]);
+        r_x[ii] = bias_sum;
+      }
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++) {
+        int idx = tid + ii * blockDim.x * gridDim.x;
+        if(idx < features * batch_size) {
+          X[idx] = r_x[ii];
+        }
+      }
+    }
+  }
+}
+
+// Bias ADD + ReLU. Assume input X is [features x batch size], column major.
+// Activation support fuesed ReLU. Safe to call in-place.
 template <typename T>
 __global__ void biasAddRelu_fprop(T *X, T *b, uint batch_size, uint features) {
   T r_x[ILP];
@@ -204,32 +259,308 @@ __global__ void biasAddRelu_fprop(T *X, T *b, uint batch_size, uint features) {
   }
 }

+// ReLU. Assume input X is [features x batch size], column major.
+// Safe to call in-place.
+template <typename T>
+__global__ void Relu_fprop(T *X, uint batch_size, uint features) {
+  T r_x[ILP];
+  if(is_aligned(X) && features % ILP == 0) {
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    for (; tid*ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
+      load_store(r_x, X, 0, tid);
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++) {
+        r_x[ii] = relu(static_cast<float>(r_x[ii]));
+      }
+      load_store(X, r_x, tid, 0);
+    }
+  } else {
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++) {
+        int idx = tid + ii * blockDim.x * gridDim.x;
+        if(idx < features * batch_size) {
+          r_x[ii] = X[idx];
+        }
+      }
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++) {
+        r_x[ii] = relu(static_cast<float>(r_x[ii]));
+      }
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++) {
+        int idx = tid + ii * blockDim.x * gridDim.x;
+        if(idx < features * batch_size) {
+          X[idx] = r_x[ii];
+        }
+      }
+    }
+  }
+}
+
+// Sigmoid. Assume input X is [features x batch size], column major.
+// Safe to call in-place.
+template <typename T>
+__global__ void Sigmoid_fprop(T *X, uint batch_size, uint features) {
+  T r_x[ILP];
+  if(is_aligned(X) && features % ILP == 0) {
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    for (; tid*ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
+      load_store(r_x, X, 0, tid);
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++) {
+        r_x[ii] = sigmoid(static_cast<float>(r_x[ii]));
+      }
+      load_store(X, r_x, tid, 0);
+    }
+  } else {
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++) {
+        int idx = tid + ii * blockDim.x * gridDim.x;
+        if(idx < features * batch_size) {
+          r_x[ii] = X[idx];
+        }
+      }
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++) {
+        r_x[ii] = sigmoid(static_cast<float>(r_x[ii]));
+      }
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++) {
+        int idx = tid + ii * blockDim.x * gridDim.x;
+        if(idx < features * batch_size) {
+          X[idx] = r_x[ii];
+        }
+      }
+    }
+  }
+}
+
+// ReLU. Assume input X is [features x batch size], column major.
+// Safe to call in-place.
+template <typename T>
+__global__ void Relu_bprop(T *dY, T *Y, uint batch_size, uint features, T *dX) {
+  T r_dy[ILP];
+  T r_y[ILP];
+  if(is_aligned(dY) && is_aligned(Y) && is_aligned(dX) && features % ILP == 0) {
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    for (; tid*ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
+      load_store(r_dy, dY, 0, tid);
+      load_store(r_y, Y, 0, tid);
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++){
+        if ((float)r_y[ii] <= 0.f)
+          r_dy[ii] = 0;
+      }
+      load_store(dX, r_dy, tid, 0);
+    }
+  } else {
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++) {
+        int idx = tid + ii * blockDim.x * gridDim.x;
+        if(idx < features * batch_size) {
+          r_dy[ii] = dY[idx];
+          r_y[ii] = Y[idx];
+        }
+      }
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++) {
+        if ((float)r_y[ii] <= 0.f)
+          r_dy[ii] = 0;
+      }
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++) {
+        int idx = tid + ii * blockDim.x * gridDim.x;
+        if(idx < features * batch_size) {
+          dX[idx] = r_dy[ii];
+        }
+      }
+    }
+  }
+}
+
+// Sigmoid. Assume input X is [features x batch size], column major.
+// Safe to call in-place.
+template <typename T>
+__global__ void Sigmoid_bprop(T *dY, T *Y, uint batch_size, uint features, T *dX) {
+  T r_dy[ILP];
+  T r_y[ILP];
+  if(is_aligned(dY) && is_aligned(Y) && is_aligned(dX) && features % ILP == 0) {
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    for (; tid*ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
+      load_store(r_dy, dY, 0, tid);
+      load_store(r_y, Y, 0, tid);
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++){
+        float grad_out = r_dy[ii];
+        float out = r_y[ii];
+        float grad_i = out * (1.f - out) * grad_out;
+        r_dy[ii] = grad_i;
+      }
+      load_store(dX, r_dy, tid, 0);
+    }
+  } else {
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++) {
+        int idx = tid + ii * blockDim.x * gridDim.x;
+        if(idx < features * batch_size) {
+          r_dy[ii] = dY[idx];
+          r_y[ii] = Y[idx];
+        }
+      }
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++) {
+        float grad_out = r_dy[ii];
+        float out = r_y[ii];
+        float grad_i = out * (1.f - out) * grad_out;
+        r_dy[ii] = grad_i;
+      }
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++) {
+        int idx = tid + ii * blockDim.x * gridDim.x;
+        if(idx < features * batch_size) {
+          dX[idx] = r_dy[ii];
+        }
+      }
+    }
+  }
+}
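Note: Sigmoid_bprop applies the closed-form derivative through the saved forward output, dX = Y * (1 - Y) * dY, never re-reading the input. A quick PyTorch cross-check of that identity:

    import torch

    x = torch.randn(5, requires_grad=True)
    y = torch.sigmoid(x)
    dy = torch.randn(5)
    y.backward(dy)
    manual = y.detach() * (1 - y.detach()) * dy   # what the kernel computes
    print(torch.allclose(x.grad, manual))         # True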
 // Compute grid size for pointwise backward kernel.
-// Some intelligence needed to determine number of splits for reduction.
+// block_x/y is total elment being handled per block, not number of threads
 void get_biasAddRelu_bprop_grid_size(
     int yfeat,
-    int threadsPerBlock,
     int batch_size,
+    int block_x,
+    int block_y,
     int* grid_x,
     int* grid_y) {
+
+  *grid_x = (yfeat + block_x - 1) / block_x;
   // Get number of SMs for efficient reduction.
   int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
-  // First preference, whole reduction in 1 CTA
-  int nBlocks = (yfeat + threadsPerBlock - 1) / threadsPerBlock;
-  // Figure out how many splits to divide reduction into. At least 32 elements per CTA.
-  // we want grid_y as close to sqrt(batchsize)?
-  int nRedSplits = std::sqrt(batch_size);
-  // for batchsize <=64, just use 1 block
-  if (batch_size < 64) nRedSplits = 1;
-  // no need to go over occupancy
-  nRedSplits = min((8 * num_SMs) / nBlocks, nRedSplits);
-  *grid_x = nBlocks;
-  *grid_y = nRedSplits;
+  // can switch to occupancy calculation. use 4 below now for sm_70
+  int max_blocks_y = num_SMs * 4 / (*grid_x);
+  // block_y should be from minimal work per thread
+  int nRedSplits = (batch_size + block_y - 1) / block_y;
+  // increase number of elem per thread redcution to not launch more than enough
+  // kernel adjust work, so here we just launch max block
+  *grid_y = std::min(nRedSplits, max_blocks_y);
   return;
 }
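Note: the new launch geometry covers the feature dimension with grid_x tiles and caps the number of batch-reduction splits at roughly four resident CTAs per SM. Restated in Python (num_SMs stands in for the device query; values invented):

    def bprop_grid_size(yfeat, batch_size, block_x, block_y, num_SMs):
        grid_x = (yfeat + block_x - 1) // block_x          # tiles over features
        max_blocks_y = num_SMs * 4 // grid_x               # occupancy cap
        n_red_splits = (batch_size + block_y - 1) // block_y
        return grid_x, min(n_red_splits, max_blocks_y)

    print(bprop_grid_size(yfeat=1024, batch_size=4096, block_x=32, block_y=256, num_SMs=80))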
+// Addition done deterministically via a 2-pass approach. Each CTA writes out partial
+// sum, and the last CTA in grid Y dimension accumulates partials serially and writes to result.
+template <typename T, int UNROLL_FACTOR>
+__global__ void biasAdd_bprop(
+    T* dY,
+    int features,
+    int batch_size,
+    volatile float* intermediate,
+    int* semaphores,
+    T* db) {
+  // The feature that this thread is responsible for
+  int f = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Compute the span this thread is responsible for
+  // For this block
+  int b_chunkSize = (batch_size + gridDim.y - 1) / gridDim.y;
+  int b_nStart = blockIdx.y * b_chunkSize;
+  int b_nSpan = min(batch_size, b_nStart + b_chunkSize) - b_nStart;
+  // For this thread
+  int chunkSize = (b_chunkSize + blockDim.y - 1) / blockDim.y;
+  int nStart = threadIdx.y * chunkSize + b_nStart;
+  int nSpan = min(b_nStart + b_nSpan, nStart + chunkSize) - nStart;
+  volatile float* out = intermediate + blockIdx.y * features;
+
+  // Flag to trigger last reduction.
+  __shared__ bool isLastBlock;
+  // we know block size for now
+  __shared__ float smem[BIAS_RELU_BW_NTHREADS_X * BIAS_RELU_BW_NTHREADS_Y];
+
+  // Accumulate db in FP32 always
+  float db_local = 0;
+  if (f < features) {
+    int nidx = 0;
+    // Handle non-multiple of UNROLL_FACTOR residue
+    for (; nidx < nSpan % UNROLL_FACTOR; nidx++) {
+      int row, col, flat_idx;
+      row = f;
+      col = nStart + nidx;
+      flat_idx = col * features + row;
+      db_local += (float)dY[flat_idx];
+    }
+
+    // Handle meat of work
+    for (; (nidx + UNROLL_FACTOR - 1) < nSpan; nidx += UNROLL_FACTOR) {
+      int row, col, flat_idx;
+      row = f;
+      col = nStart + nidx;
+      flat_idx = col * features + row;
+#pragma unroll 4
+      for (int u = 0; u < UNROLL_FACTOR; u++) {
+        db_local += (float)dY[flat_idx];
+        flat_idx += features;
+      }
+    }
+
+    // naive block reduction on y-dim
+    int linear_idx = threadIdx.y * blockDim.x + threadIdx.x;
+    smem[linear_idx] = db_local;
+  }
+  __syncthreads();
+  if (f < features) {
+    if (threadIdx.y == 0) {
+      for (int yidx = 1; yidx < blockDim.y; yidx++) {
+        db_local += smem[yidx * blockDim.x + threadIdx.x];
+      }
+
+      // block result is in db_local now for all threadIdx.y == 0
+      // Write out partial result
+      out[f] = db_local;
+    }
+  }
+  __threadfence();
+  __syncthreads();
+
+  // Increment semaphore and check if this is the last CTA in the grid_y dimension.
+  // Only thread (0,0) calls this
+  if (threadIdx.x == 0 && threadIdx.y == 0 && f < features) {
+    unsigned int sum_idx;
+    sum_idx = atomicAdd(&(semaphores[blockIdx.x]), 1);
+    isLastBlock = (sum_idx == (gridDim.y - 1));
+  }
+  __syncthreads();
+
+  db_local = 0;
+  // No block reduction for now, only thread (*,0) do grid reduction
+  if (isLastBlock && f < features) {
+    if (threadIdx.y == 0) {
+      for (int n = 0; n < gridDim.y; n++) {
+        int row, col;
+        row = f;
+        col = n;
+        db_local += (float)(intermediate[col * features + row]);
+      }
+      db[f] = (T)db_local;
+    }
+  }
+}
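The kernel above leans on a semaphore handshake to make the second pass deterministic. The sketch below isolates just that idiom under stated assumptions: the host zeroes `semaphore` before launch (the real code does this with cudaMemsetAsync), and the buffer names are hypothetical.

// Minimal sketch of the "last CTA reduces the partials" pattern.
__global__ void last_block_reduce_sketch(const float* partials, int num_parts,
                                         int* semaphore, float* result) {
  __shared__ bool is_last;
  // ... each block would first write its own partial into partials[] here ...
  __threadfence();               // make this block's partial visible to others
  if (threadIdx.x == 0) {
    // atomicAdd returns the old value, so exactly one block sees num_parts - 1
    is_last = (atomicAdd(semaphore, 1) == num_parts - 1);
  }
  __syncthreads();
  if (is_last && threadIdx.x == 0) {
    float sum = 0.f;
    for (int i = 0; i < num_parts; i++)  // serial accumulation keeps the order,
      sum += partials[i];                // and hence the result, deterministic
    *result = sum;
  }
}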
 // Addition done deterministically via a 2-pass approach. Each CTA writes out partial
 // sum, and the last CTA in grid Y dimension accumulates partials serially and writes to result.
 template <typename T, int UNROLL_FACTOR>
...
@@ -245,14 +576,22 @@ __global__ void biasAddRelu_bprop(
   // The feature that this thread is responsible for
   int f = blockIdx.x * blockDim.x + threadIdx.x;

-  // Compute the batch span this thread is responsible for
-  int chunkSize = (batch_size + gridDim.y - 1) / gridDim.y;
-  int nStart = blockIdx.y * chunkSize;
-  int nSpan = min(batch_size, nStart + chunkSize) - nStart;
+  // Compute the span this thread is responsible for
+  // For this block
+  int b_chunkSize = (batch_size + gridDim.y - 1) / gridDim.y;
+  int b_nStart = blockIdx.y * b_chunkSize;
+  int b_nSpan = min(batch_size, b_nStart + b_chunkSize) - b_nStart;
+  // For this thread
+  int chunkSize = (b_chunkSize + blockDim.y - 1) / blockDim.y;
+  int nStart = threadIdx.y * chunkSize + b_nStart;
+  int nSpan = min(b_nStart + b_nSpan, nStart + chunkSize) - nStart;
   volatile float* out = intermediate + blockIdx.y * features;

   // Flag to trigger last reduction.
   __shared__ bool isLastBlock;
+  // we know block size for now
+  __shared__ float smem[BIAS_RELU_BW_NTHREADS_X * BIAS_RELU_BW_NTHREADS_Y];

   // Accumulate db in FP32 always
   float db_local = 0;
...
@@ -296,15 +635,28 @@ __global__ void biasAddRelu_bprop(
       }
     }
-    // Write out partial result
-    out[f] = db_local;
+    // naive block reduction on y-dim
+    int linear_idx = threadIdx.y * blockDim.x + threadIdx.x;
+    smem[linear_idx] = db_local;
+  }
+  __syncthreads();
+  if (f < features) {
+    if (threadIdx.y == 0) {
+      for (int yidx = 1; yidx < blockDim.y; yidx++) {
+        db_local += smem[yidx * blockDim.x + threadIdx.x];
+      }
+      // block result is in db_local now for all threadIdx.y == 0
+      // Write out partial result
+      out[f] = db_local;
+    }
   }
   __threadfence();
   __syncthreads();

   // Increment semaphore and check if this is the last CTA in the grid_y dimension.
-  if (threadIdx.x == 0 && f < features) {
+  // Only thread (0,0) calls this
+  if (threadIdx.x == 0 && threadIdx.y == 0 && f < features) {
     unsigned int sum_idx;
     sum_idx = atomicAdd(&(semaphores[blockIdx.x]), 1);
     isLastBlock = (sum_idx == (gridDim.y - 1));
...
@@ -312,14 +664,17 @@ __global__ void biasAddRelu_bprop(
   __syncthreads();

   db_local = 0;
+  // No block reduction for now, only thread (*,0) do grid reduction
   if (isLastBlock && f < features) {
-    for (int n = 0; n < gridDim.y; n++) {
-      int row, col;
-      row = f;
-      col = n;
-      db_local += (float)(intermediate[col * features + row]);
+    if (threadIdx.y == 0) {
+      for (int n = 0; n < gridDim.y; n++) {
+        int row, col;
+        row = f;
+        col = n;
+        db_local += (float)(intermediate[col * features + row]);
+      }
+      db[f] = (T)db_local;
     }
-    db[f] = (T)db_local;
   }
 }
...
@@ -338,10 +693,16 @@ __global__ void biasAddRelu_bprop_aligned(
   // The feature that this thread is responsible for
   int f = blockIdx.x * blockDim.x + threadIdx.x;

-  // Compute the batch span this thread is responsible for
-  int chunkSize = (batch_size + gridDim.y - 1) / gridDim.y;
-  int nStart = blockIdx.y * chunkSize;
-  int nSpan = min(batch_size, nStart + chunkSize) - nStart;
+  // Compute the span this thread is responsible for
+  // For this block
+  int b_chunkSize = (batch_size + gridDim.y - 1) / gridDim.y;
+  int b_nStart = blockIdx.y * b_chunkSize;
+  int b_nSpan = min(batch_size, b_nStart + b_chunkSize) - b_nStart;
+  // For this thread
+  int chunkSize = (b_chunkSize + blockDim.y - 1) / blockDim.y;
+  int nStart = threadIdx.y * chunkSize + b_nStart;
+  int nSpan = min(b_nStart + b_nSpan, nStart + chunkSize) - nStart;
   volatile float* out = intermediate + blockIdx.y * features;

   // Flag to trigger last reduction.
...
@@ -399,24 +760,45 @@ __global__ void biasAddRelu_bprop_aligned(
       }
     }
-  if (gridDim.y == 1) {
-#pragma unroll
-    for (int ii = 0; ii < ILP; ii++) {
-      r_dy[ii] = db_local[ii]; // reuse local dy buffer
-    }
-    load_store(db, r_dy, f, 0);
-    return;
-  }
-  // Write out partial result
-  load_store(out, db_local, f, 0);
+  // we know block size for now
+  __shared__ float smem[BIAS_RELU_BW_NTHREADS_X * BIAS_RELU_BW_NTHREADS_Y * ILP];
+  // naive block reduction on y-dim
+  int linear_idx = threadIdx.y * blockDim.x + threadIdx.x;
+  float* smem_out = smem + ILP * linear_idx;
+#pragma unroll
+  for (int ii = 0; ii < ILP; ii++) {
+    smem_out[ii] = db_local[ii]; // reuse local dy buffer
+  }
+  __syncthreads();
+  if (threadIdx.y == 0) {
+    for (int yidx = 1; yidx < blockDim.y; yidx++) {
+      float* smem_in = smem + ILP * (yidx * blockDim.x + threadIdx.x);
+#pragma unroll
+      for (int ii = 0; ii < ILP; ii++) {
+        db_local[ii] += smem_in[ii]; // reuse local dy buffer
+      }
+    }
+    // block result is in db_local now for all threadIdx.y == 0
+    // TODO: maybe not useful early exit here
+    if (gridDim.y == 1) {
+#pragma unroll
+      for (int ii = 0; ii < ILP; ii++) {
+        r_dy[ii] = db_local[ii]; // reuse local dy buffer
+      }
+      load_store(db, r_dy, f, 0);
+      return;
+    }
+    // Write out partial result
+    load_store(out, db_local, f, 0);
+  }
   __threadfence();
   __syncthreads();

   // Increment semaphore and check if this is the last CTA in the grid_y dimension.
-  if (threadIdx.x == 0) {
+  // Only thread (0,0) calls this
+  if (threadIdx.x == 0 && threadIdx.y == 0) {
     unsigned int sum_idx;
     sum_idx = atomicAdd(&(semaphores[blockIdx.x]), 1);
     isLastBlock = (sum_idx == (gridDim.y - 1));
...
@@ -428,22 +810,26 @@ __global__ void biasAddRelu_bprop_aligned(
     db_local[ii] = 0.f;
   }
   float r_db[ILP];

+  // No block reduction for now, only thread (*,0) do grid reduction
   if (isLastBlock) {
-    for (int n = 0; n < gridDim.y; n++) {
-      int row, col;
-      row = f;
-      col = n;
-      load_store(r_db, intermediate, 0, col * features / ILP + row);
-#pragma unroll
-      for (int ii = 0; ii < ILP; ii++) {
-        db_local[ii] += r_db[ii];
-      }
-    }
-#pragma unroll
-    for (int ii = 0; ii < ILP; ii++) {
-      r_dy[ii] = db_local[ii]; // reuse local dy buffer
-    }
-    load_store(db, r_dy, f, 0);
+    if (threadIdx.y == 0) {
+      for (int n = 0; n < gridDim.y; n++) {
+        int row, col;
+        row = f;
+        col = n;
+        load_store(r_db, intermediate, 0, col * features / ILP + row);
+#pragma unroll
+        for (int ii = 0; ii < ILP; ii++) {
+          db_local[ii] += r_db[ii];
+        }
+      }
+#pragma unroll
+      for (int ii = 0; ii < ILP; ii++) {
+        r_dy[ii] = db_local[ii]; // reuse local dy buffer
+      }
+      load_store(db, r_dy, f, 0);
+    }
   }
 }
...
@@ -502,10 +888,20 @@ size_t get_reduction_scratch_space(int batch_size, int num_layers, const int* ou
   size_t max_scratch_space = 0;
   // Loop over all layers to see which one needs the max scratch space
   for (int l = 0; l < num_layers; l++) {
-    int tmp, num_splits;
-    get_biasAddRelu_bprop_grid_size(
-      output_features[l], BIASADDRELU_BPROP_NUM_THREADS, batch_size, &tmp, &num_splits);
-    max_scratch_space = std::max(max_scratch_space, (size_t)(output_features[l] * num_splits));
+    // need to find max(aligned, not_aligned)
+    int tmp, res0, res1;
+
+    int block_x = BIAS_RELU_BW_NTHREADS_X;
+    int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
+    get_biasAddRelu_bprop_grid_size(
+      output_features[l], batch_size, block_x, block_y, &tmp, &res0);
+
+    block_x = ILP * BIAS_RELU_BW_NTHREADS_X;
+    get_biasAddRelu_bprop_grid_size(
+      output_features[l], batch_size, block_x, block_y, &tmp, &res1);
+
+    max_scratch_space = std::max(max_scratch_space, (size_t)(output_features[l] * res0));
+    max_scratch_space = std::max(max_scratch_space, (size_t)(output_features[l] * res1));
   }

   return max_scratch_space;
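A hedged restatement of the sizing logic: the scratch buffer holds one float per (feature, grid_y split) pair, and since the aligned launch widens block_x by ILP and can land on a different grid_y, the code takes the max over both shapes. The constants below stand in for the BIAS_RELU_* macros and are illustrative only:

#include <algorithm>
#include <cstddef>

// grid-y heuristic repeated inline so this sketch stands alone
static int grid_y_for(int yfeat, int batch, int bx, int by, int sms) {
  int gx = (yfeat + bx - 1) / bx;
  return std::min((batch + by - 1) / by, sms * 4 / gx);
}

// Scratch floats for one layer: features * grid_y, maxed over the two
// launch shapes (plain block_x vs ILP-widened block_x).
size_t layer_scratch(int yfeat, int batch, int sms) {
  const int ILP = 4, BX = 32, BY = 64;  // assumed tile constants
  size_t s = (size_t)yfeat * grid_y_for(yfeat, batch, BX, BY, sms);
  size_t t = (size_t)yfeat * grid_y_for(yfeat, batch, ILP * BX, BY, sms);
  return std::max(s, t);
}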
...
@@ -581,7 +977,9 @@ int mlp_fp(
     int* output_features,
     T** BPtr,
     T* Y,
-    T* reserved_space) {
+    T* reserved_space,
+    int use_bias,
+    int activation) {
   T *weight, *input, *output, *bias;
   T *reserved_space_x, *reserved_space_y;
   reserved_space_x = NULL;
...
@@ -597,7 +995,9 @@ int mlp_fp(
     weight = WPtr[layer];
     input = (layer == 0) ? X : reserved_space_x;
     output = (layer == num_layers - 1) ? Y : reserved_space_y;
-    bias = BPtr[layer];
+    if (use_bias) {
+      bias = BPtr[layer];
+    }
     int ifeat = (layer == 0) ? input_features : output_features[layer - 1];
     int ofeat = output_features[layer];
...
@@ -627,12 +1027,33 @@ int mlp_fp(
       return 1;
     }

+    // Call biasReLU
     const uint& input_size = ofeat;
     int num_blocks = 0;
     int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
-    // Call biasReLU
-    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, biasAddRelu_fprop<T>, BIASADDRELU_FPROP_NUM_THREADS, 0);
-    biasAddRelu_fprop<<<num_SMs * num_blocks, BIASADDRELU_FPROP_NUM_THREADS, 0, stream>>>(output, bias, batch_size, input_size);
+    if (use_bias == 1) {
+      if (activation == 0) { // no activation
+        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, biasAdd_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
+        biasAdd_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(output, bias, batch_size, input_size);
+      } else if (activation == 1) { // relu
+        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, biasAddRelu_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
+        biasAddRelu_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(output, bias, batch_size, input_size);
+      } else if (activation == 2) { // sigmoid
+        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, biasAdd_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
+        biasAdd_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(output, bias, batch_size, input_size);
+        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, Sigmoid_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
+        Sigmoid_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(output, batch_size, input_size);
+      }
+    } else {
+      // don't need to do anything in case of no activation and no bias
+      if (activation == 1) { // relu
+        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, Relu_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
+        Relu_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(output, batch_size, input_size);
+      } else if (activation == 2) { // sigmoid
+        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, Sigmoid_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
+        Sigmoid_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(output, batch_size, input_size);
+      }
+    }

     // Set current output as next layer input
     reserved_space_x = reserved_space_y;
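The forward launches above all follow the same occupancy-driven pattern. This standalone sketch (kernel and launcher names are illustrative, not the commit's) shows its shape: a grid-stride loop sized from cudaOccupancyMaxActiveBlocksPerMultiprocessor so the grid exactly fills the device.

#include <cuda_runtime.h>

__global__ void relu_fprop_sketch(float* y, int n) {
  // grid-stride loop: a fixed-size grid walks the whole array
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x)
    y[i] = fmaxf(y[i], 0.f);
}

void launch_relu_sketch(float* y, int n, cudaStream_t stream) {
  const int threads = 128;
  int blocks_per_sm = 0;
  // how many CTAs of this kernel can be resident per SM
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &blocks_per_sm, relu_fprop_sketch, threads, /*dynamicSmem=*/0);
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);
  // fill the device: num_SMs * resident CTAs per SM
  relu_fprop_sketch<<<prop.multiProcessorCount * blocks_per_sm,
                      threads, 0, stream>>>(y, n);
}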
...
@@ -660,7 +1081,10 @@ int mlp_bp(
     T* work_space,
     T* dX,
     T** dwPtr,
-    T** dbPtr) {
+    T** dbPtr,
+    bool requires_grad,
+    int use_bias,
+    int activation) {
   T* weight;
   T *dweight, *dx, *dy, *dbias;
   T *x, *y;
...
@@ -719,32 +1143,85 @@ int mlp_bp(
       float one = 1.f;
       float zero = 0.f;

       // Call bias ReLU backprop - first implementation, 1 thread per bias element
-      int threadsPerBlock = BIASADDRELU_BPROP_NUM_THREADS;
-      int grid_x, grid_y;
-      // bgrad
-      get_biasAddRelu_bprop_grid_size(yfeat, threadsPerBlock, batch_size, &grid_x, &grid_y);
-      dim3 block(threadsPerBlock);
-      cudaMemsetAsync(semaphores, 0, semaphore_size, stream);
-      if (yfeat % (ILP * threadsPerBlock) == 0 &&
-          is_aligned(y) &&
-          is_aligned(dy) &&
-          is_aligned(dy_gemm) &&
-          is_aligned(dbias)) {
-        dim3 grid(grid_x / ILP, grid_y);
-        biasAddRelu_bprop_aligned<T, 4><<<grid, block, 0, stream>>>(
-          y, dy, yfeat, batch_size, dy_gemm, db_scratch, semaphores, dbias);
-      } else {
-        dim3 grid(grid_x, grid_y);
-        biasAddRelu_bprop<T, 4><<<grid, block, 0, stream>>>(
-          y, dy, yfeat, batch_size, dy_gemm, db_scratch, semaphores, dbias);
-      }
+      if (use_bias == 1) {
+        if (activation == 0) { // no activation
+          // bgrad
+          dim3 block(BIAS_RELU_BW_NTHREADS_X, BIAS_RELU_BW_NTHREADS_Y);
+          int grid_x, grid_y;
+          cudaMemsetAsync(semaphores, 0, semaphore_size, stream);
+
+          int block_x = BIAS_RELU_BW_NTHREADS_X;
+          int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
+          get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
+          dim3 grid(grid_x, grid_y);
+          biasAdd_bprop<T, 4><<<grid, block, 0, stream>>>(
+            dy, yfeat, batch_size, db_scratch, semaphores, dbias);
+          // bypass dgrad through reset pointer
+          dy_gemm = dy;
+        } else if (activation == 1) { // relu
+          dim3 block(BIAS_RELU_BW_NTHREADS_X, BIAS_RELU_BW_NTHREADS_Y);
+          int grid_x, grid_y;
+          cudaMemsetAsync(semaphores, 0, semaphore_size, stream);
+
+          if (yfeat % (ILP * BIAS_RELU_BW_NTHREADS_X) == 0 &&
+              is_aligned(y) &&
+              is_aligned(dy) &&
+              is_aligned(dy_gemm) &&
+              is_aligned(dbias)) {
+            int block_x = ILP * BIAS_RELU_BW_NTHREADS_X;
+            int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
+            get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
+            dim3 grid(grid_x, grid_y);
+            biasAddRelu_bprop_aligned<T, 4><<<grid, block, 0, stream>>>(
+              y, dy, yfeat, batch_size, dy_gemm, db_scratch, semaphores, dbias);
+          } else {
+            int block_x = BIAS_RELU_BW_NTHREADS_X;
+            int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
+            get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
+            dim3 grid(grid_x, grid_y);
+            biasAddRelu_bprop<T, 4><<<grid, block, 0, stream>>>(
+              y, dy, yfeat, batch_size, dy_gemm, db_scratch, semaphores, dbias);
+          }
+        } else if (activation == 2) { // sigmoid
+          // activation backward
+          int num_blocks = 0;
+          int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
+          cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, Sigmoid_bprop<T>, BIAS_RELU_FW_NTHREADS, 0);
+          Sigmoid_bprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(dy, y, batch_size, yfeat, dy_gemm);
+
+          // bgrad, from dy_gemm
+          dim3 block(BIAS_RELU_BW_NTHREADS_X, BIAS_RELU_BW_NTHREADS_Y);
+          int grid_x, grid_y;
+          cudaMemsetAsync(semaphores, 0, semaphore_size, stream);
+
+          int block_x = BIAS_RELU_BW_NTHREADS_X;
+          int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
+          get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
+          dim3 grid(grid_x, grid_y);
+          biasAdd_bprop<T, 4><<<grid, block, 0, stream>>>(
+            dy_gemm, yfeat, batch_size, db_scratch, semaphores, dbias);
+        }
+      } else { // no bias below
+        if (activation == 0) {
+          // bypass dgrad through reset pointer
+          dy_gemm = dy;
+        } else if (activation == 1) { // relu
+          int num_blocks = 0;
+          int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
+          cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, Relu_bprop<T>, BIAS_RELU_FW_NTHREADS, 0);
+          Relu_bprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(dy, y, batch_size, yfeat, dy_gemm);
+        } else if (activation == 2) { // sigmoid
+          int num_blocks = 0;
+          int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
+          cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, Sigmoid_bprop<T>, BIAS_RELU_FW_NTHREADS, 0);
+          Sigmoid_bprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(dy, y, batch_size, yfeat, dy_gemm);
+        }
+      }

       cublasStatus_t cublas_status;
       // Call GEMM dgrad
-      cublas_status = mlp_gemm(
+      if (layer > 0 || requires_grad == 1) {
+        cublas_status = mlp_gemm(
           handle,
           CUBLAS_OP_N,
           CUBLAS_OP_N,
...
@@ -760,9 +1237,10 @@ int mlp_bp(
           dx,
           xfeat);
-      if (cublas_status != CUBLAS_STATUS_SUCCESS) {
-        printf("GEMM dgrad failed with %d\n", cublas_status);
-        return 1;
+        if (cublas_status != CUBLAS_STATUS_SUCCESS) {
+          printf("GEMM dgrad failed with %d\n", cublas_status);
+          return 1;
+        }
       }

       // Call GEMM wgrad
...
@@ -801,7 +1279,9 @@ template int mlp_fp<float>(
     int* output_features,
     float** BPtr,
     float* Y,
-    float* reserved_space);
+    float* reserved_space,
+    int use_bias,
+    int activation);

 template int mlp_bp<float>(
     float* X,
...
@@ -816,7 +1296,10 @@ template int mlp_bp<float>(
     float* work_space,
     float* dX,
     float** dwPtr,
-    float** dbPtr);
+    float** dbPtr,
+    bool requires_grad,
+    int use_bias,
+    int activation);

 template int mlp_fp<at::Half>(
     at::Half* X,
...
@@ -827,7 +1310,9 @@ template int mlp_fp<at::Half>(
     int* output_features,
     at::Half** BPtr,
     at::Half* Y,
-    at::Half* reserved_space);
+    at::Half* reserved_space,
+    int use_bias,
+    int activation);

 template int mlp_bp<at::Half>(
     at::Half* X,
...
@@ -842,7 +1327,10 @@ template int mlp_bp<at::Half>(
     at::Half* work_space,
     at::Half* dX,
     at::Half** dwPtr,
-    at::Half** dbPtr);
+    at::Half** dbPtr,
+    bool requires_grad,
+    int use_bias,
+    int activation);

 template int mlp_fp<double>(
     double* X,
...
@@ -853,7 +1341,9 @@ template int mlp_fp<double>(
     int* output_features,
     double** BPtr,
     double* Y,
-    double* reserved_space);
+    double* reserved_space,
+    int use_bias,
+    int activation);

 template int mlp_bp<double>(
     double* X,
...
@@ -868,7 +1358,10 @@ template int mlp_bp<double>(
     double* work_space,
     double* dX,
     double** dwPtr,
-    double** dbPtr);
+    double** dbPtr,
+    bool requires_grad,
+    int use_bias,
+    int activation);

 template size_t get_mlp_bp_workspace_in_bytes<float>(
     int batch_size,
...
csrc/multi_tensor_axpby_kernel.cu
View file @
6e14df49
...
@@ -13,6 +13,17 @@
 #define BLOCK_SIZE 512
 #define ILP 4

+template<typename T>
+__device__ __forceinline__ bool is_aligned(T* p){
+  return ((uint64_t)p) % (ILP * sizeof(T)) == 0;
+}
+
+template<typename T>
+__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
+  typedef typename std::aligned_storage<ILP * sizeof(T), ILP * alignof(T)>::type LT;
+  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
+}
+
 template<typename x_t, typename y_t, typename out_t>
 struct AxpbyFunctor
 {
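The point of load_store is that casting through an aligned_storage type of ILP*sizeof(T) bytes lets the compiler emit one wide load or store (128-bit for float with ILP = 4) instead of four scalar ones. A minimal sketch of the calling pattern, assuming the ILP, is_aligned, and load_store definitions above and the caller-guaranteed alignment and divisibility the kernels check for:

// Copy n floats in 128-bit chunks; caller must guarantee n % ILP == 0
// and 16-byte alignment of both pointers (see is_aligned above).
__global__ void vector_copy_sketch(float* dst, float* src, int n) {
  for (int v = blockIdx.x * blockDim.x + threadIdx.x; v * ILP < n;
       v += gridDim.x * blockDim.x) {
    float buf[ILP];
    load_store(buf, src, 0, v);  // one LDG.128 instead of four LDG.32
    load_store(dst, buf, v, 0);  // one STG.128
  }
}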
...
@@ -43,46 +54,74 @@ struct AxpbyFunctor
     n -= chunk_idx * chunk_size;

-    // Non-divergent exit condition for __syncthreads, not necessary here
-    float xs[ILP];
-    float ys[ILP];
-    for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP)
-    {
-#pragma unroll
-      for (int ii = 0; ii < ILP; ii++)
-      {
-        xs[ii] = 0;
-        ys[ii] = 0;
-        int i = i_start + threadIdx.x + ii * blockDim.x;
-        if (i < n && i < chunk_size)
-        {
-          xs[ii] = static_cast<float>(x[i]);
-          ys[ii] = static_cast<float>(y[i]);
-        }
-      }
-
-#pragma unroll
-      for (int ii = 0; ii < ILP; ii++)
-      {
-        int i = i_start + threadIdx.x + ii * blockDim.x;
-        if (i < n && i < chunk_size)
-        {
-          out[i] = static_cast<out_t>(a * xs[ii] + b * ys[ii]);
-          bool finite = true;
-          if (arg_to_check == -1)
-            finite = (isfinite(xs[ii]) && isfinite(ys[ii]));
-          if (arg_to_check == 0)
-            finite = isfinite(xs[ii]);
-          if (arg_to_check == 1)
-            finite = isfinite(ys[ii]);
-          if (!finite)
-            *noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
-        }
-      }
-    }
+    bool finite = true;
+    x_t r_x[ILP];
+    y_t r_y[ILP];
+    out_t r_out[ILP];
+
+    // to make things simple, we put aligned case in a different code path
+    if (n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x) && is_aligned(y) && is_aligned(out))
+    {
+      for (int i_start = threadIdx.x; i_start * ILP < n && i_start * ILP < chunk_size; i_start += blockDim.x)
+      {
+        // load
+        load_store(r_x, x, 0, i_start);
+        load_store(r_y, y, 0, i_start);
+#pragma unroll
+        for (int ii = 0; ii < ILP; ii++)
+        {
+          r_out[ii] = a * static_cast<float>(r_x[ii]) + b * static_cast<float>(r_y[ii]);
+          if (arg_to_check == -1)
+            finite = finite && (isfinite(r_x[ii]) && isfinite(r_y[ii]));
+          if (arg_to_check == 0)
+            finite = finite && isfinite(r_x[ii]);
+          if (arg_to_check == 1)
+            finite = finite && isfinite(r_y[ii]);
+        }
+        // store
+        load_store(out, r_out, i_start, 0);
+      }
+    }
+    else
+    {
+      // Non-divergent exit condition for __syncthreads, not necessary here
+      for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP)
+      {
+#pragma unroll
+        for (int ii = 0; ii < ILP; ii++)
+        {
+          r_x[ii] = 0;
+          r_y[ii] = 0;
+          int i = i_start + threadIdx.x + ii * blockDim.x;
+          if (i < n && i < chunk_size)
+          {
+            r_x[ii] = x[i];
+            r_y[ii] = y[i];
+          }
+        }
+
+#pragma unroll
+        for (int ii = 0; ii < ILP; ii++)
+        {
+          r_out[ii] = a * static_cast<float>(r_x[ii]) + b * static_cast<float>(r_y[ii]);
+          if (arg_to_check == -1)
+            finite = finite && (isfinite(r_x[ii]) && isfinite(r_y[ii]));
+          if (arg_to_check == 0)
+            finite = finite && isfinite(r_x[ii]);
+          if (arg_to_check == 1)
+            finite = finite && isfinite(r_y[ii]);
+        }
+
+        // see note in multi_tensor_scale_kernel.cu
+#pragma unroll
+        for (int ii = 0; ii < ILP; ii++)
+        {
+          int i = i_start + threadIdx.x + ii * blockDim.x;
+          if (i < n && i < chunk_size)
+            out[i] = r_out[ii];
+        }
+      }
+    }
+
+    if (!finite)
+      *noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
   }
 };
...
...
...
csrc/multi_tensor_l2norm_kernel.cu
View file @
6e14df49
...
@@ -13,6 +13,17 @@
...
@@ -13,6 +13,17 @@
#define BLOCK_SIZE 512
#define BLOCK_SIZE 512
#define ILP 4
#define ILP 4
template
<
typename
T
>
__device__
__forceinline__
bool
is_aligned
(
T
*
p
){
return
((
uint64_t
)
p
)
%
(
ILP
*
sizeof
(
T
))
==
0
;
}
template
<
typename
T
>
__device__
__forceinline__
void
load_store
(
T
*
dst
,
T
*
src
,
int
dst_offset
,
int
src_offset
){
typedef
typename
std
::
aligned_storage
<
ILP
*
sizeof
(
T
),
ILP
*
alignof
(
T
)
>::
type
LT
;
((
LT
*
)
dst
)[
dst_offset
]
=
((
LT
*
)
src
)[
src_offset
];
}
template
<
typename
x_t
>
template
<
typename
x_t
>
struct
L2NormFunctor
struct
L2NormFunctor
{
{
...
@@ -41,22 +52,44 @@ struct L2NormFunctor
...
@@ -41,22 +52,44 @@ struct L2NormFunctor
__shared__
float
s_vals
[
512
];
__shared__
float
s_vals
[
512
];
float
vals
[
ILP
];
// = {0}; // this probably works too but I want to be sure...
float
vals
[
ILP
];
// = {0}; // this probably works too but I want to be sure...
x_t
r_x
[
ILP
];
for
(
int
i
=
0
;
i
<
ILP
;
i
++
)
for
(
int
i
=
0
;
i
<
ILP
;
i
++
)
{
vals
[
i
]
=
0.
f
;
vals
[
i
]
=
0.
f
;
r_x
[
i
]
=
0
;
}
for
(
int
i_start
=
0
;
i_start
<
n
&&
i_start
<
chunk_size
;
i_start
+=
blockDim
.
x
*
ILP
)
// to make things simple, we put aligned case in a different code path
if
(
n
%
ILP
==
0
&&
chunk_size
%
ILP
==
0
&&
is_aligned
(
x
))
{
{
#pragma unroll
for
(
int
i_start
=
threadIdx
.
x
;
i_start
*
ILP
<
n
&&
i_start
*
ILP
<
chunk_size
;
i_start
+=
blockDim
.
x
)
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
{
int
i
=
i_start
+
threadIdx
.
x
+
ii
*
blockDim
.
x
;
// load
if
(
i
<
n
&&
i
<
chunk_size
)
load_store
(
r_x
,
x
,
0
,
i_start
);
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
{
float
next
=
static_cast
<
float
>
(
x
[
i
]);
float
next
=
static_cast
<
float
>
(
r_
x
[
i
i
]);
vals
[
ii
]
+=
next
*
next
;
vals
[
ii
]
+=
next
*
next
;
}
}
}
}
}
}
else
{
for
(
int
i_start
=
0
;
i_start
<
n
&&
i_start
<
chunk_size
;
i_start
+=
blockDim
.
x
*
ILP
)
{
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
int
i
=
i_start
+
threadIdx
.
x
+
ii
*
blockDim
.
x
;
if
(
i
<
n
&&
i
<
chunk_size
)
{
float
next
=
static_cast
<
float
>
(
x
[
i
]);
vals
[
ii
]
+=
next
*
next
;
}
}
}
}
float
val
=
0.
f
;
float
val
=
0.
f
;
for
(
int
i
=
0
;
i
<
ILP
;
i
++
)
for
(
int
i
=
0
;
i
<
ILP
;
i
++
)
...
@@ -104,22 +137,44 @@ struct MaxNormFunctor
...
@@ -104,22 +137,44 @@ struct MaxNormFunctor
__shared__
float
s_vals
[
512
];
__shared__
float
s_vals
[
512
];
float
vals
[
ILP
];
// = {0}; // this probably works too but I want to be sure...
float
vals
[
ILP
];
// = {0}; // this probably works too but I want to be sure...
x_t
r_x
[
ILP
];
for
(
int
i
=
0
;
i
<
ILP
;
i
++
)
for
(
int
i
=
0
;
i
<
ILP
;
i
++
)
{
vals
[
i
]
=
0.
f
;
vals
[
i
]
=
0.
f
;
r_x
[
i
]
=
0
;
}
for
(
int
i_start
=
0
;
i_start
<
n
&&
i_start
<
chunk_size
;
i_start
+=
blockDim
.
x
*
ILP
)
// to make things simple, we put aligned case in a different code path
if
(
n
%
ILP
==
0
&&
chunk_size
%
ILP
==
0
&&
is_aligned
(
x
))
{
{
#pragma unroll
for
(
int
i_start
=
threadIdx
.
x
;
i_start
*
ILP
<
n
&&
i_start
*
ILP
<
chunk_size
;
i_start
+=
blockDim
.
x
)
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
{
int
i
=
i_start
+
threadIdx
.
x
+
ii
*
blockDim
.
x
;
// load
if
(
i
<
n
&&
i
<
chunk_size
)
load_store
(
r_x
,
x
,
0
,
i_start
);
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
{
float
next
=
static_cast
<
float
>
(
x
[
i
]);
float
next
=
static_cast
<
float
>
(
r_
x
[
i
i
]);
vals
[
ii
]
=
fmaxf
(
fabsf
(
vals
[
ii
]),
fabsf
(
next
));
vals
[
ii
]
=
fmaxf
(
fabsf
(
vals
[
ii
]),
fabsf
(
next
));
}
}
}
}
}
}
else
{
for
(
int
i_start
=
0
;
i_start
<
n
&&
i_start
<
chunk_size
;
i_start
+=
blockDim
.
x
*
ILP
)
{
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
int
i
=
i_start
+
threadIdx
.
x
+
ii
*
blockDim
.
x
;
if
(
i
<
n
&&
i
<
chunk_size
)
{
float
next
=
static_cast
<
float
>
(
x
[
i
]);
vals
[
ii
]
=
fmaxf
(
fabsf
(
vals
[
ii
]),
fabsf
(
next
));
}
}
}
}
float
val
=
0.
f
;
float
val
=
0.
f
;
for
(
int
i
=
0
;
i
<
ILP
;
i
++
)
for
(
int
i
=
0
;
i
<
ILP
;
i
++
)
...
...
csrc/multi_tensor_lamb.cu
View file @
6e14df49
...
@@ -13,6 +13,17 @@
 #define BLOCK_SIZE 512
 #define ILP 4

+template<typename T>
+__device__ __forceinline__ bool is_aligned(T* p){
+  return ((uint64_t)p) % (ILP * sizeof(T)) == 0;
+}
+
+template<typename T>
+__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
+  typedef typename std::aligned_storage<ILP * sizeof(T), ILP * alignof(T)>::type LT;
+  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
+}
+
 typedef enum{
   MOMENT_MODE_0   =0, // L2 regularization mode
   MOMENT_MODE_1   =1  // Decoupled weight decay mode
...
@@ -68,71 +79,149 @@ struct LAMBStage1Functor
     n -= chunk_idx * chunk_size;

+    MATH_T r_g[ILP];
+    MATH_T r_p[ILP];
+    MATH_T r_m[ILP];
+    MATH_T r_v[ILP];
+    // to make things simple, we put aligned case in a different code path
+    if (n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(g) && is_aligned(p) && is_aligned(m) && is_aligned(v))
+    {
+      T l_g[ILP];
+      T l_p[ILP];
+      T l_m[ILP];
+      T l_v[ILP];
+      for (int i_start = threadIdx.x; i_start * ILP < n && i_start * ILP < chunk_size; i_start += blockDim.x)
+      {
+        // load
+        load_store(l_g, g, 0, i_start);
+        if (decay != 0)
+          load_store(l_p, p, 0, i_start);
+        load_store(l_m, m, 0, i_start);
+        load_store(l_v, v, 0, i_start);
+        // unpack
+#pragma unroll
+        for (int ii = 0; ii < ILP; ii++)
+        {
+          r_g[ii] = l_g[ii];
+          // special ?optimization? for lamb stage 1
+          if (decay == 0) {
+            r_p[ii] = MATH_T(0);
+          }
+          else {
+            r_p[ii] = l_p[ii];
+          }
+          r_m[ii] = l_m[ii];
+          r_v[ii] = l_v[ii];
+        }
+#pragma unroll
+        for (int ii = 0; ii < ILP; ii++)
+        {
+          if (mode == MOMENT_MODE_0) {
+            MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
+            // L2 on scaled grad
+            scaled_grad = scaled_grad + decay * r_p[ii];
+            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
+            r_v[ii] = r_v[ii] * beta2 + (1 - beta2) * scaled_grad * scaled_grad;
+            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
+            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
+            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
+            r_p[ii] = next_m_unbiased / denom;
+          }
+          else {
+            MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
+            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
+            r_v[ii] = r_v[ii] * beta2 + (1 - beta2) * scaled_grad * scaled_grad;
+            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
+            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
+            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
+            r_p[ii] = (next_m_unbiased / denom) + (decay * r_p[ii]);
+          }
+        }
+#pragma unroll
+        for (int ii = 0; ii < ILP; ii++)
+        {
+          l_p[ii] = r_p[ii];
+          l_m[ii] = r_m[ii];
+          l_v[ii] = r_v[ii];
+        }
+        // store
+        load_store(g, l_p, i_start, 0);
+        load_store(m, l_m, i_start, 0);
+        load_store(v, l_v, i_start, 0);
+      }
+    }
+    else
+    {
     // see note in multi_tensor_scale_kernel.cu
     for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP)
     {
-      MATH_T r_g[ILP];
-      MATH_T r_p[ILP];
-      MATH_T r_m[ILP];
-      MATH_T r_v[ILP];
 #pragma unroll
       for (int ii = 0; ii < ILP; ii++)
       {
         int i = i_start + threadIdx.x + ii * blockDim.x;
         if (i < n && i < chunk_size)
         {
           r_g[ii] = g[i];
           // special ?optimization? for lamb stage 1
           if (decay == 0) {
             r_p[ii] = MATH_T(0);
           }
           else {
             r_p[ii] = p[i];
           }
           r_m[ii] = m[i];
           r_v[ii] = v[i];
         }
         else {
           r_g[ii] = MATH_T(0);
           r_p[ii] = MATH_T(0);
           r_m[ii] = MATH_T(0);
           r_v[ii] = MATH_T(0);
         }
       }
 #pragma unroll
       for (int ii = 0; ii < ILP; ii++)
       {
         if (mode == MOMENT_MODE_0) {
           MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
           // L2 on scaled grad
           scaled_grad = scaled_grad + decay * r_p[ii];
           r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
           r_v[ii] = r_v[ii] * beta2 + (1 - beta2) * scaled_grad * scaled_grad;
           MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
           MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
           MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
           r_p[ii] = next_m_unbiased / denom;
         }
         else {
           MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
           r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
           r_v[ii] = r_v[ii] * beta2 + (1 - beta2) * scaled_grad * scaled_grad;
           MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
           MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
           MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
           r_p[ii] = (next_m_unbiased / denom) + (decay * r_p[ii]);
         }
       }
 #pragma unroll
       for (int ii = 0; ii < ILP; ii++)
       {
         int i = i_start + threadIdx.x + ii * blockDim.x;
         if (i < n && i < chunk_size)
         {
           g[i] = r_p[ii];
           m[i] = r_m[ii];
           v[i] = r_v[ii];
         }
       }
     }
+    }
   }
...
@@ -173,34 +262,58 @@ struct LAMBStage2Functor
     n -= chunk_idx * chunk_size;

+    // to make things simple, we put aligned case in a different code path
+    if (n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(p) && is_aligned(update))
+    {
+      T r_p[ILP];
+      T r_update[ILP];
+      for (int i_start = threadIdx.x; i_start * ILP < n && i_start * ILP < chunk_size; i_start += blockDim.x)
+      {
+        // load
+        load_store(r_p, p, 0, i_start);
+        load_store(r_update, update, 0, i_start);
+#pragma unroll
+        for (int ii = 0; ii < ILP; ii++)
+        {
+          r_p[ii] = static_cast<MATH_T>(r_p[ii]) - (ratio * static_cast<MATH_T>(r_update[ii]));
+        }
+        load_store(p, r_p, i_start, 0);
+      }
+    }
+    else
+    {
     for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP)
     {
       MATH_T r_p[ILP];
       MATH_T r_update[ILP];
 #pragma unroll
       for (int ii = 0; ii < ILP; ii++)
       {
         int i = i_start + threadIdx.x + ii * blockDim.x;
         if (i < n && i < chunk_size)
         {
           r_p[ii] = p[i];
           r_update[ii] = update[i];
         }
       }
 #pragma unroll
       for (int ii = 0; ii < ILP; ii++)
       {
         r_p[ii] = r_p[ii] - (ratio * r_update[ii]);
       }
 #pragma unroll
       for (int ii = 0; ii < ILP; ii++)
       {
         int i = i_start + threadIdx.x + ii * blockDim.x;
         if (i < n && i < chunk_size)
         {
           p[i] = r_p[ii];
         }
       }
     }
+    }
   }
 };
...
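For reference, the update the two LAMB functors implement, restated from the code above. The symbols are editorial: c is clipped_global_grad_norm, lambda is decay, and r is the trust ratio computed on the host between the two stages.

\hat{g} = g / c \qquad (\text{MOMENT\_MODE\_0 then sets } \hat{g} \leftarrow \hat{g} + \lambda p)
m \leftarrow \beta_1 m + \beta_3 \hat{g}
v \leftarrow \beta_2 v + (1 - \beta_2)\,\hat{g}^{\,2}
u = \frac{m / \beta_1^{\mathrm{corr}}}{\sqrt{v / \beta_2^{\mathrm{corr}}} + \epsilon}
\qquad (\text{MOMENT\_MODE\_1 instead adds } \lambda p \text{ to } u)
p \leftarrow p - r\,u \qquad (\text{stage 2})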
...
csrc/multi_tensor_scale_kernel.cu
View file @
6e14df49
...
@@ -15,6 +15,17 @@
 #define BLOCK_SIZE 512
 #define ILP 4

+template<typename T>
+__device__ __forceinline__ bool is_aligned(T* p){
+  return ((uint64_t)p) % (ILP * sizeof(T)) == 0;
+}
+
+template<typename T>
+__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
+  typedef typename std::aligned_storage<ILP * sizeof(T), ILP * alignof(T)>::type LT;
+  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
+}
+
 template<typename in_t, typename out_t>
 struct ScaleFunctor
 {
...
@@ -34,44 +45,68 @@ struct ScaleFunctor
     in_t* in = (in_t*)tl.addresses[0][tensor_loc];
     in += chunk_idx * chunk_size;

     out_t* out = (out_t*)tl.addresses[1][tensor_loc];
     out += chunk_idx * chunk_size;

     n -= chunk_idx * chunk_size;

+    bool finite = true;
+    in_t r_in[ILP];
+    out_t r_out[ILP];
+
+    // to make things simple, we put aligned case in a different code path
+    if (n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(in) && is_aligned(out))
+    {
+      for (int i_start = threadIdx.x; i_start * ILP < n && i_start * ILP < chunk_size; i_start += blockDim.x)
+      {
+        // load
+        load_store(r_in, in, 0, i_start);
+#pragma unroll
+        for (int ii = 0; ii < ILP; ii++)
+        {
+          r_out[ii] = static_cast<float>(r_in[ii]) * scale;
+          finite = finite && isfinite(r_in[ii]);
+        }
+        // store
+        load_store(out, r_out, i_start, 0);
+      }
+    }
+    else
+    {
     // Non-divergent exit condition for __syncthreads, not necessary here
-    float incoming_vals[ILP];
     for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP)
     {
 #pragma unroll
       for (int ii = 0; ii < ILP; ii++)
       {
-        incoming_vals[ii] = 0;
+        r_in[ii] = 0;
         int i = i_start + threadIdx.x + ii * blockDim.x;
         if (i < n && i < chunk_size)
-          incoming_vals[ii] = static_cast<float>(in[i]);
+          r_in[ii] = in[i];
       }

       // note for clarification to future michael:
       // From a pure memory dependency perspective, there's likely no point unrolling
       // the write loop, since writes just fire off once their LDGs arrive.
       // Put another way, the STGs are dependent on the LDGs, but not on each other.
       // There is still compute ILP benefit from unrolling the loop though.
 #pragma unroll
       for (int ii = 0; ii < ILP; ii++)
       {
+        r_out[ii] = static_cast<float>(r_in[ii]) * scale;
+        finite = finite && isfinite(r_in[ii]);
+      }
+#pragma unroll
+      for (int ii = 0; ii < ILP; ii++)
+      {
         int i = i_start + threadIdx.x + ii * blockDim.x;
         if (i < n && i < chunk_size)
-        {
-          out[i] = static_cast<out_t>(incoming_vals[ii] * scale);
-          if (!isfinite(incoming_vals[ii]))
-            *noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
-        }
+          out[i] = r_out[ii];
       }
     }
+    }
+    if (!finite)
+      *noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
   }
 };
...
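The `finite` flag accumulated above is published through a deliberately racy write to *noop_gmem: any thread that saw an inf/NaN sets it to 1, so the host only needs one int to decide whether to skip the step. A hedged sketch of the consuming side (function name and buffer are hypothetical; `noop_gmem` is a single device int zeroed before the launch):

#include <cuda_runtime.h>
#include <cstdio>

bool step_skipped(const int* noop_gmem, cudaStream_t stream) {
  int noop_host = 0;
  cudaMemcpyAsync(&noop_host, noop_gmem, sizeof(int),
                  cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);  // the racy device writes all land before this
  if (noop_host) std::printf("inf/nan seen: skipping optimizer step\n");
  return noop_host != 0;
}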
...
setup.py
View file @
6e14df49
...
@@ -177,7 +177,13 @@ if "--cuda_ext" in sys.argv:
                             '-O3',
                             '--use_fast_math'] + version_dependent_macros}))
 else:
-    print("INFO: Skipping FusedLayerNorm extension.")
+    print("INFO: Building FusedLayerNorm extension.")
+    ext_modules.append(
+        CUDAExtension(name='fused_layer_norm_cuda',
+                      sources=['csrc/layer_norm_cuda.cpp',
+                               'csrc/hip/layer_norm_hip_kernel.hip'],
+                      extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
+                                          'nvcc': []}))

 if not is_rocm_pytorch:
     ext_modules.append(
...
tests/L0/run_mlp/test_mlp.py
View file @
6e14df49
...
@@ -51,6 +51,116 @@ class TestMLP(unittest.TestCase):
             ref_mlp[0].bias.grad.detach().cpu().numpy(),
             atol=1e-7, rtol=1e-5)

+    def test_no_bias(self):
+        for use_activation in ['none', 'relu', 'sigmoid']:
+            mlp = MLP(mlp_sizes, bias=False, activation=use_activation).cuda()
+
+            mlp_layers = []
+            for i in range(mlp.num_layers):
+                linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1], bias=False)
+                mlp.weights[i].data.copy_(linear.weight)
+                mlp_layers.append(linear)
+                if use_activation == 'relu':
+                    mlp_layers.append(nn.ReLU(inplace=True))
+                if use_activation == 'sigmoid':
+                    mlp_layers.append(nn.Sigmoid())
+
+            ref_mlp = nn.Sequential(*mlp_layers).cuda()
+
+            test_input = torch.empty(batch_size, mlp_sizes[0], device="cuda").uniform_(-1., 1.).requires_grad_()
+            ref_input = test_input.clone().detach().requires_grad_()
+            mlp_out = mlp(test_input)
+            ref_out = ref_mlp(ref_input)
+            np.testing.assert_allclose(
+                mlp_out.detach().cpu().numpy(),
+                ref_out.detach().cpu().numpy(),
+                atol=1e-7, rtol=1e-5)
+
+            # Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
+            mlp_out.mean().mul(10.).backward()
+            ref_out.mean().mul(10.).backward()
+            np.testing.assert_allclose(
+                test_input.grad.detach().cpu().numpy(),
+                ref_input.grad.detach().cpu().numpy(),
+                atol=0, rtol=100)
+            np.testing.assert_allclose(
+                mlp.weights[0].grad.detach().cpu().numpy(),
+                ref_mlp[0].weight.grad.detach().cpu().numpy(),
+                atol=1e-7, rtol=100)
+
+    def test_with_bias(self):
+        for use_activation in ['none', 'relu', 'sigmoid']:
+            mlp = MLP(mlp_sizes, bias=True, activation=use_activation).cuda()
+
+            mlp_layers = []
+            for i in range(mlp.num_layers):
+                linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1], bias=True)
+                mlp.weights[i].data.copy_(linear.weight)
+                mlp.biases[i].data.copy_(linear.bias)
+                mlp_layers.append(linear)
+                if use_activation == 'relu':
+                    mlp_layers.append(nn.ReLU(inplace=True))
+                if use_activation == 'sigmoid':
+                    mlp_layers.append(nn.Sigmoid())
+
+            ref_mlp = nn.Sequential(*mlp_layers).cuda()
+
+            test_input = torch.empty(batch_size, mlp_sizes[0], device="cuda").uniform_(-1., 1.).requires_grad_()
+            ref_input = test_input.clone().detach().requires_grad_()
+            mlp_out = mlp(test_input)
+            ref_out = ref_mlp(ref_input)
+            np.testing.assert_allclose(
+                mlp_out.detach().cpu().numpy(),
+                ref_out.detach().cpu().numpy(),
+                atol=1e-7, rtol=1e-5)
+
+            # Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
+            mlp_out.mean().mul(10.).backward()
+            ref_out.mean().mul(10.).backward()
+            np.testing.assert_allclose(
+                test_input.grad.detach().cpu().numpy(),
+                ref_input.grad.detach().cpu().numpy(),
+                atol=0, rtol=1)
+            np.testing.assert_allclose(
+                mlp.weights[0].grad.detach().cpu().numpy(),
+                ref_mlp[0].weight.grad.detach().cpu().numpy(),
+                atol=1e-7, rtol=1)
+            np.testing.assert_allclose(
+                mlp.biases[0].grad.detach().cpu().numpy(),
+                ref_mlp[0].bias.grad.detach().cpu().numpy(),
+                atol=1e-7, rtol=1e-5)
+
+    def test_no_grad(self):
+        mlp = MLP(mlp_sizes).cuda()
+
+        mlp_layers = []
+        for i in range(mlp.num_layers):
+            linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1])
+            mlp.weights[i].data.copy_(linear.weight)
+            mlp.biases[i].data.copy_(linear.bias)
+            mlp_layers.append(linear)
+            mlp_layers.append(nn.ReLU(inplace=True))
+
+        ref_mlp = nn.Sequential(*mlp_layers).cuda()
+
+        test_input = torch.empty(batch_size, mlp_sizes[0], device="cuda").uniform_(-1., 1.)
+        ref_input = test_input.clone().detach()
+        mlp_out = mlp(test_input)
+        ref_out = ref_mlp(ref_input)
+        np.testing.assert_allclose(
+            mlp_out.detach().cpu().numpy(),
+            ref_out.detach().cpu().numpy(),
+            atol=1e-7, rtol=1e-5)
+
+        # Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
+        mlp_out.mean().mul(10.).backward()
+        ref_out.mean().mul(10.).backward()
+        np.testing.assert_allclose(
+            mlp.weights[0].grad.detach().cpu().numpy(),
+            ref_mlp[0].weight.grad.detach().cpu().numpy(),
+            atol=1e-7, rtol=1e-5)
+
     def test_performance_half(self):
         mlp = MLP(mlp_sizes).cuda().half()
...