OpenDAS / apex / Commits / 6e14df49

Commit 6e14df49, authored May 08, 2020 by rohithkrn
    Merge remote-tracking branch 'rocm_up/master' into apex_amp_bfp16

Parents: c7fd532c 2d0f9cf2
Changes: 14
Showing 14 changed files with 1438 additions and 376 deletions (+1438, -376)
apex/amp/scaler.py  (+12, -3)
apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu  (+82, -31)
apex/contrib/csrc/xentropy/xentropy_kernel.cu  (+178, -67)
apex/contrib/multihead_attn/self_multihead_attn_func.py  (+1, -1)
apex/mlp/mlp.py  (+28, -19)
csrc/layer_norm_cuda_kernel.cu  (+10, -4)
csrc/mlp.cpp  (+41, -16)
csrc/mlp_cuda.cu  (+593, -100)
csrc/multi_tensor_axpby_kernel.cu  (+66, -27)
csrc/multi_tensor_l2norm_kernel.cu  (+67, -12)
csrc/multi_tensor_lamb.cu  (+183, -70)
csrc/multi_tensor_scale_kernel.cu  (+60, -25)
setup.py  (+7, -1)
tests/L0/run_mlp/test_mlp.py  (+110, -0)
apex/amp/scaler.py

@@ -6,12 +6,18 @@ from itertools import product
 def scale_check_overflow_python(model_grad, master_grad, scale, check_overflow=False):
     # Exception handling for 18.04 compatibility
     if check_overflow:
-        cpu_sum = float(model_grad.float().sum())
+        if model_grad.is_sparse:
+            cpu_sum = float(model_grad.float()._values().sum())
+        else:
+            cpu_sum = float(model_grad.float().sum())
         if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
             return True

     if master_grad is not model_grad: # copy_ probably internally short-circuits this
-        master_grad.copy_(model_grad)
+        if model_grad.is_sparse:
+            master_grad.copy_(model_grad.to_dense())
+        else:
+            master_grad.copy_(model_grad)
     if scale != 1.0:
         master_grad.mul_(scale)
     return False

@@ -19,7 +25,10 @@ def scale_check_overflow_python(model_grad, master_grad, scale, check_overflow=F
 def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, a, b, check_overflow=False):
     # Exception handling for 18.04 compatibility
     if check_overflow:
-        cpu_sum = float(model_grad.float().sum())
+        if model_grad.is_sparse:
+            cpu_sum = float(model_grad.float()._values().sum())
+        else:
+            cpu_sum = float(model_grad.float().sum())
         if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
             return True
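The hunk above adds a sparse-gradient path to the overflow check. A minimal, hedged sketch (not part of the commit) that restates the new function from the diff and exercises it with a sparse COO gradient containing an inf:

# Sketch only: the function body is restated from the diff above; the test values are illustrative.
import torch

def scale_check_overflow_python(model_grad, master_grad, scale, check_overflow=False):
    if check_overflow:
        if model_grad.is_sparse:
            cpu_sum = float(model_grad.float()._values().sum())
        else:
            cpu_sum = float(model_grad.float().sum())
        if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
            return True
    if master_grad is not model_grad:
        if model_grad.is_sparse:
            master_grad.copy_(model_grad.to_dense())
        else:
            master_grad.copy_(model_grad)
    if scale != 1.0:
        master_grad.mul_(scale)
    return False

# A sparse gradient holding an inf is now caught via _values() instead of failing on .sum().
indices = torch.tensor([[0, 1]])
values = torch.tensor([1.0, float('inf')])
sparse_grad = torch.sparse_coo_tensor(indices, values, (4,))
master = torch.zeros(4)
print(scale_check_overflow_python(sparse_grad, master, 2.0, check_overflow=True))  # True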
apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu

@@ -14,6 +14,17 @@
 #define BLOCK_SIZE 512
 #define ILP 4

+template<typename T>
+__device__ __forceinline__ bool is_aligned(T* p){
+  return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
+}
+
+template<typename T>
+__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
+  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
+  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
+}
+
 #include "type_shim.h"

 typedef enum{

@@ -99,24 +110,64 @@ struct AdamFunctor
        T incoming_v[ILP];
        T incoming_g[ILP];

+       // to make things simple, we put aligned case in a different code path
+       if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(p) && is_aligned(m) && is_aligned(v) && is_aligned(g) && is_aligned(p_copy)) {
+         for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x) {
+           // load
+           GRAD_T tmp_g[ILP];
+           load_store(incoming_p, p, 0, i_start);
+           load_store(incoming_m, m, 0, i_start);
+           load_store(incoming_v, v, 0, i_start);
+           load_store(tmp_g, g, 0, i_start);
+#pragma unroll
+           for(int ii = 0; ii < ILP; ii++) {
+             incoming_g[ii] = static_cast<T>(tmp_g[ii]);
+             T scaled_grad = incoming_g[ii]/grad_scale;
+             incoming_m[ii] = b1*incoming_m[ii] + (1-b1)*scaled_grad;
+             incoming_v[ii] = b2*incoming_v[ii] + (1-b2)*scaled_grad*scaled_grad;
+             float denom;
+             if (mode == ADAM_MODE_0)
+               denom = sqrtf(incoming_v[ii] + eps);
+             else // Mode 1
+               denom = sqrtf(incoming_v[ii]) + eps;
+             float update = (incoming_m[ii]/denom) + (decay*incoming_p[ii]);
+             incoming_p[ii] = incoming_p[ii] - (step_size*update);
+             if (DEPTH == 5)  tmp_g[ii] = static_cast<GRAD_T>(incoming_p[ii]);
+           }
+           load_store(p, incoming_p, i_start, 0);
+           load_store(m, incoming_m, i_start, 0);
+           load_store(v, incoming_v, i_start, 0);
+           if (DEPTH == 5) load_store(p_copy, tmp_g, i_start, 0);
+         }
+       }
+       else {
         for(int i_start = 0;
             i_start < n && i_start < chunk_size;
             i_start += blockDim.x*ILP) {

#pragma unroll
           for(int ii = 0; ii < ILP; ii++) {
             incoming_p[ii] = 0;
             incoming_m[ii] = 0;
             incoming_v[ii] = 0;
             incoming_g[ii] = 0;

             int i = i_start + threadIdx.x + ii*blockDim.x;
             if (i < n && i < chunk_size) {
               incoming_p[ii] = p[i];
               incoming_m[ii] = m[i];
               incoming_v[ii] = v[i];
               incoming_g[ii] = static_cast<T>(g[i]);
             }
           }

           // note for clarification to future michael:

@@ -124,24 +175,25 @@ struct AdamFunctor
           // the write loop, since writes just fire off once their LDGs arrive.
           // Put another way, the STGs are dependent on the LDGs, but not on each other.
           // There is still compute ILP benefit from unrolling the loop though.
#pragma unroll
           for(int ii = 0; ii < ILP; ii++) {
             int j = i_start + threadIdx.x + ii*blockDim.x;
             if (j < n && j < chunk_size) {
               T scaled_grad = incoming_g[ii]/grad_scale;
               m[j] = b1*incoming_m[ii] + (1-b1)*scaled_grad;
               v[j] = b2*incoming_v[ii] + (1-b2)*scaled_grad*scaled_grad;
               float denom;
               if (mode == ADAM_MODE_0)
                 denom = sqrtf(v[j] + eps);
               else // Mode 1
                 denom = sqrtf(v[j]) + eps;
               float update = (m[j]/denom) + (decay*incoming_p[ii]);
               p[j] = incoming_p[ii] - (step_size*update);
               if (DEPTH == 5)  p_copy[j] = (GRAD_T) p[j];
             }
           }
         }
+       }
      }
  }
};

@@ -332,4 +384,3 @@ void fused_adam_cuda_mt(
  }
  THCudaCheck(cudaGetLastError());
}
apex/contrib/csrc/xentropy/xentropy_kernel.cu

/**
 * From PyTorch:
 *
 * Copyright (c) 2016-     Facebook, Inc            (Adam Paszke)
 * Copyright (c) 2014-     Facebook, Inc            (Soumith Chintala)
 * Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)

@@ -10,54 +10,54 @@
 * Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
 * Copyright (c) 2006      Idiap Research Institute (Samy Bengio)
 * Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
 *
 * From Caffe2:
 *
 * Copyright (c) 2016-present, Facebook Inc. All rights reserved.
 *
 * All contributions by Facebook:
 * Copyright (c) 2016 Facebook Inc.
 *
 * All contributions by Google:
 * Copyright (c) 2015 Google Inc.
 * All rights reserved.
 *
 * All contributions by Yangqing Jia:
 * Copyright (c) 2015 Yangqing Jia
 * All rights reserved.
 *
 * All contributions from Caffe:
 * Copyright(c) 2013, 2014, 2015, the respective contributors
 * All rights reserved.
 *
 * All other contributions:
 * Copyright(c) 2015, 2016 the respective contributors
 * All rights reserved.
 *
 * Caffe2 uses a copyright model similar to Caffe: each contributor holds
 * copyright over their contributions to Caffe2. The project versioning records
 * all such contribution and copyright details. If a contributor wants to further
 * mark their specific copyright on a particular contribution, they should
 * indicate their copyright solely in the commit message of the change when it is
 * committed.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
 *    and IDIAP Research Institute nor the names of its contributors may be
 *    used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

@@ -70,7 +70,6 @@
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>

@@ -84,6 +83,8 @@
#include "type_shim.h"
#include "compat.h"

+#define ALIGN_BYTES 16
+
using Tensor = at::Tensor;
using TensorList = at::TensorList;
using ScalarType = at::ScalarType;

@@ -123,7 +124,7 @@ const int max_threads = 1024;
inline dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) {
  uint64_t block_size = 1;
  uint64_t max_block_size = std::min(dim_size / ILP, static_cast<uint64_t>(max_threads));
-  while (block_size < max_block_size) block_size *= 2;
+  while (block_size < (max_block_size/2)) block_size *= 2;
  // Launch at least a single warp - the kernel assumes that.
  block_size = std::max(block_size, static_cast<uint64_t>(32));
  return dim3(block_size);

@@ -287,29 +288,40 @@ blockReduce(AccumT* smem,
template <template<typename, typename> class Reduction, int ILP, typename T, typename AccumT>
__device__ __forceinline__ AccumT
-ilpReduce(T* data,
+ilpReduce(int shift,
+          T* data,
          int size,
          const Reduction<T, AccumT>& r,
          AccumT defaultVal)
{
+  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LoadT;
  AccumT threadVal = defaultVal;
  int offset = threadIdx.x;

+  // shift and do 1
+  if(shift > 0){
+    data -= shift;
+    size += shift;
+    if(threadIdx.x >= shift){
+      threadVal = r(threadVal, data[offset]);
+    }
+    size -= blockDim.x;
+    data += blockDim.x;
+  }
  int last = size % (ILP * blockDim.x);

+  T v[ILP];
+  LoadT* value = reinterpret_cast<LoadT*>(&v);
+
  // Body (unroll by ILP times)
-  for (; offset < size - last; offset += blockDim.x * ILP) {
-    T tmp[ILP];
-
-#pragma unroll
-    for (int j = 0; j < ILP; ++j)
-      tmp[j] = data[offset + j * blockDim.x];
-
-#pragma unroll
-    for (int j = 0; j < ILP; ++j)
-      threadVal = r(threadVal, tmp[j]);
+  for (; offset * ILP < (size - last); offset += blockDim.x) {
+    *value = reinterpret_cast<LoadT*>(data)[offset];
+
+#pragma unroll
+    for (int j = 0; j < ILP; ++j) {
+      threadVal = r(threadVal, v[j]);
+    }
  }

+  offset = size - last + threadIdx.x;
  // Epilogue
  for (; offset < size; offset += blockDim.x)
    threadVal = r(threadVal, data[offset]);

@@ -319,7 +331,8 @@ ilpReduce(T* data,
template <template<typename, typename> class Reduction1, template<typename, typename> class Reduction2, int ILP, typename T, typename AccumT>
__device__ __forceinline__ void
-ilpReduce(T* data,
+ilpReduce(int shift,
+          T* data,
          int size,
          AccumT* reducVal1,
          const Reduction1<T, AccumT>& r1,

@@ -328,27 +341,38 @@ ilpReduce(T* data,
          const Reduction2<T, AccumT>& r2,
          AccumT defaultVal2)
{
+  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LoadT;
  AccumT threadVal1 = defaultVal1;
  AccumT threadVal2 = defaultVal2;
  int offset = threadIdx.x;

+  // shift and do 1
+  if(shift > 0){
+    data -= shift;
+    size += shift;
+    if(threadIdx.x >= shift){
+      threadVal1 = r1(threadVal1, data[offset]);
+      threadVal2 = r2(threadVal2, data[offset]);
+    }
+    size -= blockDim.x;
+    data += blockDim.x;
+  }
  int last = size % (ILP * blockDim.x);

+  T v[ILP];
+  LoadT* value = reinterpret_cast<LoadT*>(&v);
+
  // Body (unroll by ILP times)
-  for (; offset < size - last; offset += blockDim.x * ILP) {
-    T tmp[ILP];
-
-#pragma unroll
-    for (int j = 0; j < ILP; ++j)
-      tmp[j] = data[offset + j * blockDim.x];
-
-#pragma unroll
-    for (int j = 0; j < ILP; ++j) {
-      threadVal1 = r1(threadVal1, tmp[j]);
-      threadVal2 = r2(threadVal2, tmp[j]);
+  for (; offset * ILP < (size - last); offset += blockDim.x) {
+    *value = reinterpret_cast<LoadT*>(data)[offset];
+
+#pragma unroll
+    for (int j = 0; j < ILP; ++j) {
+      threadVal1 = r1(threadVal1, v[j]);
+      threadVal2 = r2(threadVal2, v[j]);
    }
  }

+  offset = size - last + threadIdx.x;
  // Epilogue
  for (; offset < size; offset += blockDim.x) {
    threadVal1 = r1(threadVal1, data[offset]);

@@ -375,17 +399,19 @@ cunn_SoftMaxXEntropyForward(
  // each block handles a sample in the mini-batch
  input += blockIdx.x * classes;
  //output += blockIdx.x * classes;
+  const int shift = ((uint64_t)input) % ALIGN_BYTES / sizeof(scalar_t);
  int64_t label = labels[blockIdx.x];

  // find the max and sum
  accscalar_t threadMax, threadSum, max_k, sum_k;
  ilpReduce<MaxFloat, AddFloat, ILP, scalar_t, accscalar_t>(
-      input, classes,
-      &threadMax, MaxFloat<scalar_t, accscalar_t>(),
-      -at::numeric_limits<accscalar_t>::max(),
-      &threadSum, AddFloat<scalar_t, accscalar_t>(),
-      static_cast<accscalar_t>(0));
+      shift, input, classes,
+      &threadMax, MaxFloat<scalar_t, accscalar_t>(),
+      -at::numeric_limits<accscalar_t>::max(),
+      &threadSum, AddFloat<scalar_t, accscalar_t>(),
+      static_cast<accscalar_t>(0));

  blockReduce<Max, Add, accscalar_t>(
      sdata, &max_k, threadMax, Max<accscalar_t>(),

@@ -393,9 +419,7 @@ cunn_SoftMaxXEntropyForward(
      &sum_k, threadSum, Add<accscalar_t>(),
      static_cast<accscalar_t>(0));

-  // reduce all values
-  accscalar_t threadExp = ilpReduce<SumExpFloat, ILP, scalar_t, accscalar_t>(
-      input, classes, SumExpFloat<scalar_t, accscalar_t>(max_k),
-      static_cast<accscalar_t>(0));
+  accscalar_t threadExp = ilpReduce<SumExpFloat, ILP, scalar_t, accscalar_t>(
+      shift, input, classes, SumExpFloat<scalar_t, accscalar_t>(max_k),
+      static_cast<accscalar_t>(0));
  accscalar_t sumAll = blockReduce<Add, accscalar_t>(
      sdata, threadExp, Add<accscalar_t>(),
      static_cast<accscalar_t>(0));

@@ -411,20 +435,16 @@ cunn_SoftMaxXEntropyForward(
  }
}

-template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t, template<typename, typename, typename> class Epilogue>
-__global__ void
-cunn_SoftMaxXEntropyBackward(
+template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t>
+__device__ __forceinline__ void
+apply(scalar_t *gradInput,
      scalar_t *logits,
      outscalar_t *max_log_sum_exp,
      outscalar_t *gradOutput,
      int64_t *labels,
      const float smoothing,
      int classes)
{
-  gradInput += blockIdx.x * classes;
-  logits += blockIdx.x * classes;
  accscalar_t smooth_positives = 1.0 - smoothing;
  accscalar_t smooth_negatives = smoothing / classes;
  accscalar_t tmpGradOutput = gradOutput[blockIdx.x];

@@ -433,6 +453,7 @@ cunn_SoftMaxXEntropyBackward(
  int offset = threadIdx.x;
  int last = classes % (ILP * blockDim.x);
+
  for (; offset < classes - last; offset += blockDim.x * ILP) {
    accscalar_t tmpLogits[ILP];

@@ -444,22 +465,112 @@ cunn_SoftMaxXEntropyBackward(
#pragma unroll
    for (int j = 0; j < ILP; ++j)
      gradInput[offset + j * blockDim.x] = tmpGradOutput * (
          std::exp(tmpLogits[j] - coeff) - static_cast<accscalar_t>(
          (offset + j * blockDim.x == label) ? 1 : 0) *
          smooth_positives - smooth_negatives);
  }

  for (; offset < classes; offset += blockDim.x)
    gradInput[offset] = tmpGradOutput * (std::exp(
        static_cast<accscalar_t>(logits[offset]) - coeff) -
        static_cast<accscalar_t>((offset == label) ? 1 : 0) *
        smooth_positives - smooth_negatives);
}

+template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t>
+__device__ __forceinline__ void
+aligned_apply(int shift,
+              scalar_t *gradInput,
+              scalar_t *logits,
+              outscalar_t *max_log_sum_exp,
+              outscalar_t *gradOutput,
+              int64_t *labels,
+              const float smoothing,
+              int classes)
+{
+  accscalar_t smooth_positives = 1.0 - smoothing;
+  accscalar_t smooth_negatives = smoothing / classes;
+  accscalar_t tmpGradOutput = gradOutput[blockIdx.x];
+  int64_t label = labels[blockIdx.x];
+  accscalar_t coeff = max_log_sum_exp[blockIdx.x];
+
+  int offset = threadIdx.x;
+
+  // shift and do 1
+  if(shift > 0){
+    logits -= shift;
+    gradInput -= shift;
+    classes += shift;
+    if(threadIdx.x >= shift){
+      gradInput[offset] = tmpGradOutput * (std::exp(
+          static_cast<accscalar_t>(logits[offset]) - coeff) -
+          static_cast<accscalar_t>(((offset - shift) == label) ? 1 : 0) *
+          smooth_positives - smooth_negatives);
+    }
+    classes -= blockDim.x;
+    gradInput += blockDim.x;
+    logits += blockDim.x;
+    shift -= blockDim.x;
+  }
+
+  int last = classes % (ILP * blockDim.x);
+
+  typedef typename std::aligned_storage<ILP*sizeof(scalar_t), ILP*alignof(scalar_t)>::type LoadT;
+  // input
+  scalar_t v[ILP];
+  LoadT* value = reinterpret_cast<LoadT*>(&v);
+  // output
+  scalar_t r[ILP];
+  LoadT* result = reinterpret_cast<LoadT*>(&r);
+
+  for (; offset * ILP < (classes - last); offset += blockDim.x) {
+    *value = reinterpret_cast<LoadT*>(logits)[offset];
+
+#pragma unroll
+    for (int j = 0; j < ILP; ++j) {
+      r[j] = tmpGradOutput * (std::exp(
+          static_cast<accscalar_t>(v[j]) - coeff) -
+          static_cast<accscalar_t>(((ILP * offset + j - shift) == label) ? 1 : 0) *
+          smooth_positives - smooth_negatives);
+    }
+    reinterpret_cast<LoadT*>(gradInput)[offset] = *result;
+  }
+
+  offset = classes - last + threadIdx.x;
+  for (; offset < classes; offset += blockDim.x)
+    gradInput[offset] = tmpGradOutput * (std::exp(
+        static_cast<accscalar_t>(logits[offset]) - coeff) -
+        static_cast<accscalar_t>(((offset - shift) == label) ? 1 : 0) *
+        smooth_positives - smooth_negatives);
+}
+
+template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t, template<typename, typename, typename> class Epilogue>
+__global__ void
+cunn_SoftMaxXEntropyBackward(
+    scalar_t *gradInput,
+    scalar_t *logits,
+    outscalar_t *max_log_sum_exp,
+    outscalar_t *gradOutput,
+    int64_t *labels,
+    const float smoothing,
+    int classes)
+{
+  gradInput += blockIdx.x * classes;
+  logits += blockIdx.x * classes;
+
+  // Do vectorized load/store when input/output have same alignment
+  const int shift = ((uint64_t)logits) % ALIGN_BYTES / sizeof(scalar_t);
+  const int shift_ = ((uint64_t)gradInput) % ALIGN_BYTES / sizeof(scalar_t);
+  if (shift == shift_){
+    aligned_apply<ILP, scalar_t, accscalar_t, outscalar_t>(shift, gradInput, logits, max_log_sum_exp, gradOutput, labels, smoothing, classes);
+  }
+  else {
+    apply<ILP, scalar_t, accscalar_t, outscalar_t>(gradInput, logits, max_log_sum_exp, gradOutput, labels, smoothing, classes);
+  }
+}

template<template<typename, typename, typename> class Epilogue>
std::vector<Tensor> host_softmax_xentropy(

@@ -495,13 +606,13 @@ std::vector<Tensor> host_softmax_xentropy(
  // XXX: it assumes that inner_size == 1
  TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");

-  const int ILP = 2;
  dim3 grid(outer_size);
-  dim3 block = SoftMax_getBlockSize(ILP, dim_size);

  using namespace at;
  DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "host_softmax_xentropy",
    using accscalar_t = at::acc_type<scalar_t_0, true>;
+    const int ILP = sizeof(float4)/sizeof(scalar_t_0);
+    dim3 block = SoftMax_getBlockSize(ILP, dim_size);
    if (!half_to_float) {
      cunn_SoftMaxXEntropyForward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
        <<<grid, block, 2 * block.x * sizeof(accscalar_t), stream>>>(

@@ -564,12 +675,12 @@ Tensor host_softmax_xentropy_backward(
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");

-  const int ILP = 2;
  dim3 grid(outer_size);
-  dim3 block = SoftMax_getBlockSize(ILP, dim_size);

  DISPATCH_FLOAT_AND_HALF(gI.scalar_type(), 0, "host_softmax_xentropy_backward",
    using accscalar_t = acc_type<scalar_t_0, true>;
+    const int ILP = sizeof(float4)/sizeof(scalar_t_0);
+    dim3 block = SoftMax_getBlockSize(ILP, dim_size);
    if (!half_to_float) {
      cunn_SoftMaxXEntropyBackward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
       <<<grid, block, block.x * sizeof(accscalar_t), stream>>>(
apex/contrib/multihead_attn/self_multihead_attn_func.py

@@ -183,7 +183,7 @@ class SelfAttnFunc(torch.autograd.Function):
         values_grads = torch.bmm(dropout_results.transpose(1, 2), output_lin_grads, out=values_grads.transpose(0, 1))

         # Mask and Scaling for Dropout (not a publically documented op)
-        dropout_grads = torch._masked_scale(matmul2_dgrad1, dropout_mask, dropout_prob_t[0])
+        dropout_grads = torch._masked_scale(matmul2_dgrad1, dropout_mask, 1.0 / (1.0 - dropout_prob_t[0]))

         # Softmax Grad (not a publically documented op)
         softmax_grads = torch._softmax_backward_data(dropout_grads, softmax_results, -1, softmax_results)
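The one-line change above swaps the third argument of torch._masked_scale from the dropout probability to 1.0 / (1.0 - p). A hedged sketch (not part of the commit), written with public ops and assuming torch._masked_scale multiplies the gradient by the mask and then by the given scale:

# Sketch only: values are illustrative; torch._masked_scale is a private op and is emulated here.
import torch

p = 0.1                                    # dropout probability
grad = torch.ones(4)
mask = torch.tensor([1.0, 0.0, 1.0, 1.0])  # elements kept by dropout

# Inverted dropout scales the kept activations by 1/(1-p) in the forward pass, so the
# backward pass must scale the surviving gradients by 1/(1-p) as well, not by p.
dropout_grads = grad * mask * (1.0 / (1.0 - p))
print(dropout_grads)  # tensor([1.1111, 0.0000, 1.1111, 1.1111])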
apex/mlp/mlp.py

@@ -7,17 +7,19 @@ from .. import amp
 class MlpFunction(torch.autograd.Function):
     @staticmethod
-    def forward(ctx, *args):
-        output = mlp_cuda.forward(args)
+    def forward(ctx, bias, activation, *args):
+        output = mlp_cuda.forward(bias, activation, args)
         ctx.save_for_backward(*args)
         ctx.outputs = output
+        ctx.bias = bias
+        ctx.activation = activation
         return output[0]

     @staticmethod
     def backward(ctx, grad_o):
-        grads = mlp_cuda.backward(grad_o, ctx.outputs, ctx.saved_tensors)
+        grads = mlp_cuda.backward(ctx.bias, ctx.activation, grad_o, ctx.outputs, ctx.saved_tensors)
         del ctx.outputs
-        return tuple(grads)
+        return (None, None, *grads)

 mlp_function = amp.half_function(MlpFunction.apply)

@@ -29,16 +31,21 @@ class MLP(torch.nn.Module):
         bias (bool): Default True:
         relu (bool): Default True
     """
-    def __init__(self, mlp_sizes, bias=True, relu=True):
-        if not (bias and relu):
-            raise TypeError("bias and relu must be both true.")
-
+    def __init__(self, mlp_sizes, bias=True, activation='relu'):
         super(MLP, self).__init__()
         self.num_layers = len(mlp_sizes) - 1
         self.mlp_sizes = copy(mlp_sizes)
-        self.bias = bias
-        self.relu = relu
+        self.bias = 1 if bias else 0
+
+        if activation is 'none':
+            self.activation = 0
+        elif activation is 'relu':
+            self.activation = 1
+        elif activation is 'sigmoid':
+            self.activation = 2
+        else:
+            raise TypeError("activation must be relu or none.")

-        # ignoring bias = False now
         self.weights = []
         self.biases = []
         for i in range(self.num_layers):

@@ -46,10 +53,11 @@ class MLP(torch.nn.Module):
             self.weights.append(w)
             name = 'weight_{}'.format(i)
             setattr(self, name, w)
-            b = torch.nn.Parameter(torch.empty(mlp_sizes[i+1]))
-            self.biases.append(b)
-            name = 'bias_{}'.format(i)
-            setattr(self, name, b)
+            if self.bias:
+                b = torch.nn.Parameter(torch.empty(mlp_sizes[i+1]))
+                self.biases.append(b)
+                name = 'bias_{}'.format(i)
+                setattr(self, name, b)

         self.reset_parameters()

@@ -58,13 +66,14 @@ class MLP(torch.nn.Module):
             dimsum = weight.size(0) + weight.size(1)
             std = math.sqrt(2. / float(dimsum))
             nn.init.normal_(weight, 0., std)
-        for bias in self.biases:
-            std = math.sqrt(1. / float(bias.size(0)))
-            nn.init.normal_(bias, 0., std)
+        if self.bias:
+            for bias in self.biases:
+                std = math.sqrt(1. / float(bias.size(0)))
+                nn.init.normal_(bias, 0., std)

     def forward(self, input):
-        return mlp_function(input, *self.weights, *self.biases)
+        return mlp_function(self.bias, self.activation, input, *self.weights, *self.biases)

     def extra_repr(self):
-        s = F"MLP sizes: {self.mlp_sizes}, Bias={self.bias}, ReLU={self.relu}"
+        s = F"MLP sizes: {self.mlp_sizes}, Bias={self.bias}, activation={self.activation}"
         return s
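A hedged usage sketch (not part of the commit) of the updated apex.mlp.MLP API introduced above, assuming apex is built with the updated mlp_cuda extension and a CUDA device is available; the layer sizes and batch size are illustrative:

# Sketch only: MLP now takes an `activation` string ('none', 'relu', 'sigmoid') instead of
# the old `relu` flag, and bias=False is supported.
import torch
from apex.mlp import MLP

mlp = MLP([480, 1024, 512, 1], bias=True, activation='relu').cuda().half()
x = torch.randn(128, 480, device='cuda', dtype=torch.half, requires_grad=True)
y = mlp(x)           # calls mlp_function(self.bias, self.activation, input, *weights, *biases)
y.sum().backward()   # backward returns (None, None, *grads) for the two non-tensor arguments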
csrc/layer_norm_cuda_kernel.cu

@@ -172,8 +172,8 @@ void cuWelfordMuSigma2(
      for(;  l+7 < n2;  l+=8*numx) {
        for(int k = 0;  k < 8;  k+=2) {
          float2 curr = __half22float2(*((__half2*)(lvals+l+k)));
-          cuWelfordOnlineSum(curr.x,mu,sigma2,count);
-          cuWelfordOnlineSum(curr.y,mu,sigma2,count);
+          cuWelfordOnlineSum<float>(curr.x,mu,sigma2,count);
+          cuWelfordOnlineSum<float>(curr.y,mu,sigma2,count);
        }
      }
      for(;  l < n2;  ++l) {

@@ -230,9 +230,15 @@ void cuWelfordMuSigma2(
template<typename U> U rsqrt(U v) {
  return U(1) / sqrt(v);
}
+#if defined __HIP_PLATFORM_HCC__
+__device__ float rsqrt(float v) {
+  return rsqrtf(v);
+}
+#else
template<> float rsqrt(float v) {
  return rsqrtf(v);
}
+#endif
template<> double rsqrt(double v) {
  return rsqrt(v);
}

@@ -293,7 +299,7 @@ void cuApplyLayerNorm(
  // 1) blockDim.x == warpSize
  // 2) Tensors are contiguous
  //
-  for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
+  for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
    SharedMemory<U> shared;
    U* buf = shared.getPointer();
    U mu,sigma2;

@@ -531,7 +537,7 @@ void cuComputeGradInput(
    const T* gamma,
    T* grad_input)
{
-  for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
+  for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
    U sum_loss1 = U(0);
    U sum_loss2 = U(0);
    const U c_mean = mean[i1];
csrc/mlp.cpp

@@ -19,7 +19,9 @@ int mlp_fp(
    int* output_features,
    T** BPtr,
    T* Y,
-    T* reserved_space);
+    T* reserved_space,
+    int use_bias,
+    int activation);

template <typename T>
int mlp_bp(

@@ -35,11 +37,18 @@ int mlp_bp(
    T* work_space,
    T* dX,
    T** dwPtr,
-    T** dbPtr);
+    T** dbPtr,
+    bool requires_grad,
+    int use_bias,
+    int activation);

-std::vector<at::Tensor> mlp_forward(std::vector<at::Tensor> inputs) {
-  // inputs contains (input, weights, biases)
-  auto num_layers = (inputs.size() - 1) / 2;
+std::vector<at::Tensor> mlp_forward(int use_bias, int activation, std::vector<at::Tensor> inputs) {
+  auto num_layers = inputs.size() - 1;
+  if (use_bias) {
+    // inputs contains (input, weights, biases)
+    num_layers /= 2;
+  }
  auto batch_size = inputs[0].size(0);
  auto input_features = inputs[0].size(1);

@@ -60,7 +69,9 @@ std::vector<at::Tensor> mlp_forward(std::vector<at::Tensor> inputs) {
    std::vector<scalar_t*> b_ptr;
    for (int i = 0; i < num_layers; i++) {
      w_ptr.push_back(inputs[i + 1].data_ptr<scalar_t>());
-      b_ptr.push_back(inputs[i + 1 + num_layers].data_ptr<scalar_t>());
+      if (use_bias) {
+        b_ptr.push_back(inputs[i + 1 + num_layers].data_ptr<scalar_t>());
+      }
    }
    auto result = mlp_fp<scalar_t>(
        inputs[0].data_ptr<scalar_t>(),

@@ -71,37 +82,48 @@ std::vector<at::Tensor> mlp_forward(std::vector<at::Tensor> inputs) {
        output_features.data(),
        b_ptr.data(),
        out.data_ptr<scalar_t>(),
-        reserved_space.data_ptr<scalar_t>());
+        reserved_space.data_ptr<scalar_t>(),
+        use_bias,
+        activation);
  });

  return {out, reserved_space};
}

std::vector<at::Tensor> mlp_backward(
+    int use_bias,
+    int activation,
    at::Tensor grad_o,
    std::vector<at::Tensor> fprop_outputs,
    std::vector<at::Tensor> inputs) {
-  // same code to get sizes and W pointers
-  auto num_layers = (inputs.size() - 1) / 2;
+  auto num_layers = inputs.size() - 1;
+  if (use_bias) {
+    // inputs contains (input, weights, biases)
+    num_layers /= 2;
+  }
+
  auto batch_size = inputs[0].size(0);
  auto input_features = inputs[0].size(1);

+  // TODO: not creating empty tensor for it?
+  bool requires_grad = inputs[0].requires_grad();
+
  std::vector<int> output_features;
  for (int i = 0; i < num_layers; i++) {
    output_features.push_back(inputs[i + 1].size(0));
  }
  // create outputs, length of inputs
+  // TODO: not create bias if not needed
  std::vector<at::Tensor> outputs;
  for (int i = 0; i < inputs.size(); i++) {
    outputs.push_back(at::empty(inputs[i].sizes(), inputs[i].type()));  // clone for testing now
  }

-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(inputs[0].type(), "mlp_forward", [&] {
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(inputs[0].type(), "mlp_backward", [&] {
    std::vector<scalar_t*> w_ptr;
    std::vector<scalar_t*> b_ptr;
    for (int i = 0; i < num_layers; i++) {
      w_ptr.push_back(inputs[i + 1].data_ptr<scalar_t>());
      b_ptr.push_back(inputs[i + 1 + num_layers].data_ptr<scalar_t>());
    }
    std::vector<scalar_t*> outputs_ptr;
    for (int i = 0; i < inputs.size(); i++) {

@@ -127,7 +149,10 @@ std::vector<at::Tensor> mlp_backward(
          work_space.data_ptr<scalar_t>(),
          outputs_ptr[0],
          outputs_ptr.data() + 1,
-          outputs_ptr.data() + 1 + num_layers);
+          outputs_ptr.data() + 1 + num_layers,
+          requires_grad,
+          use_bias,
+          activation);
  });
  return outputs;
csrc/mlp_cuda.cu
View file @
6e14df49
...
...
@@ -10,8 +10,11 @@
#include <cublas_v2.h>
#include <cuda_runtime.h>
#define BIASADDRELU_FPROP_NUM_THREADS 128
#define BIASADDRELU_BPROP_NUM_THREADS 128
// constants for fused bias+relu kernel
#define BIAS_RELU_FW_NTHREADS 128 // forward number of thread per block
#define BIAS_RELU_BW_NTHREADS_X 32 // backward number of thread in feature dim
#define BIAS_RELU_BW_NTHREADS_Y 16 // backward number of thread in batch dim
#define BIAS_RELU_RED_PER_THREAD 16 // backward minimal reduction length per thread
// move to a header later on
#define ILP 4
...
...
@@ -42,6 +45,12 @@ __device__ __inline__ float relu(float a) {
return
(
retf
);
}
// Keep Sigmoid in float only. When using half, cast to float before calling.
__device__
__inline__
float
sigmoid
(
float
a
)
{
float
retf
=
1.
f
/
(
1.
f
+
expf
(
-
a
));
return
(
retf
);
}
// FP64 Wrapper around cublas GEMMEx
cublasStatus_t
mlp_gemm
(
cublasHandle_t
handle
,
...
...
@@ -156,9 +165,55 @@ cublasStatus_t mlp_gemm(
CUBLAS_GEMM_DEFAULT_TENSOR_OP
);
}
// Bias ADD
+ ReLU
. Assume input X is [features x batch size],
assume
column major.
// Bias ADD. Assume input X is [features x batch size], column major.
// Bias is one 'features' long vector, with implicit broadcast.
// Currently, activation support fuesed ReLU. Safe to call in-place.
template
<
typename
T
>
__global__
void
biasAdd_fprop
(
T
*
X
,
T
*
b
,
uint
batch_size
,
uint
features
)
{
T
r_x
[
ILP
];
T
r_b
[
ILP
];
if
(
is_aligned
(
X
)
&&
is_aligned
(
b
)
&&
features
%
ILP
==
0
)
{
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
for
(;
tid
*
ILP
<
features
*
batch_size
;
tid
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
row
=
tid
%
(
features
/
ILP
);
load_store
(
r_x
,
X
,
0
,
tid
);
load_store
(
r_b
,
b
,
0
,
row
);
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
float
bias_sum
=
static_cast
<
float
>
(
r_x
[
ii
])
+
static_cast
<
float
>
(
r_b
[
ii
]);
r_x
[
ii
]
=
bias_sum
;
}
load_store
(
X
,
r_x
,
tid
,
0
);
}
}
else
{
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
for
(;
tid
<
features
*
batch_size
;
tid
+=
ILP
*
blockDim
.
x
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
int
idx
=
tid
+
ii
*
blockDim
.
x
*
gridDim
.
x
;
if
(
idx
<
features
*
batch_size
)
{
int
row
=
tid
%
features
;
r_x
[
ii
]
=
X
[
idx
];
r_b
[
ii
]
=
b
[
row
];
}
}
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
float
bias_sum
=
static_cast
<
float
>
(
r_x
[
ii
])
+
static_cast
<
float
>
(
r_b
[
ii
]);
r_x
[
ii
]
=
bias_sum
;
}
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
int
idx
=
tid
+
ii
*
blockDim
.
x
*
gridDim
.
x
;
if
(
idx
<
features
*
batch_size
)
{
X
[
idx
]
=
r_x
[
ii
];
}
}
}
}
}
// Bias ADD + ReLU. Assume input X is [features x batch size], column major.
// Activation support fuesed ReLU. Safe to call in-place.
template
<
typename
T
>
__global__
void
biasAddRelu_fprop
(
T
*
X
,
T
*
b
,
uint
batch_size
,
uint
features
)
{
T
r_x
[
ILP
];
...
...
@@ -204,32 +259,308 @@ __global__ void biasAddRelu_fprop(T *X, T *b, uint batch_size, uint features) {
}
}
// ReLU. Assume input X is [features x batch size], column major.
// Safe to call in-place.
template
<
typename
T
>
__global__
void
Relu_fprop
(
T
*
X
,
uint
batch_size
,
uint
features
)
{
T
r_x
[
ILP
];
if
(
is_aligned
(
X
)
&&
features
%
ILP
==
0
)
{
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
for
(;
tid
*
ILP
<
features
*
batch_size
;
tid
+=
blockDim
.
x
*
gridDim
.
x
)
{
load_store
(
r_x
,
X
,
0
,
tid
);
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
r_x
[
ii
]
=
relu
(
static_cast
<
float
>
(
r_x
[
ii
]));
}
load_store
(
X
,
r_x
,
tid
,
0
);
}
}
else
{
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
for
(;
tid
<
features
*
batch_size
;
tid
+=
ILP
*
blockDim
.
x
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
int
idx
=
tid
+
ii
*
blockDim
.
x
*
gridDim
.
x
;
if
(
idx
<
features
*
batch_size
)
{
r_x
[
ii
]
=
X
[
idx
];
}
}
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
r_x
[
ii
]
=
relu
(
static_cast
<
float
>
(
r_x
[
ii
]));
}
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
int
idx
=
tid
+
ii
*
blockDim
.
x
*
gridDim
.
x
;
if
(
idx
<
features
*
batch_size
)
{
X
[
idx
]
=
r_x
[
ii
];
}
}
}
}
}
// Sigmoid. Assume input X is [features x batch size], column major.
// Safe to call in-place.
template
<
typename
T
>
__global__
void
Sigmoid_fprop
(
T
*
X
,
uint
batch_size
,
uint
features
)
{
T
r_x
[
ILP
];
if
(
is_aligned
(
X
)
&&
features
%
ILP
==
0
)
{
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
for
(;
tid
*
ILP
<
features
*
batch_size
;
tid
+=
blockDim
.
x
*
gridDim
.
x
)
{
load_store
(
r_x
,
X
,
0
,
tid
);
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
r_x
[
ii
]
=
sigmoid
(
static_cast
<
float
>
(
r_x
[
ii
]));
}
load_store
(
X
,
r_x
,
tid
,
0
);
}
}
else
{
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
for
(;
tid
<
features
*
batch_size
;
tid
+=
ILP
*
blockDim
.
x
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
int
idx
=
tid
+
ii
*
blockDim
.
x
*
gridDim
.
x
;
if
(
idx
<
features
*
batch_size
)
{
r_x
[
ii
]
=
X
[
idx
];
}
}
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
r_x
[
ii
]
=
sigmoid
(
static_cast
<
float
>
(
r_x
[
ii
]));
}
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
int
idx
=
tid
+
ii
*
blockDim
.
x
*
gridDim
.
x
;
if
(
idx
<
features
*
batch_size
)
{
X
[
idx
]
=
r_x
[
ii
];
}
}
}
}
}
// ReLU. Assume input X is [features x batch size], column major.
// Safe to call in-place.
template
<
typename
T
>
__global__
void
Relu_bprop
(
T
*
dY
,
T
*
Y
,
uint
batch_size
,
uint
features
,
T
*
dX
)
{
T
r_dy
[
ILP
];
T
r_y
[
ILP
];
if
(
is_aligned
(
dY
)
&&
is_aligned
(
Y
)
&&
is_aligned
(
dX
)
&&
features
%
ILP
==
0
)
{
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
for
(;
tid
*
ILP
<
features
*
batch_size
;
tid
+=
blockDim
.
x
*
gridDim
.
x
)
{
load_store
(
r_dy
,
dY
,
0
,
tid
);
load_store
(
r_y
,
Y
,
0
,
tid
);
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
){
if
((
float
)
r_y
[
ii
]
<=
0.
f
)
r_dy
[
ii
]
=
0
;
}
load_store
(
dX
,
r_dy
,
tid
,
0
);
}
}
else
{
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
for
(;
tid
<
features
*
batch_size
;
tid
+=
ILP
*
blockDim
.
x
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
int
idx
=
tid
+
ii
*
blockDim
.
x
*
gridDim
.
x
;
if
(
idx
<
features
*
batch_size
)
{
r_dy
[
ii
]
=
dY
[
idx
];
r_y
[
ii
]
=
Y
[
idx
];
}
}
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
if
((
float
)
r_y
[
ii
]
<=
0.
f
)
r_dy
[
ii
]
=
0
;
}
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
int
idx
=
tid
+
ii
*
blockDim
.
x
*
gridDim
.
x
;
if
(
idx
<
features
*
batch_size
)
{
dX
[
idx
]
=
r_dy
[
ii
];
}
}
}
}
}
// Sigmoid. Assume input X is [features x batch size], column major.
// Safe to call in-place.
template
<
typename
T
>
__global__
void
Sigmoid_bprop
(
T
*
dY
,
T
*
Y
,
uint
batch_size
,
uint
features
,
T
*
dX
)
{
T
r_dy
[
ILP
];
T
r_y
[
ILP
];
if
(
is_aligned
(
dY
)
&&
is_aligned
(
Y
)
&&
is_aligned
(
dX
)
&&
features
%
ILP
==
0
)
{
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
for
(;
tid
*
ILP
<
features
*
batch_size
;
tid
+=
blockDim
.
x
*
gridDim
.
x
)
{
load_store
(
r_dy
,
dY
,
0
,
tid
);
load_store
(
r_y
,
Y
,
0
,
tid
);
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
){
float
grad_out
=
r_dy
[
ii
];
float
out
=
r_y
[
ii
];
float
grad_i
=
out
*
(
1.
f
-
out
)
*
grad_out
;
r_dy
[
ii
]
=
grad_i
;
}
load_store
(
dX
,
r_dy
,
tid
,
0
);
}
}
else
{
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
for
(;
tid
<
features
*
batch_size
;
tid
+=
ILP
*
blockDim
.
x
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
int
idx
=
tid
+
ii
*
blockDim
.
x
*
gridDim
.
x
;
if
(
idx
<
features
*
batch_size
)
{
r_dy
[
ii
]
=
dY
[
idx
];
r_y
[
ii
]
=
Y
[
idx
];
}
}
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
float
grad_out
=
r_dy
[
ii
];
float
out
=
r_y
[
ii
];
float
grad_i
=
out
*
(
1.
f
-
out
)
*
grad_out
;
r_dy
[
ii
]
=
grad_i
;
}
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
int
idx
=
tid
+
ii
*
blockDim
.
x
*
gridDim
.
x
;
if
(
idx
<
features
*
batch_size
)
{
dX
[
idx
]
=
r_dy
[
ii
];
}
}
}
}
}
// Compute grid size for pointwise backward kernel.
//
Some intelligence needed to determine number of splits for reduction.
//
block_x/y is total elment being handled per block, not number of threads
void
get_biasAddRelu_bprop_grid_size
(
int
yfeat
,
int
threadsPerBlock
,
int
batch_size
,
int
block_x
,
int
block_y
,
int
*
grid_x
,
int
*
grid_y
)
{
*
grid_x
=
(
yfeat
+
block_x
-
1
)
/
block_x
;
// Get number of SMs for efficient reduction.
int
num_SMs
=
at
::
cuda
::
getCurrentDeviceProperties
()
->
multiProcessorCount
;
// First preference, whole reduction in 1 CTA
int
nBlocks
=
(
yfeat
+
threadsPerBlock
-
1
)
/
threadsPerBlock
;
// Figure out how many splits to divide reduction into. At least 32 elements per CTA.
// we want grid_y as close to sqrt(batchsize)?
int
nRedSplits
=
std
::
sqrt
(
batch_size
);
// for batchsize <=64, just use 1 block
if
(
batch_size
<
64
)
nRedSplits
=
1
;
// no need to go over occupancy
nRedSplits
=
min
((
8
*
num_SMs
)
/
nBlocks
,
nRedSplits
);
*
grid_x
=
nBlocks
;
*
grid_y
=
nRedSplits
;
// can switch to occupancy calculation. use 4 below now for sm_70
int
max_blocks_y
=
num_SMs
*
4
/
(
*
grid_x
);
// block_y should be from minimal work per thread
int
nRedSplits
=
(
batch_size
+
block_y
-
1
)
/
block_y
;
// increase number of elem per thread redcution to not launch more than enough
// kernel adjust work, so here we just launch max block
*
grid_y
=
std
::
min
(
nRedSplits
,
max_blocks_y
);
return
;
}
// Addition done deterministically via a 2-pass approach. Each CTA writes out partial
// sum, and the last CTA in grid Y dimension accumulates partials serially and writes to result.
template
<
typename
T
,
int
UNROLL_FACTOR
>
__global__
void
biasAdd_bprop
(
T
*
dY
,
int
features
,
int
batch_size
,
volatile
float
*
intermediate
,
int
*
semaphores
,
T
*
db
)
{
// The feature that this thread is responsible for
int
f
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
// Compute the span this thread is responsible for
// For this block
int
b_chunkSize
=
(
batch_size
+
gridDim
.
y
-
1
)
/
gridDim
.
y
;
int
b_nStart
=
blockIdx
.
y
*
b_chunkSize
;
int
b_nSpan
=
min
(
batch_size
,
b_nStart
+
b_chunkSize
)
-
b_nStart
;
// For this thread
int
chunkSize
=
(
b_chunkSize
+
blockDim
.
y
-
1
)
/
blockDim
.
y
;
int
nStart
=
threadIdx
.
y
*
chunkSize
+
b_nStart
;
int
nSpan
=
min
(
b_nStart
+
b_nSpan
,
nStart
+
chunkSize
)
-
nStart
;
volatile
float
*
out
=
intermediate
+
blockIdx
.
y
*
features
;
// Flag to trigger last reduction.
__shared__
bool
isLastBlock
;
// we know block size for now
__shared__
float
smem
[
BIAS_RELU_BW_NTHREADS_X
*
BIAS_RELU_BW_NTHREADS_Y
];
// Accumulate db in FP32 always
float
db_local
=
0
;
if
(
f
<
features
)
{
int
nidx
=
0
;
// Handle non-multiple of UNROLL_FACTOR residue
for
(;
nidx
<
nSpan
%
UNROLL_FACTOR
;
nidx
++
)
{
int
row
,
col
,
flat_idx
;
row
=
f
;
col
=
nStart
+
nidx
;
flat_idx
=
col
*
features
+
row
;
db_local
+=
(
float
)
dY
[
flat_idx
];
}
// Handle meat of work
for
(;
(
nidx
+
UNROLL_FACTOR
-
1
)
<
nSpan
;
nidx
+=
UNROLL_FACTOR
)
{
int
row
,
col
,
flat_idx
;
row
=
f
;
col
=
nStart
+
nidx
;
flat_idx
=
col
*
features
+
row
;
#pragma unroll 4
for
(
int
u
=
0
;
u
<
UNROLL_FACTOR
;
u
++
)
{
db_local
+=
(
float
)
dY
[
flat_idx
];
flat_idx
+=
features
;
}
}
// naive block reduction on y-dim
int
linear_idx
=
threadIdx
.
y
*
blockDim
.
x
+
threadIdx
.
x
;
smem
[
linear_idx
]
=
db_local
;
}
__syncthreads
();
if
(
f
<
features
)
{
if
(
threadIdx
.
y
==
0
)
{
for
(
int
yidx
=
1
;
yidx
<
blockDim
.
y
;
yidx
++
){
db_local
+=
smem
[
yidx
*
blockDim
.
x
+
threadIdx
.
x
];
}
// block result is in db_local now for all threadIdx.y == 0
// Write out partial result
out
[
f
]
=
db_local
;
}
}
__threadfence
();
__syncthreads
();
// Increment semaphore and check if this is the last CTA in the grid_y dimension.
// Only thread (0,0) calls this
if
(
threadIdx
.
x
==
0
&&
threadIdx
.
y
==
0
&&
f
<
features
)
{
unsigned
int
sum_idx
;
sum_idx
=
atomicAdd
(
&
(
semaphores
[
blockIdx
.
x
]),
1
);
isLastBlock
=
(
sum_idx
==
(
gridDim
.
y
-
1
));
}
__syncthreads
();
db_local
=
0
;
// No block reduction for now, only thread (*,0) do grid reduction
if
(
isLastBlock
&&
f
<
features
)
{
if
(
threadIdx
.
y
==
0
)
{
for
(
int
n
=
0
;
n
<
gridDim
.
y
;
n
++
)
{
int
row
,
col
;
row
=
f
;
col
=
n
;
db_local
+=
(
float
)(
intermediate
[
col
*
features
+
row
]);
}
db
[
f
]
=
(
T
)
db_local
;
}
}
}
// Addition done deterministically via a 2-pass approach. Each CTA writes out partial
// sum, and the last CTA in grid Y dimension accumulates partials serially and writes to result.
template
<
typename
T
,
int
UNROLL_FACTOR
>
...
...
@@ -245,14 +576,22 @@ __global__ void biasAddRelu_bprop(
// The feature that this thread is responsible for
int
f
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
// Compute the batch span this thread is responsible for
int
chunkSize
=
(
batch_size
+
gridDim
.
y
-
1
)
/
gridDim
.
y
;
int
nStart
=
blockIdx
.
y
*
chunkSize
;
int
nSpan
=
min
(
batch_size
,
nStart
+
chunkSize
)
-
nStart
;
// Compute the span this thread is responsible for
// For this block
int
b_chunkSize
=
(
batch_size
+
gridDim
.
y
-
1
)
/
gridDim
.
y
;
int
b_nStart
=
blockIdx
.
y
*
b_chunkSize
;
int
b_nSpan
=
min
(
batch_size
,
b_nStart
+
b_chunkSize
)
-
b_nStart
;
// For this thread
int
chunkSize
=
(
b_chunkSize
+
blockDim
.
y
-
1
)
/
blockDim
.
y
;
int
nStart
=
threadIdx
.
y
*
chunkSize
+
b_nStart
;
int
nSpan
=
min
(
b_nStart
+
b_nSpan
,
nStart
+
chunkSize
)
-
nStart
;
volatile
float
*
out
=
intermediate
+
blockIdx
.
y
*
features
;
// Flag to trigger last reduction.
__shared__
bool
isLastBlock
;
// we know block size for now
__shared__
float
smem
[
BIAS_RELU_BW_NTHREADS_X
*
BIAS_RELU_BW_NTHREADS_Y
];
// Accumulate db in FP32 always
float
db_local
=
0
;
...
...
@@ -296,15 +635,28 @@ __global__ void biasAddRelu_bprop(
}
}
// Write out partial result
out
[
f
]
=
db_local
;
// naive block reduction on y-dim
int
linear_idx
=
threadIdx
.
y
*
blockDim
.
x
+
threadIdx
.
x
;
smem
[
linear_idx
]
=
db_local
;
}
__syncthreads
();
if
(
f
<
features
)
{
if
(
threadIdx
.
y
==
0
)
{
for
(
int
yidx
=
1
;
yidx
<
blockDim
.
y
;
yidx
++
){
db_local
+=
smem
[
yidx
*
blockDim
.
x
+
threadIdx
.
x
];
}
// block result is in db_local now for all threadIdx.y == 0
// Write out partial result
out
[
f
]
=
db_local
;
}
}
__threadfence
();
__syncthreads
();
// Increment semaphore and check if this is the last CTA in
//
the grid_y dimension.
if
(
threadIdx
.
x
==
0
&&
f
<
features
)
{
// Increment semaphore and check if this is the last CTA in
the grid_y dimension.
//
Only thread (0,0) calls this
if
(
threadIdx
.
x
==
0
&&
threadIdx
.
y
==
0
&&
f
<
features
)
{
unsigned
int
sum_idx
;
sum_idx
=
atomicAdd
(
&
(
semaphores
[
blockIdx
.
x
]),
1
);
isLastBlock
=
(
sum_idx
==
(
gridDim
.
y
-
1
));
...
...
@@ -312,14 +664,17 @@ __global__ void biasAddRelu_bprop(
__syncthreads
();
db_local
=
0
;
// No block reduction for now, only thread (*,0) do grid reduction
if
(
isLastBlock
&&
f
<
features
)
{
for
(
int
n
=
0
;
n
<
gridDim
.
y
;
n
++
)
{
int
row
,
col
;
row
=
f
;
col
=
n
;
db_local
+=
(
float
)(
intermediate
[
col
*
features
+
row
]);
if
(
threadIdx
.
y
==
0
)
{
for
(
int
n
=
0
;
n
<
gridDim
.
y
;
n
++
)
{
int
row
,
col
;
row
=
f
;
col
=
n
;
db_local
+=
(
float
)(
intermediate
[
col
*
features
+
row
]);
}
db
[
f
]
=
(
T
)
db_local
;
}
db
[
f
]
=
(
T
)
db_local
;
}
}
...
...
@@ -338,10 +693,16 @@ __global__ void biasAddRelu_bprop_aligned(
// The feature that this thread is responsible for
int
f
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
// Compute the batch span this thread is responsible for
int
chunkSize
=
(
batch_size
+
gridDim
.
y
-
1
)
/
gridDim
.
y
;
int
nStart
=
blockIdx
.
y
*
chunkSize
;
int
nSpan
=
min
(
batch_size
,
nStart
+
chunkSize
)
-
nStart
;
// Compute the span this thread is responsible for
// For this block
int
b_chunkSize
=
(
batch_size
+
gridDim
.
y
-
1
)
/
gridDim
.
y
;
int
b_nStart
=
blockIdx
.
y
*
b_chunkSize
;
int
b_nSpan
=
min
(
batch_size
,
b_nStart
+
b_chunkSize
)
-
b_nStart
;
// For this thread
int
chunkSize
=
(
b_chunkSize
+
blockDim
.
y
-
1
)
/
blockDim
.
y
;
int
nStart
=
threadIdx
.
y
*
chunkSize
+
b_nStart
;
int
nSpan
=
min
(
b_nStart
+
b_nSpan
,
nStart
+
chunkSize
)
-
nStart
;
volatile
float
*
out
=
intermediate
+
blockIdx
.
y
*
features
;
// Flag to trigger last reduction.
...
...
@@ -399,24 +760,45 @@ __global__ void biasAddRelu_bprop_aligned(
}
}
if
(
gridDim
.
y
==
1
)
{
// we know block size for now
__shared__
float
smem
[
BIAS_RELU_BW_NTHREADS_X
*
BIAS_RELU_BW_NTHREADS_Y
*
ILP
];
// naive block reduction on y-dim
int
linear_idx
=
threadIdx
.
y
*
blockDim
.
x
+
threadIdx
.
x
;
float
*
smem_out
=
smem
+
ILP
*
linear_idx
;
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
){
r_dy
[
ii
]
=
db_local
[
ii
];
// reuse local dy buffer
}
load_store
(
db
,
r_dy
,
f
,
0
);
return
;
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
){
smem_out
[
ii
]
=
db_local
[
ii
];
// reuse local dy buffer
}
__syncthreads
();
if
(
threadIdx
.
y
==
0
)
{
for
(
int
yidx
=
1
;
yidx
<
blockDim
.
y
;
yidx
++
){
float
*
smem_in
=
smem
+
ILP
*
(
yidx
*
blockDim
.
x
+
threadIdx
.
x
);
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
){
db_local
[
ii
]
+=
smem_in
[
ii
];
// reuse local dy buffer
}
}
// Write out partial result
load_store
(
out
,
db_local
,
f
,
0
);
// block result is in db_local now for all threadIdx.y == 0
// TODO: maybe not useful early exit here
if
(
gridDim
.
y
==
1
)
{
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
){
r_dy
[
ii
]
=
db_local
[
ii
];
// reuse local dy buffer
}
load_store
(
db
,
r_dy
,
f
,
0
);
return
;
}
// Write out partial result
load_store
(
out
,
db_local
,
f
,
0
);
}
__threadfence
();
__syncthreads
();
// Increment semaphore and check if this is the last CTA in
//
the grid_y dimension.
if
(
threadIdx
.
x
==
0
)
{
// Increment semaphore and check if this is the last CTA in
the grid_y dimension.
//
Only thread (0,0) calls this
if
(
threadIdx
.
x
==
0
&&
threadIdx
.
y
==
0
)
{
unsigned
int
sum_idx
;
sum_idx
=
atomicAdd
(
&
(
semaphores
[
blockIdx
.
x
]),
1
);
isLastBlock
=
(
sum_idx
==
(
gridDim
.
y
-
1
));
...
...
@@ -428,22 +810,26 @@ __global__ void biasAddRelu_bprop_aligned(
db_local
[
ii
]
=
0.
f
;
}
float
r_db
[
ILP
];
// No block reduction for now, only thread (*,0) do grid reduction
if
(
isLastBlock
)
{
for
(
int
n
=
0
;
n
<
gridDim
.
y
;
n
++
)
{
int
row
,
col
;
row
=
f
;
col
=
n
;
load_store
(
r_db
,
intermediate
,
0
,
col
*
features
/
ILP
+
row
);
if
(
threadIdx
.
y
==
0
){
for
(
int
n
=
0
;
n
<
gridDim
.
y
;
n
++
)
{
int
row
,
col
;
row
=
f
;
col
=
n
;
load_store
(
r_db
,
intermediate
,
0
,
col
*
features
/
ILP
+
row
);
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
){
db_local
[
ii
]
+=
r_db
[
ii
];
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
){
db_local
[
ii
]
+=
r_db
[
ii
];
}
}
}
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
){
r_dy
[
ii
]
=
db_local
[
ii
];
// reuse local dy buffer
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
){
r_dy
[
ii
]
=
db_local
[
ii
];
// reuse local dy buffer
}
load_store
(
db
,
r_dy
,
f
,
0
);
}
load_store
(
db
,
r_dy
,
f
,
0
);
}
}
...
...
@@ -502,10 +888,20 @@ size_t get_reduction_scratch_space(int batch_size, int num_layers, const int* ou
size_t
max_scratch_space
=
0
;
// Loop over all layers to see which one needs the max scratch space
for
(
int
l
=
0
;
l
<
num_layers
;
l
++
)
{
int
tmp
,
num_splits
;
// need to find max(aligned, not_aligned)
int
tmp
,
res0
,
res1
;
int
block_x
=
BIAS_RELU_BW_NTHREADS_X
;
int
block_y
=
BIAS_RELU_RED_PER_THREAD
*
BIAS_RELU_BW_NTHREADS_Y
;
get_biasAddRelu_bprop_grid_size
(
output_features
[
l
],
batch_size
,
block_x
,
block_y
,
&
tmp
,
&
res0
);
block_x
=
ILP
*
BIAS_RELU_BW_NTHREADS_X
;
get_biasAddRelu_bprop_grid_size
(
output_features
[
l
],
BIASADDRELU_BPROP_NUM_THREADS
,
batch_size
,
&
tmp
,
&
num_splits
);
max_scratch_space
=
std
::
max
(
max_scratch_space
,
(
size_t
)(
output_features
[
l
]
*
num_splits
));
output_features
[
l
],
batch_size
,
block_x
,
block_y
,
&
tmp
,
&
res1
);
max_scratch_space
=
std
::
max
(
max_scratch_space
,
(
size_t
)(
output_features
[
l
]
*
res0
));
max_scratch_space
=
std
::
max
(
max_scratch_space
,
(
size_t
)(
output_features
[
l
]
*
res1
));
}
return
max_scratch_space
;
...
...
@@ -581,7 +977,9 @@ int mlp_fp(
int
*
output_features
,
T
**
BPtr
,
T
*
Y
,
T
*
reserved_space
)
{
T
*
reserved_space
,
int
use_bias
,
int
activation
)
{
T
*
weight
,
*
input
,
*
output
,
*
bias
;
T
*
reserved_space_x
,
*
reserved_space_y
;
reserved_space_x
=
NULL
;
...
...
@@ -597,7 +995,9 @@ int mlp_fp(
weight
=
WPtr
[
layer
];
input
=
(
layer
==
0
)
?
X
:
reserved_space_x
;
output
=
(
layer
==
num_layers
-
1
)
?
Y
:
reserved_space_y
;
bias
=
BPtr
[
layer
];
if
(
use_bias
)
{
bias
=
BPtr
[
layer
];
}
int
ifeat
=
(
layer
==
0
)
?
input_features
:
output_features
[
layer
-
1
];
int
ofeat
=
output_features
[
layer
];
...
...
@@ -627,12 +1027,33 @@ int mlp_fp(
return
1
;
}
// Call biasReLU
const
uint
&
input_size
=
ofeat
;
int
num_blocks
=
0
;
int
num_SMs
=
at
::
cuda
::
getCurrentDeviceProperties
()
->
multiProcessorCount
;
cudaOccupancyMaxActiveBlocksPerMultiprocessor
(
&
num_blocks
,
biasAddRelu_fprop
<
T
>
,
BIASADDRELU_FPROP_NUM_THREADS
,
0
);
biasAddRelu_fprop
<<<
num_SMs
*
num_blocks
,
BIASADDRELU_FPROP_NUM_THREADS
,
0
,
stream
>>>
(
output
,
bias
,
batch_size
,
input_size
);
// Call biasReLU
if
(
use_bias
==
1
)
{
if
(
activation
==
0
)
{
// no activation
cudaOccupancyMaxActiveBlocksPerMultiprocessor
(
&
num_blocks
,
biasAdd_fprop
<
T
>
,
BIAS_RELU_FW_NTHREADS
,
0
);
biasAdd_fprop
<<<
num_SMs
*
num_blocks
,
BIAS_RELU_FW_NTHREADS
,
0
,
stream
>>>
(
output
,
bias
,
batch_size
,
input_size
);
}
else
if
(
activation
==
1
)
{
// relu
cudaOccupancyMaxActiveBlocksPerMultiprocessor
(
&
num_blocks
,
biasAddRelu_fprop
<
T
>
,
BIAS_RELU_FW_NTHREADS
,
0
);
biasAddRelu_fprop
<<<
num_SMs
*
num_blocks
,
BIAS_RELU_FW_NTHREADS
,
0
,
stream
>>>
(
output
,
bias
,
batch_size
,
input_size
);
}
else
if
(
activation
==
2
)
{
// sigmoid
cudaOccupancyMaxActiveBlocksPerMultiprocessor
(
&
num_blocks
,
biasAdd_fprop
<
T
>
,
BIAS_RELU_FW_NTHREADS
,
0
);
biasAdd_fprop
<<<
num_SMs
*
num_blocks
,
BIAS_RELU_FW_NTHREADS
,
0
,
stream
>>>
(
output
,
bias
,
batch_size
,
input_size
);
cudaOccupancyMaxActiveBlocksPerMultiprocessor
(
&
num_blocks
,
Sigmoid_fprop
<
T
>
,
BIAS_RELU_FW_NTHREADS
,
0
);
Sigmoid_fprop
<<<
num_SMs
*
num_blocks
,
BIAS_RELU_FW_NTHREADS
,
0
,
stream
>>>
(
output
,
batch_size
,
input_size
);
}
}
else
{
// don't need to do anything in case of no activation and no bias
if
(
activation
==
1
)
{
// relu
cudaOccupancyMaxActiveBlocksPerMultiprocessor
(
&
num_blocks
,
Relu_fprop
<
T
>
,
BIAS_RELU_FW_NTHREADS
,
0
);
Relu_fprop
<<<
num_SMs
*
num_blocks
,
BIAS_RELU_FW_NTHREADS
,
0
,
stream
>>>
(
output
,
batch_size
,
input_size
);
}
else
if
(
activation
==
2
)
{
// sigmoid
cudaOccupancyMaxActiveBlocksPerMultiprocessor
(
&
num_blocks
,
Sigmoid_fprop
<
T
>
,
BIAS_RELU_FW_NTHREADS
,
0
);
Sigmoid_fprop
<<<
num_SMs
*
num_blocks
,
BIAS_RELU_FW_NTHREADS
,
0
,
stream
>>>
(
output
,
batch_size
,
input_size
);
}
}
// Set current output as next layer input
reserved_space_x
=
reserved_space_y
;
...
...
@@ -660,7 +1081,10 @@ int mlp_bp(
T
*
work_space
,
T
*
dX
,
T
**
dwPtr
,
T
**
dbPtr
)
{
T
**
dbPtr
,
bool
requires_grad
,
int
use_bias
,
int
activation
)
{
T
*
weight
;
T
*
dweight
,
*
dx
,
*
dy
,
*
dbias
;
T
*
x
,
*
y
;
...
...
@@ -719,32 +1143,85 @@ int mlp_bp(
     float one = 1.f;
     float zero = 0.f;

-    // Call bias ReLU backprop - first implementation, 1 thread per bias element
-    int threadsPerBlock = BIASADDRELU_BPROP_NUM_THREADS;
-    int grid_x, grid_y;
-    get_biasAddRelu_bprop_grid_size(yfeat, threadsPerBlock, batch_size, &grid_x, &grid_y);
-    dim3 block(threadsPerBlock);
-    cudaMemsetAsync(semaphores, 0, semaphore_size, stream);
-    if(yfeat % (ILP * threadsPerBlock) == 0 &&
-       is_aligned(y) && is_aligned(dy) && is_aligned(dy_gemm) && is_aligned(dbias)) {
-      dim3 grid(grid_x/ILP, grid_y);
-      biasAddRelu_bprop_aligned<T, 4><<<grid, block, 0, stream>>>(
-          y, dy, yfeat, batch_size, dy_gemm, db_scratch, semaphores, dbias);
-    } else {
-      dim3 grid(grid_x, grid_y);
-      biasAddRelu_bprop<T, 4><<<grid, block, 0, stream>>>(
-          y, dy, yfeat, batch_size, dy_gemm, db_scratch, semaphores, dbias);
-    }
+    if (use_bias == 1) {
+      if (activation == 0) { // no acitvation
+        // bgrad
+        dim3 block(BIAS_RELU_BW_NTHREADS_X, BIAS_RELU_BW_NTHREADS_Y);
+        int grid_x, grid_y;
+        cudaMemsetAsync(semaphores, 0, semaphore_size, stream);
+
+        int block_x = BIAS_RELU_BW_NTHREADS_X;
+        int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
+        get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
+        dim3 grid(grid_x, grid_y);
+        biasAdd_bprop<T, 4><<<grid, block, 0, stream>>>(
+            dy, yfeat, batch_size, db_scratch, semaphores, dbias);
+        // bypass dgrad through reset pointer
+        dy_gemm = dy;
+      } else if (activation == 1) { // relu
+        dim3 block(BIAS_RELU_BW_NTHREADS_X, BIAS_RELU_BW_NTHREADS_Y);
+        int grid_x, grid_y;
+        cudaMemsetAsync(semaphores, 0, semaphore_size, stream);
+
+        if (yfeat % (ILP * BIAS_RELU_BW_NTHREADS_X) == 0 &&
+            is_aligned(y) && is_aligned(dy) && is_aligned(dy_gemm) && is_aligned(dbias)) {
+          int block_x = ILP * BIAS_RELU_BW_NTHREADS_X;
+          int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
+          get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
+          dim3 grid(grid_x, grid_y);
+          biasAddRelu_bprop_aligned<T, 4><<<grid, block, 0, stream>>>(
+              y, dy, yfeat, batch_size, dy_gemm, db_scratch, semaphores, dbias);
+        } else {
+          int block_x = BIAS_RELU_BW_NTHREADS_X;
+          int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
+          get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
+          dim3 grid(grid_x, grid_y);
+          biasAddRelu_bprop<T, 4><<<grid, block, 0, stream>>>(
+              y, dy, yfeat, batch_size, dy_gemm, db_scratch, semaphores, dbias);
+        }
+      } else if (activation == 2) { // sigmoid
+        // activation backward
+        int num_blocks = 0;
+        int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
+        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, Sigmoid_bprop<T>, BIAS_RELU_FW_NTHREADS, 0);
+        Sigmoid_bprop<<<num_SMs*num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(
+            dy, y, batch_size, yfeat, dy_gemm);
+
+        // bgrad, from dy_gemm
+        dim3 block(BIAS_RELU_BW_NTHREADS_X, BIAS_RELU_BW_NTHREADS_Y);
+        int grid_x, grid_y;
+        cudaMemsetAsync(semaphores, 0, semaphore_size, stream);
+
+        int block_x = BIAS_RELU_BW_NTHREADS_X;
+        int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
+        get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
+        dim3 grid(grid_x, grid_y);
+        biasAdd_bprop<T, 4><<<grid, block, 0, stream>>>(
+            dy_gemm, yfeat, batch_size, db_scratch, semaphores, dbias);
+      }
+    } else { // no bias below
+      if (activation == 0) {
+        // bypass dgrad through reset pointer
+        dy_gemm = dy;
+      } else if (activation == 1) { // relu
+        int num_blocks = 0;
+        int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
+        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, Relu_bprop<T>, BIAS_RELU_FW_NTHREADS, 0);
+        Relu_bprop<<<num_SMs*num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(
+            dy, y, batch_size, yfeat, dy_gemm);
+      } else if (activation == 2) { // sigmoid
+        int num_blocks = 0;
+        int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
+        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, Sigmoid_bprop<T>, BIAS_RELU_FW_NTHREADS, 0);
+        Sigmoid_bprop<<<num_SMs*num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(
+            dy, y, batch_size, yfeat, dy_gemm);
+      }
+    }

     cublasStatus_t cublas_status;
     // Call GEMM dgrad
-    cublas_status = mlp_gemm(
+    if (layer > 0 || requires_grad == 1) {
+      cublas_status = mlp_gemm(
         handle,
         CUBLAS_OP_N,
         CUBLAS_OP_N,
...
...
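For reference, the quantities the Relu_bprop and Sigmoid_bprop launches above have to write into dy_gemm are the standard elementwise activation gradients (written here from the usual derivation, not copied out of the kernels), with y the saved forward output and dy the incoming gradient:

\[
\text{ReLU:}\quad \frac{\partial L}{\partial z} = dy \cdot \mathbf{1}[\,y > 0\,],
\qquad
\text{Sigmoid:}\quad \frac{\partial L}{\partial z} = dy \cdot y\,(1 - y).
\]

For activation == 0 no kernel is needed, which is why the code simply aliases dy_gemm to dy.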
@@ -760,9 +1237,10 @@ int mlp_bp(
         dx,
         xfeat);
-    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
-      printf("GEMM dgrad failed with %d\n", cublas_status);
-      return 1;
+      if (cublas_status != CUBLAS_STATUS_SUCCESS) {
+        printf("GEMM dgrad failed with %d\n", cublas_status);
+        return 1;
+      }
     }

     // Call GEMM wgrad
...
...
@@ -801,7 +1279,9 @@ template int mlp_fp<float>(
     int* output_features,
     float** BPtr,
     float* Y,
-    float* reserved_space);
+    float* reserved_space,
+    int use_bias,
+    int activation);
 template int mlp_bp<float>(
     float* X,
...
...
@@ -816,7 +1296,10 @@ template int mlp_bp<float>(
     float* work_space,
     float* dX,
     float** dwPtr,
-    float** dbPtr);
+    float** dbPtr,
+    bool requires_grad,
+    int use_bias,
+    int activation);
 template int mlp_fp<at::Half>(
     at::Half* X,
...
...
@@ -827,7 +1310,9 @@ template int mlp_fp<at::Half>(
     int* output_features,
     at::Half** BPtr,
     at::Half* Y,
-    at::Half* reserved_space);
+    at::Half* reserved_space,
+    int use_bias,
+    int activation);
 template int mlp_bp<at::Half>(
     at::Half* X,
...
...
@@ -842,7 +1327,10 @@ template int mlp_bp<at::Half>(
     at::Half* work_space,
     at::Half* dX,
     at::Half** dwPtr,
-    at::Half** dbPtr);
+    at::Half** dbPtr,
+    bool requires_grad,
+    int use_bias,
+    int activation);
 template int mlp_fp<double>(
     double* X,
...
...
@@ -853,7 +1341,9 @@ template int mlp_fp<double>(
     int* output_features,
     double** BPtr,
     double* Y,
-    double* reserved_space);
+    double* reserved_space,
+    int use_bias,
+    int activation);
 template int mlp_bp<double>(
     double* X,
...
...
@@ -868,7 +1358,10 @@ template int mlp_bp<double>(
     double* work_space,
     double* dX,
     double** dwPtr,
-    double** dbPtr);
+    double** dbPtr,
+    bool requires_grad,
+    int use_bias,
+    int activation);
 template size_t get_mlp_bp_workspace_in_bytes<float>(
     int batch_size,
...
...
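These instantiation hunks exist because the mlp_fp/mlp_bp template definitions live in this .cu translation unit, so every type the C++ binding links against must be instantiated explicitly, and each new parameter has to be added to every instantiation. A tiny illustration of that pattern with hypothetical names (not apex code):

// square_sum.h -- declaration visible to other translation units
template <typename T> T square_sum(const T* v, int n);

// square_sum.cu -- definition lives here...
template <typename T>
T square_sum(const T* v, int n) {
  T s = T(0);
  for (int i = 0; i < n; ++i) s += v[i] * v[i];
  return s;
}
// ...so each type other files may link against is instantiated explicitly:
template float  square_sum<float>(const float*, int);
template double square_sum<double>(const double*, int);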
csrc/multi_tensor_axpby_kernel.cu  View file @ 6e14df49
...
@@ -13,6 +13,17 @@
 #define BLOCK_SIZE 512
 #define ILP 4

 template<typename T>
 __device__ __forceinline__ bool is_aligned(T* p){
   return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
 }

 template<typename T>
 __device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
   typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
   ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
 }

 template<typename x_t, typename y_t, typename out_t>
 struct AxpbyFunctor
 {
...
@@ -43,46 +54,74 @@ struct AxpbyFunctor
     n -= chunk_idx*chunk_size;

     bool finite = true;
     x_t r_x[ILP];
     y_t r_y[ILP];
     out_t r_out[ILP];

     // to make things simple, we put aligned case in a different code path
     if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x) && is_aligned(y) && is_aligned(out))
     {
       for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
       {
         // load
         load_store(r_x, x, 0, i_start);
         load_store(r_y, y, 0, i_start);
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
           r_out[ii] = a*static_cast<float>(r_x[ii]) + b*static_cast<float>(r_y[ii]);
           if(arg_to_check == -1)
             finite = finite && (isfinite(r_x[ii]) && isfinite(r_y[ii]));
           if(arg_to_check == 0)
             finite = finite && isfinite(r_x[ii]);
           if(arg_to_check == 1)
             finite = finite && isfinite(r_y[ii]);
         }
         // store
         load_store(out, r_out, i_start, 0);
       }
     }
     else
     {
       // Non-divergent exit condition for __syncthreads, not necessary here
       for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
       {
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
           r_x[ii] = 0;
           r_y[ii] = 0;
           int i = i_start + threadIdx.x + ii*blockDim.x;
           if(i < n && i < chunk_size)
           {
             r_x[ii] = x[i];
             r_y[ii] = y[i];
           }
         }

         // see note in multi_tensor_scale_kernel.cu
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
           r_out[ii] = a*static_cast<float>(r_x[ii]) + b*static_cast<float>(r_y[ii]);
           if(arg_to_check == -1)
             finite = finite && (isfinite(r_x[ii]) && isfinite(r_y[ii]));
           if(arg_to_check == 0)
             finite = finite && isfinite(r_x[ii]);
           if(arg_to_check == 1)
             finite = finite && isfinite(r_y[ii]);
         }
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
           int i = i_start + threadIdx.x + ii*blockDim.x;
           if(i < n && i < chunk_size)
             out[i] = r_out[ii];
         }
       }
     }
     if(!finite)
       *noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
   }
 };
...
...
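The same two device helpers are added to each multi_tensor kernel in this commit: is_aligned checks that a pointer sits on an ILP-element boundary, and load_store reinterprets ILP consecutive elements as one std::aligned_storage value so the compiler can emit a single wide access (128 bits for float at ILP=4). A hedged, self-contained sketch of how they are meant to be combined, using a hypothetical vector_copy kernel rather than the functors above:

#include <cuda_runtime.h>
#include <type_traits>
#include <cstdint>

#define ILP 4

template <typename T>
__device__ __forceinline__ bool is_aligned(T* p) {
  return ((uint64_t)p) % (ILP * sizeof(T)) == 0;
}

template <typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset) {
  typedef typename std::aligned_storage<ILP * sizeof(T), ILP * alignof(T)>::type LT;
  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];  // one ILP-wide transaction
}

// Hypothetical elementwise copy showing the aligned fast path plus scalar fallback.
__global__ void vector_copy(float* dst, float* src, int n) {
  alignas(ILP * sizeof(float)) float buf[ILP];
  if (n % ILP == 0 && is_aligned(dst) && is_aligned(src)) {
    // vectorized path: each iteration moves ILP elements with two wide accesses
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i * ILP < n; i += blockDim.x * gridDim.x) {
      load_store(buf, src, 0, i);   // wide load into registers
      load_store(dst, buf, i, 0);   // wide store back out
    }
  } else {
    // scalar fallback for misaligned pointers or ragged sizes
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x)
      dst[i] = src[i];
  }
}

The scalar else-branch is what keeps the kernels correct for tails and misaligned tensors; the aligned path is purely a bandwidth optimization.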
csrc/multi_tensor_l2norm_kernel.cu  View file @ 6e14df49
...
@@ -13,6 +13,17 @@
 #define BLOCK_SIZE 512
 #define ILP 4

 template<typename T>
 __device__ __forceinline__ bool is_aligned(T* p){
   return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
 }

 template<typename T>
 __device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
   typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
   ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
 }

 template<typename x_t>
 struct L2NormFunctor
 {
...
@@ -41,22 +52,44 @@ struct L2NormFunctor
     __shared__ float s_vals[512];

     float vals[ILP]; // = {0}; // this probably works too but I want to be sure...
     x_t r_x[ILP];
     for(int i = 0; i < ILP; i++)
     {
       vals[i] = 0.f;
       r_x[i] = 0;
     }

     // to make things simple, we put aligned case in a different code path
     if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x))
     {
       for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
       {
         // load
         load_store(r_x, x, 0, i_start);
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
           float next = static_cast<float>(r_x[ii]);
           vals[ii] += next*next;
         }
       }
     }
     else
     {
       for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
       {
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
           int i = i_start + threadIdx.x + ii*blockDim.x;
           if(i < n && i < chunk_size)
           {
             float next = static_cast<float>(x[i]);
             vals[ii] += next*next;
           }
         }
       }
     }

     float val = 0.f;
     for(int i = 0; i < ILP; i++)
...
@@ -104,22 +137,44 @@ struct MaxNormFunctor
     __shared__ float s_vals[512];

     float vals[ILP]; // = {0}; // this probably works too but I want to be sure...
     x_t r_x[ILP];
     for(int i = 0; i < ILP; i++)
     {
       vals[i] = 0.f;
       r_x[i] = 0;
     }

     // to make things simple, we put aligned case in a different code path
     if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x))
     {
       for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
       {
         // load
         load_store(r_x, x, 0, i_start);
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
           float next = static_cast<float>(r_x[ii]);
           vals[ii] = fmaxf(fabsf(vals[ii]), fabsf(next));
         }
       }
     }
     else
     {
       for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
       {
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
           int i = i_start + threadIdx.x + ii*blockDim.x;
           if(i < n && i < chunk_size)
           {
             float next = static_cast<float>(x[i]);
             vals[ii] = fmaxf(fabsf(vals[ii]), fabsf(next));
           }
         }
       }
     }

     float val = 0.f;
     for(int i = 0; i < ILP; i++)
...
...
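After the per-thread accumulation shown above, each block still has to combine its vals across threads through the shared s_vals buffer before anything is written out. A minimal sketch of that kind of block reduction for a sum of squares, in the generic textbook form (assuming a power-of-two blockDim.x of at most 512; this is not the exact reduction helper the functor hands its partial sums to):

__global__ void sum_of_squares(const float* x, int n, float* result) {
  __shared__ float s_vals[512];
  // per-thread partial sum over a grid-stride range
  float val = 0.f;
  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x)
    val += x[i] * x[i];
  s_vals[threadIdx.x] = val;
  __syncthreads();
  // shared-memory tree reduction within the block
  for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
    if (threadIdx.x < stride)
      s_vals[threadIdx.x] += s_vals[threadIdx.x + stride];
    __syncthreads();
  }
  if (threadIdx.x == 0)
    atomicAdd(result, s_vals[0]);  // the final sqrt is taken outside the kernel
}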
csrc/multi_tensor_lamb.cu  View file @ 6e14df49
...
@@ -13,6 +13,17 @@
 #define BLOCK_SIZE 512
 #define ILP 4

 template<typename T>
 __device__ __forceinline__ bool is_aligned(T* p){
   return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
 }

 template<typename T>
 __device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
   typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
   ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
 }

 typedef enum{
   MOMENT_MODE_0   =0, // L2 regularization mode
   MOMENT_MODE_1   =1  // Decoupled weight decay mode
...
@@ -68,71 +79,149 @@ struct LAMBStage1Functor
     n -= chunk_idx*chunk_size;

     MATH_T r_g[ILP];
     MATH_T r_p[ILP];
     MATH_T r_m[ILP];
     MATH_T r_v[ILP];
     // to make things simple, we put aligned case in a different code path
     if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(g) && is_aligned(p) && is_aligned(m) && is_aligned(v))
     {
       T l_g[ILP];
       T l_p[ILP];
       T l_m[ILP];
       T l_v[ILP];
       for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
       {
         // load
         load_store(l_g, g, 0, i_start);
         if (decay != 0)
           load_store(l_p, p, 0, i_start);
         load_store(l_m, m, 0, i_start);
         load_store(l_v, v, 0, i_start);
         // unpack
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
           r_g[ii] = l_g[ii];
           // special ?optimization? for lamb stage 1
           if (decay == 0) {
             r_p[ii] = MATH_T(0);
           }
           else {
             r_p[ii] = l_p[ii];
           }
           r_m[ii] = l_m[ii];
           r_v[ii] = l_v[ii];
         }
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
           if (mode == MOMENT_MODE_0) {
             MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
             // L2 on scaled grad
             scaled_grad = scaled_grad + decay*r_p[ii];
             r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
             r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
             MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
             MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
             MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
             r_p[ii] = next_m_unbiased / denom;
           }
           else {
             MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
             r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
             r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
             MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
             MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
             MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
             r_p[ii] = (next_m_unbiased/denom) + (decay*r_p[ii]);
           }
         }
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
           l_p[ii] = r_p[ii];
           l_m[ii] = r_m[ii];
           l_v[ii] = r_v[ii];
         }
         // store
         load_store(g, l_p, i_start, 0);
         load_store(m, l_m, i_start, 0);
         load_store(v, l_v, i_start, 0);
       }
     }
     else
     {
       // see note in multi_tensor_scale_kernel.cu
       for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
       {
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
           int i = i_start + threadIdx.x + ii*blockDim.x;
           if(i < n && i < chunk_size)
           {
             r_g[ii] = g[i];
             // special ?optimization? for lamb stage 1
             if (decay == 0) {
               r_p[ii] = MATH_T(0);
             }
             else {
               r_p[ii] = p[i];
             }
             r_m[ii] = m[i];
             r_v[ii] = v[i];
           } else {
             r_g[ii] = MATH_T(0);
             r_p[ii] = MATH_T(0);
             r_m[ii] = MATH_T(0);
             r_v[ii] = MATH_T(0);
           }
         }
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
           if (mode == MOMENT_MODE_0) {
             MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
             // L2 on scaled grad
             scaled_grad = scaled_grad + decay*r_p[ii];
             r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
             r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
             MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
             MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
             MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
             r_p[ii] = next_m_unbiased / denom;
           }
           else {
             MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
             r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
             r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
             MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
             MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
             MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
             r_p[ii] = (next_m_unbiased/denom) + (decay*r_p[ii]);
           }
         }
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
           int i = i_start + threadIdx.x + ii*blockDim.x;
           if(i < n && i < chunk_size)
           {
             g[i] = r_p[ii];
             m[i] = r_m[ii];
             v[i] = r_v[ii];
           }
         }
       }
     }
   }
...
@@ -173,34 +262,58 @@ struct LAMBStage2Functor
     n -= chunk_idx*chunk_size;

     // to make things simple, we put aligned case in a different code path
     if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(p) && is_aligned(update))
     {
       T r_p[ILP];
       T r_update[ILP];
       for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
       {
         // load
         load_store(r_p, p, 0, i_start);
         load_store(r_update, update, 0, i_start);
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
           r_p[ii] = static_cast<MATH_T>(r_p[ii]) - (ratio * static_cast<MATH_T>(r_update[ii]));
         }
         load_store(p, r_p, i_start, 0);
       }
     }
     else
     {
       for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
       {
         MATH_T r_p[ILP];
         MATH_T r_update[ILP];
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
           int i = i_start + threadIdx.x + ii*blockDim.x;
           if(i < n && i < chunk_size)
           {
             r_p[ii] = p[i];
             r_update[ii] = update[i];
           }
         }
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
           r_p[ii] = r_p[ii] - (ratio * r_update[ii]);
         }
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
           int i = i_start + threadIdx.x + ii*blockDim.x;
           if(i < n && i < chunk_size)
           {
             p[i] = r_p[ii];
           }
         }
       }
     }
...
...
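To keep the two functors straight: stage 1 turns the gradient into an Adam-style update direction written back into g, and stage 2 applies the trust-ratio-scaled step. Per element, following the code above (with c the clipped global gradient norm, λ = decay, and β₃ normally 1 − β₁), the L2-regularization mode computes:

\[
\hat g = \frac{g}{c} + \lambda\,p, \qquad
m \leftarrow \beta_1 m + \beta_3 \hat g, \qquad
v \leftarrow \beta_2 v + (1-\beta_2)\,\hat g^{\,2},
\]
\[
u = \frac{m / \texttt{beta1\_correction}}{\sqrt{v / \texttt{beta2\_correction}} + \epsilon},
\qquad
p \leftarrow p - r\,u .
\]

In the decoupled-weight-decay mode the λp term is added after the m/√v quotient instead of into the gradient, and the trust ratio r itself is computed outside these kernels from the parameter and update norms.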
csrc/multi_tensor_scale_kernel.cu  View file @ 6e14df49
...
@@ -15,6 +15,17 @@
 #define BLOCK_SIZE 512
 #define ILP 4

 template<typename T>
 __device__ __forceinline__ bool is_aligned(T* p){
   return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
 }

 template<typename T>
 __device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
   typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
   ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
 }

 template<typename in_t, typename out_t>
 struct ScaleFunctor
 {
...
@@ -34,44 +45,68 @@ struct ScaleFunctor
     in_t* in = (in_t*)tl.addresses[0][tensor_loc];
     in += chunk_idx*chunk_size;

     out_t* out = (out_t*)tl.addresses[1][tensor_loc];
     out += chunk_idx*chunk_size;

     n -= chunk_idx*chunk_size;

     bool finite = true;
     in_t r_in[ILP];
     out_t r_out[ILP];

     // to make things simple, we put aligned case in a different code path
     if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(in) && is_aligned(out))
     {
       for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
       {
         // load
         load_store(r_in, in, 0, i_start);
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
           r_out[ii] = static_cast<float>(r_in[ii]) * scale;
           finite = finite && isfinite(r_in[ii]);
         }
         // store
         load_store(out, r_out, i_start, 0);
       }
     }
     else
     {
       // Non-divergent exit condition for __syncthreads, not necessary here
       for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
       {
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
           r_in[ii] = 0;
           int i = i_start + threadIdx.x + ii*blockDim.x;
           if(i < n && i < chunk_size)
             r_in[ii] = in[i];
         }
         // note for clarification to future michael:
         // From a pure memory dependency perspective, there's likely no point unrolling
         // the write loop, since writes just fire off once their LDGs arrive.
         // Put another way, the STGs are dependent on the LDGs, but not on each other.
         // There is still compute ILP benefit from unrolling the loop though.
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
           r_out[ii] = static_cast<float>(r_in[ii]) * scale;
           finite = finite && isfinite(r_in[ii]);
         }
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
           int i = i_start + threadIdx.x + ii*blockDim.x;
           if(i < n && i < chunk_size)
             out[i] = r_out[ii];
         }
       }
     }
     if(!finite)
       *noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
   }
 };
...
...
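The finite/noop_gmem handling above is the mechanism dynamic loss scaling relies on: scale every element, and if any input is Inf/NaN, set a single global skip flag with a racy but benign write (every writer stores the same value, and the host checks the flag before applying the optimizer step). A stripped-down sketch of that contract with a hypothetical standalone kernel, leaving out the multi_tensor_apply chunking the real functor goes through:

__global__ void scale_with_overflow_check(const float* in, float* out, float scale,
                                          int n, int* noop_flag) {
  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x) {
    float v = in[i];
    out[i] = v * scale;
    if (!isfinite(v))
      *noop_flag = 1;   // benign race: all writers agree on the value
  }
}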
setup.py  View file @ 6e14df49
...
@@ -177,7 +177,13 @@ if "--cuda_ext" in sys.argv:
                                              '-O3',
                                              '--use_fast_math'] + version_dependent_macros}))
-    else:
-        print("INFO: Skipping FusedLayerNorm extension.")
+    print("INFO: Building FusedLayerNorm extension.")
+    ext_modules.append(
+        CUDAExtension(name='fused_layer_norm_cuda',
+                      sources=['csrc/layer_norm_cuda.cpp',
+                               'csrc/hip/layer_norm_hip_kernel.hip'],
+                      extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
+                                          'nvcc': []}))
     if not is_rocm_pytorch:
         ext_modules.append(
...
...
tests/L0/run_mlp/test_mlp.py  View file @ 6e14df49
...
@@ -51,6 +51,116 @@ class TestMLP(unittest.TestCase):
            ref_mlp[0].bias.grad.detach().cpu().numpy(),
            atol=1e-7, rtol=1e-5)

    def test_no_bias(self):
        for use_activation in ['none', 'relu', 'sigmoid']:
            mlp = MLP(mlp_sizes, bias=False, activation=use_activation).cuda()

            mlp_layers = []
            for i in range(mlp.num_layers):
                linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1], bias=False)
                mlp.weights[i].data.copy_(linear.weight)
                mlp_layers.append(linear)
                if use_activation == 'relu':
                    mlp_layers.append(nn.ReLU(inplace=True))
                if use_activation == 'sigmoid':
                    mlp_layers.append(nn.Sigmoid())

            ref_mlp = nn.Sequential(*mlp_layers).cuda()

            test_input = torch.empty(
                batch_size, mlp_sizes[0], device="cuda").uniform_(-1., 1.).requires_grad_()
            ref_input = test_input.clone().detach().requires_grad_()
            mlp_out = mlp(test_input)
            ref_out = ref_mlp(ref_input)
            np.testing.assert_allclose(
                mlp_out.detach().cpu().numpy(),
                ref_out.detach().cpu().numpy(),
                atol=1e-7, rtol=1e-5)

            # Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
            mlp_out.mean().mul(10.).backward()
            ref_out.mean().mul(10.).backward()
            np.testing.assert_allclose(
                test_input.grad.detach().cpu().numpy(),
                ref_input.grad.detach().cpu().numpy(),
                atol=0, rtol=100)
            np.testing.assert_allclose(
                mlp.weights[0].grad.detach().cpu().numpy(),
                ref_mlp[0].weight.grad.detach().cpu().numpy(),
                atol=1e-7, rtol=100)

    def test_with_bias(self):
        for use_activation in ['none', 'relu', 'sigmoid']:
            mlp = MLP(mlp_sizes, bias=True, activation=use_activation).cuda()

            mlp_layers = []
            for i in range(mlp.num_layers):
                linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1], bias=True)
                mlp.weights[i].data.copy_(linear.weight)
                mlp.biases[i].data.copy_(linear.bias)
                mlp_layers.append(linear)
                if use_activation == 'relu':
                    mlp_layers.append(nn.ReLU(inplace=True))
                if use_activation == 'sigmoid':
                    mlp_layers.append(nn.Sigmoid())

            ref_mlp = nn.Sequential(*mlp_layers).cuda()

            test_input = torch.empty(
                batch_size, mlp_sizes[0], device="cuda").uniform_(-1., 1.).requires_grad_()
            ref_input = test_input.clone().detach().requires_grad_()
            mlp_out = mlp(test_input)
            ref_out = ref_mlp(ref_input)
            np.testing.assert_allclose(
                mlp_out.detach().cpu().numpy(),
                ref_out.detach().cpu().numpy(),
                atol=1e-7, rtol=1e-5)

            # Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
            mlp_out.mean().mul(10.).backward()
            ref_out.mean().mul(10.).backward()
            np.testing.assert_allclose(
                test_input.grad.detach().cpu().numpy(),
                ref_input.grad.detach().cpu().numpy(),
                atol=0, rtol=1)
            np.testing.assert_allclose(
                mlp.weights[0].grad.detach().cpu().numpy(),
                ref_mlp[0].weight.grad.detach().cpu().numpy(),
                atol=1e-7, rtol=1)
            np.testing.assert_allclose(
                mlp.biases[0].grad.detach().cpu().numpy(),
                ref_mlp[0].bias.grad.detach().cpu().numpy(),
                atol=1e-7, rtol=1e-5)

    def test_no_grad(self):
        mlp = MLP(mlp_sizes).cuda()

        mlp_layers = []
        for i in range(mlp.num_layers):
            linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1])
            mlp.weights[i].data.copy_(linear.weight)
            mlp.biases[i].data.copy_(linear.bias)
            mlp_layers.append(linear)
            mlp_layers.append(nn.ReLU(inplace=True))

        ref_mlp = nn.Sequential(*mlp_layers).cuda()

        test_input = torch.empty(
            batch_size, mlp_sizes[0], device="cuda").uniform_(-1., 1.)
        ref_input = test_input.clone().detach()
        mlp_out = mlp(test_input)
        ref_out = ref_mlp(ref_input)
        np.testing.assert_allclose(
            mlp_out.detach().cpu().numpy(),
            ref_out.detach().cpu().numpy(),
            atol=1e-7, rtol=1e-5)

        # Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
        mlp_out.mean().mul(10.).backward()
        ref_out.mean().mul(10.).backward()
        np.testing.assert_allclose(
            mlp.weights[0].grad.detach().cpu().numpy(),
            ref_mlp[0].weight.grad.detach().cpu().numpy(),
            atol=1e-7, rtol=1e-5)

    def test_performance_half(self):
        mlp = MLP(mlp_sizes).cuda().half()
...
...