OpenDAS / apex / Commits

Commit ac4ef2d6, authored May 01, 2020 by Kexin Yu

    Merge branch 'master' of https://github.com/NVIDIA/apex

Parents: 85e4af76, cf50dc7c

Changes: 11 changed files with 1409 additions and 368 deletions (+1409 -368)
apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu    +82   -31
apex/contrib/csrc/xentropy/xentropy_kernel.cu             +178  -67
apex/contrib/multihead_attn/self_multihead_attn_func.py   +1    -1
apex/mlp/mlp.py                                           +28   -19
csrc/mlp.cpp                                              +41   -16
csrc/mlp_cuda.cu                                          +593  -100
csrc/multi_tensor_axpby_kernel.cu                         +66   -27
csrc/multi_tensor_l2norm_kernel.cu                        +67   -12
csrc/multi_tensor_lamb.cu                                 +183  -70
csrc/multi_tensor_scale_kernel.cu                         +60   -25
tests/L0/run_mlp/test_mlp.py                              +110  -0
apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu

@@ -14,6 +14,17 @@
#define BLOCK_SIZE 512
#define ILP 4

template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
  return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}

template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}

#include "type_shim.h"

typedef enum{
...
@@ -99,24 +110,64 @@ struct AdamFunctor
        T incoming_v[ILP];
        T incoming_g[ILP];

        // to make things simple, we put aligned case in a different code path
        if(n % ILP == 0 &&
           chunk_size % ILP == 0 &&
           is_aligned(p) &&
           is_aligned(m) &&
           is_aligned(v) &&
           is_aligned(g) &&
           is_aligned(p_copy))
        {
          for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
          {
            // load
            GRAD_T tmp_g[ILP];
            load_store(incoming_p, p, 0, i_start);
            load_store(incoming_m, m, 0, i_start);
            load_store(incoming_v, v, 0, i_start);
            load_store(tmp_g, g, 0, i_start);
#pragma unroll
            for(int ii = 0; ii < ILP; ii++) {
              incoming_g[ii] = static_cast<T>(tmp_g[ii]);
              T scaled_grad = incoming_g[ii]/grad_scale;
              incoming_m[ii] = b1*incoming_m[ii] + (1-b1)*scaled_grad;
              incoming_v[ii] = b2*incoming_v[ii] + (1-b2)*scaled_grad*scaled_grad;
              float denom;
              if (mode == ADAM_MODE_0)
                denom = sqrtf(incoming_v[ii] + eps);
              else // Mode 1
                denom = sqrtf(incoming_v[ii]) + eps;
              float update = (incoming_m[ii]/denom) + (decay*incoming_p[ii]);
              incoming_p[ii] = incoming_p[ii] - (step_size*update);
              if (DEPTH == 5)  tmp_g[ii] = static_cast<GRAD_T>(incoming_p[ii]);
            }
            load_store(p, incoming_p, i_start, 0);
            load_store(m, incoming_m, i_start, 0);
            load_store(v, incoming_v, i_start, 0);
            if (DEPTH == 5) load_store(p_copy, tmp_g, i_start, 0);
          }
        }
        else
        {
          for(int i_start = 0;
              i_start < n && i_start < chunk_size;
              i_start += blockDim.x*ILP)
          {
#pragma unroll
            for(int ii = 0; ii < ILP; ii++) {
              incoming_p[ii] = 0;
              incoming_m[ii] = 0;
              incoming_v[ii] = 0;
              incoming_g[ii] = 0;

              int i = i_start + threadIdx.x + ii*blockDim.x;
              if (i < n && i < chunk_size) {
                incoming_p[ii] = p[i];
                incoming_m[ii] = m[i];
                incoming_v[ii] = v[i];
                incoming_g[ii] = static_cast<T>(g[i]);
              }
            }

            // note for clarification to future michael:
...
@@ -124,24 +175,25 @@ struct AdamFunctor
            // the write loop, since writes just fire off once their LDGs arrive.
            // Put another way, the STGs are dependent on the LDGs, but not on each other.
            // There is still compute ILP benefit from unrolling the loop though.
#pragma unroll
            for(int ii = 0; ii < ILP; ii++) {
              int j = i_start + threadIdx.x + ii*blockDim.x;
              if (j < n && j < chunk_size) {
                T scaled_grad = incoming_g[ii]/grad_scale;
                m[j] = b1*incoming_m[ii] + (1-b1)*scaled_grad;
                v[j] = b2*incoming_v[ii] + (1-b2)*scaled_grad*scaled_grad;
                float denom;
                if (mode == ADAM_MODE_0)
                  denom = sqrtf(v[j] + eps);
                else // Mode 1
                  denom = sqrtf(v[j]) + eps;
                float update = (m[j]/denom) + (decay*incoming_p[ii]);
                p[j] = incoming_p[ii] - (step_size*update);
                if (DEPTH == 5)  p_copy[j] = (GRAD_T) p[j];
              }
            }
          }
        }
    }
};
...
@@ -332,4 +384,3 @@ void fused_adam_cuda_mt(
  }
  THCudaCheck(cudaGetLastError());
}
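The is_aligned/load_store pair added at the top of this kernel (and repeated in the multi_tensor_* kernels below) is the heart of the change: when every pointer is aligned to ILP*sizeof(T) bytes and the element counts divide evenly by ILP, a group of ILP elements is moved with a single vectorized transaction instead of ILP scalar accesses, with the old element-wise loop kept as a fallback path. The following sketch is not part of the commit; it shows the same two-path pattern on a hypothetical element-wise copy kernel, and the kernel name, grid-stride loop, and launch configuration are illustrative assumptions only.

#include <cstdint>
#include <type_traits>

#define ILP 4

// Same helpers as in the diff above.
template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
  return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}

template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}

// Hypothetical copy kernel illustrating the aligned fast path and the scalar fallback.
template<typename T>
__global__ void copy_kernel(T* dst, T* src, int n)
{
  if(n % ILP == 0 && is_aligned(dst) && is_aligned(src))
  {
    // Aligned path: each iteration moves ILP elements with one wide transaction.
    T r[ILP];
    for(int i = threadIdx.x + blockIdx.x*blockDim.x; i*ILP < n; i += blockDim.x*gridDim.x)
    {
      load_store(r, src, 0, i);   // vector load of src[i*ILP .. i*ILP+ILP-1]
      load_store(dst, r, i, 0);   // vector store to dst[i*ILP .. i*ILP+ILP-1]
    }
  }
  else
  {
    // Fallback path: plain element-wise accesses with a bounds check.
    for(int i = threadIdx.x + blockIdx.x*blockDim.x; i < n; i += blockDim.x*gridDim.x)
      dst[i] = src[i];
  }
}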
apex/contrib/csrc/xentropy/xentropy_kernel.cu

/**
 * From PyTorch:
 *
 * Copyright (c) 2016-     Facebook, Inc            (Adam Paszke)
 * Copyright (c) 2014-     Facebook, Inc            (Soumith Chintala)
 * Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
...
@@ -10,54 +10,54 @@
 * Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
 * Copyright (c) 2006      Idiap Research Institute (Samy Bengio)
 * Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
 *
 * From Caffe2:
 *
 * Copyright (c) 2016-present, Facebook Inc. All rights reserved.
 *
 * All contributions by Facebook:
 * Copyright (c) 2016 Facebook Inc.
 *
 * All contributions by Google:
 * Copyright (c) 2015 Google Inc.
 * All rights reserved.
 *
 * All contributions by Yangqing Jia:
 * Copyright (c) 2015 Yangqing Jia
 * All rights reserved.
 *
 * All contributions from Caffe:
 * Copyright(c) 2013, 2014, 2015, the respective contributors
 * All rights reserved.
 *
 * All other contributions:
 * Copyright(c) 2015, 2016 the respective contributors
 * All rights reserved.
 *
 * Caffe2 uses a copyright model similar to Caffe: each contributor holds
 * copyright over their contributions to Caffe2. The project versioning records
 * all such contribution and copyright details. If a contributor wants to further
 * mark their specific copyright on a particular contribution, they should
 * indicate their copyright solely in the commit message of the change when it is
 * committed.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
 *    and IDIAP Research Institute nor the names of its contributors may be
 *    used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
...
@@ -70,7 +70,6 @@
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
...
@@ -84,6 +83,8 @@
#include "type_shim.h"
#include "compat.h"

#define ALIGN_BYTES 16

using Tensor = at::Tensor;
using TensorList = at::TensorList;
using ScalarType = at::ScalarType;
...
@@ -123,7 +124,7 @@ const int max_threads = 1024;
inline dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) {
  uint64_t block_size = 1;
  uint64_t max_block_size = std::min(dim_size / ILP, static_cast<uint64_t>(max_threads));
  while (block_size < (max_block_size/2)) block_size *= 2;
  // Launch at least a single warp - the kernel assumes that.
  block_size = std::max(block_size, static_cast<uint64_t>(32));
  return dim3(block_size);
...
@@ -287,29 +288,40 @@ blockReduce(AccumT* smem,
template <template<typename, typename> class Reduction, int ILP, typename T, typename AccumT>
__device__ __forceinline__ AccumT
ilpReduce(int shift,
          T* data,
          int size,
          const Reduction<T, AccumT>& r,
          AccumT defaultVal)
{
  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LoadT;

  AccumT threadVal = defaultVal;
  int offset = threadIdx.x;

  // shift and do 1
  if(shift > 0){
    data -= shift;
    size += shift;
    if(threadIdx.x >= shift){
      threadVal = r(threadVal, data[offset]);
    }
    size -= blockDim.x;
    data += blockDim.x;
  }

  int last = size % (ILP * blockDim.x);

  T v[ILP];
  LoadT* value = reinterpret_cast<LoadT*>(&v);

  // Body (unroll by ILP times)
  for (; offset * ILP < (size - last); offset += blockDim.x) {
    *value = reinterpret_cast<LoadT*>(data)[offset];

    for (int j = 0; j < ILP; ++j) {
      threadVal = r(threadVal, v[j]);
    }
  }

  offset = size - last + threadIdx.x;
  // Epilogue
  for (; offset < size; offset += blockDim.x)
    threadVal = r(threadVal, data[offset]);
...
@@ -319,7 +331,8 @@ ilpReduce(T* data,
template <template<typename, typename> class Reduction1,
          template<typename, typename> class Reduction2,
          int ILP, typename T, typename AccumT>
__device__ __forceinline__ void
ilpReduce(int shift,
          T* data,
          int size,
          AccumT* reducVal1,
          const Reduction1<T, AccumT>& r1,
...
@@ -328,27 +341,38 @@ ilpReduce(T* data,
          const Reduction2<T, AccumT>& r2,
          AccumT defaultVal2)
{
  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LoadT;

  AccumT threadVal1 = defaultVal1;
  AccumT threadVal2 = defaultVal2;
  int offset = threadIdx.x;

  // shift and do 1
  if(shift > 0){
    data -= shift;
    size += shift;
    if(threadIdx.x >= shift){
      threadVal1 = r1(threadVal1, data[offset]);
      threadVal2 = r2(threadVal2, data[offset]);
    }
    size -= blockDim.x;
    data += blockDim.x;
  }

  int last = size % (ILP * blockDim.x);

  T v[ILP];
  LoadT* value = reinterpret_cast<LoadT*>(&v);

  // Body (unroll by ILP times)
  for (; offset * ILP < (size - last); offset += blockDim.x) {
    *value = reinterpret_cast<LoadT*>(data)[offset];

    for (int j = 0; j < ILP; ++j) {
      threadVal1 = r1(threadVal1, v[j]);
      threadVal2 = r2(threadVal2, v[j]);
    }
  }

  offset = size - last + threadIdx.x;
  // Epilogue
  for (; offset < size; offset += blockDim.x) {
    threadVal1 = r1(threadVal1, data[offset]);
...
@@ -375,17 +399,19 @@ cunn_SoftMaxXEntropyForward(
  // each block handles a sample in the mini-batch
  input += blockIdx.x * classes;
  //output += blockIdx.x * classes;

  const int shift = ((uint64_t)input) % ALIGN_BYTES / sizeof(scalar_t);

  int64_t label = labels[blockIdx.x];

  // find the max and sum
  accscalar_t threadMax, threadSum, max_k, sum_k;
  ilpReduce<MaxFloat, AddFloat, ILP, scalar_t, accscalar_t>(
      shift, input, classes,
      &threadMax, MaxFloat<scalar_t, accscalar_t>(),
      -at::numeric_limits<accscalar_t>::max(),
      &threadSum, AddFloat<scalar_t, accscalar_t>(),
      static_cast<accscalar_t>(0));

  blockReduce<Max, Add, accscalar_t>(
      sdata, &max_k, threadMax, Max<accscalar_t>(),
...
@@ -393,9 +419,7 @@ cunn_SoftMaxXEntropyForward(
      &sum_k, threadSum, Add<accscalar_t>(),
      static_cast<accscalar_t>(0));

  accscalar_t threadExp = ilpReduce<SumExpFloat, ILP, scalar_t, accscalar_t>(
      shift, input, classes, SumExpFloat<scalar_t, accscalar_t>(max_k),
      static_cast<accscalar_t>(0));
  accscalar_t sumAll = blockReduce<Add, accscalar_t>(
      sdata, threadExp, Add<accscalar_t>(), static_cast<accscalar_t>(0));
...
@@ -411,20 +435,16 @@ cunn_SoftMaxXEntropyForward(
  }
}

template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t>
__device__ __forceinline__ void
apply(scalar_t *gradInput,
      scalar_t *logits,
      outscalar_t *max_log_sum_exp,
      outscalar_t *gradOutput,
      int64_t *labels,
      const float smoothing,
      int classes)
{
  accscalar_t smooth_positives = 1.0 - smoothing;
  accscalar_t smooth_negatives = smoothing / classes;
  accscalar_t tmpGradOutput = gradOutput[blockIdx.x];
...
@@ -433,6 +453,7 @@ cunn_SoftMaxXEntropyBackward(
  int offset = threadIdx.x;
  int last = classes % (ILP * blockDim.x);
  for (; offset < classes - last; offset += blockDim.x * ILP) {
    accscalar_t tmpLogits[ILP];
...
@@ -444,22 +465,112 @@ cunn_SoftMaxXEntropyBackward(
#pragma unroll
    for (int j = 0; j < ILP; ++j)
      gradInput[offset + j * blockDim.x] = tmpGradOutput * (
          std::exp(tmpLogits[j] - coeff) -
          static_cast<accscalar_t>((offset + j * blockDim.x == label) ? 1 : 0) *
          smooth_positives - smooth_negatives);
  }

  for (; offset < classes; offset += blockDim.x)
    gradInput[offset] = tmpGradOutput * (
        std::exp(static_cast<accscalar_t>(logits[offset]) - coeff) -
        static_cast<accscalar_t>((offset == label) ? 1 : 0) *
        smooth_positives - smooth_negatives);
}

template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t>
__device__ __forceinline__ void
aligned_apply(int shift,
              scalar_t *gradInput,
              scalar_t *logits,
              outscalar_t *max_log_sum_exp,
              outscalar_t *gradOutput,
              int64_t *labels,
              const float smoothing,
              int classes)
{
  accscalar_t smooth_positives = 1.0 - smoothing;
  accscalar_t smooth_negatives = smoothing / classes;
  accscalar_t tmpGradOutput = gradOutput[blockIdx.x];
  int64_t label = labels[blockIdx.x];
  accscalar_t coeff = max_log_sum_exp[blockIdx.x];

  int offset = threadIdx.x;

  // shift and do 1
  if(shift > 0){
    logits -= shift;
    gradInput -= shift;
    classes += shift;
    if(threadIdx.x >= shift){
      gradInput[offset] = tmpGradOutput * (
          std::exp(static_cast<accscalar_t>(logits[offset]) - coeff) -
          static_cast<accscalar_t>(((offset - shift) == label) ? 1 : 0) *
          smooth_positives - smooth_negatives);
    }
    classes -= blockDim.x;
    gradInput += blockDim.x;
    logits += blockDim.x;
    shift -= blockDim.x;
  }

  int last = classes % (ILP * blockDim.x);

  typedef typename std::aligned_storage<ILP*sizeof(scalar_t), ILP*alignof(scalar_t)>::type LoadT;
  // input
  scalar_t v[ILP];
  LoadT* value = reinterpret_cast<LoadT*>(&v);
  // output
  scalar_t r[ILP];
  LoadT* result = reinterpret_cast<LoadT*>(&r);

  for (; offset * ILP < (classes - last); offset += blockDim.x) {
    *value = reinterpret_cast<LoadT*>(logits)[offset];

#pragma unroll
    for (int j = 0; j < ILP; ++j) {
      r[j] = tmpGradOutput * (
          std::exp(static_cast<accscalar_t>(v[j]) - coeff) -
          static_cast<accscalar_t>(((ILP * offset + j - shift) == label) ? 1 : 0) *
          smooth_positives - smooth_negatives);
    }
    reinterpret_cast<LoadT*>(gradInput)[offset] = *result;
  }

  offset = classes - last + threadIdx.x;
  for (; offset < classes; offset += blockDim.x)
    gradInput[offset] = tmpGradOutput * (
        std::exp(static_cast<accscalar_t>(logits[offset]) - coeff) -
        static_cast<accscalar_t>(((offset - shift) == label) ? 1 : 0) *
        smooth_positives - smooth_negatives);
}

template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t,
          template<typename, typename, typename> class Epilogue>
__global__ void
cunn_SoftMaxXEntropyBackward(
    scalar_t *gradInput,
    scalar_t *logits,
    outscalar_t *max_log_sum_exp,
    outscalar_t *gradOutput,
    int64_t *labels,
    const float smoothing,
    int classes)
{
  gradInput += blockIdx.x * classes;
  logits += blockIdx.x * classes;

  // Do vectorized load/store when input/output have same alignment
  const int shift = ((uint64_t)logits) % ALIGN_BYTES / sizeof(scalar_t);
  const int shift_ = ((uint64_t)gradInput) % ALIGN_BYTES / sizeof(scalar_t);
  if (shift == shift_){
    aligned_apply<ILP, scalar_t, accscalar_t, outscalar_t>(shift, gradInput, logits, max_log_sum_exp, gradOutput, labels, smoothing, classes);
  }
  else {
    apply<ILP, scalar_t, accscalar_t, outscalar_t>(gradInput, logits, max_log_sum_exp, gradOutput, labels, smoothing, classes);
  }
}

template<template<typename, typename, typename> class Epilogue>
std::vector<Tensor> host_softmax_xentropy(
...
@@ -495,13 +606,13 @@ std::vector<Tensor> host_softmax_xentropy(
  // XXX: it assumes that inner_size == 1
  TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");

  dim3 grid(outer_size);

  using namespace at;
  DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "host_softmax_xentropy",
    using accscalar_t = at::acc_type<scalar_t_0, true>;
    const int ILP = sizeof(float4)/sizeof(scalar_t_0);
    dim3 block = SoftMax_getBlockSize(ILP, dim_size);
    if (!half_to_float) {
      cunn_SoftMaxXEntropyForward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
        <<<grid, block, 2 * block.x * sizeof(accscalar_t), stream>>>(
...
@@ -564,12 +675,12 @@ Tensor host_softmax_xentropy_backward(
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");

  dim3 grid(outer_size);

  DISPATCH_FLOAT_AND_HALF(gI.scalar_type(), 0, "host_softmax_xentropy_backward",
    using accscalar_t = acc_type<scalar_t_0, true>;
    const int ILP = sizeof(float4)/sizeof(scalar_t_0);
    dim3 block = SoftMax_getBlockSize(ILP, dim_size);
    if (!half_to_float) {
      cunn_SoftMaxXEntropyBackward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
        <<<grid, block, block.x * sizeof(accscalar_t), stream>>>(
...
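The new shift parameter threaded through ilpReduce, aligned_apply, and the two kernels handles rows whose starting address is not 16-byte aligned: the pointer is backed up to the previous ALIGN_BYTES boundary, the first few elements are handled one at a time, and only then does the vectorized body run. The sketch below, which is not from the commit, mirrors the shift expression ((uint64_t)input) % ALIGN_BYTES / sizeof(scalar_t) used above; the helper name alignment_shift and the host-side demo are made up for illustration.

#include <cstdint>
#include <cstdio>

#define ALIGN_BYTES 16

// How many scalar_t elements a pointer sits past the previous ALIGN_BYTES boundary;
// this is the same quantity the forward and backward kernels above compute as "shift".
template<typename scalar_t>
__host__ __device__ int alignment_shift(const scalar_t* p)
{
  return (int)(((uint64_t)p) % ALIGN_BYTES / sizeof(scalar_t));
}

int main()
{
  float buf[8];
  // buf + 1 is one float (4 bytes) past buf; if buf happens to be 16-byte aligned
  // (not guaranteed, this is only a demonstration), the shift is 1 and the kernel
  // would process that single leading element before entering the vectorized body.
  printf("shift = %d\n", alignment_shift(buf + 1));
  return 0;
}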
apex/contrib/multihead_attn/self_multihead_attn_func.py

...
@@ -183,7 +183,7 @@ class SelfAttnFunc(torch.autograd.Function):
        values_grads = torch.bmm(dropout_results.transpose(1, 2), output_lin_grads, out=values_grads.transpose(0, 1))
        # Mask and Scaling for Dropout (not a publically documented op)
        dropout_grads = torch._masked_scale(matmul2_dgrad1, dropout_mask, 1.0 / (1.0 - dropout_prob_t[0]))
        # Softmax Grad (not a publically documented op)
        softmax_grads = torch._softmax_backward_data(dropout_grads, softmax_results, -1, softmax_results)
...
apex/mlp/mlp.py

...
@@ -7,17 +7,19 @@ from .. import amp
class MlpFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, bias, activation, *args):
        output = mlp_cuda.forward(bias, activation, args)
        ctx.save_for_backward(*args)
        ctx.outputs = output
        ctx.bias = bias
        ctx.activation = activation
        return output[0]

    @staticmethod
    def backward(ctx, grad_o):
        grads = mlp_cuda.backward(ctx.bias, ctx.activation, grad_o, ctx.outputs, ctx.saved_tensors)
        del ctx.outputs
        return (None, None, *grads)

mlp_function = amp.half_function(MlpFunction.apply)
...
@@ -29,16 +31,21 @@ class MLP(torch.nn.Module):
        bias (bool): Default True:
        relu (bool): Default True
    """
    def __init__(self, mlp_sizes, bias=True, activation='relu'):
        super(MLP, self).__init__()
        self.num_layers = len(mlp_sizes) - 1
        self.mlp_sizes = copy(mlp_sizes)
        self.bias = 1 if bias else 0

        if activation is 'none':
            self.activation = 0
        elif activation is 'relu':
            self.activation = 1
        elif activation is 'sigmoid':
            self.activation = 2
        else:
            raise TypeError("activation must be relu or none.")

        self.weights = []
        self.biases = []
        for i in range(self.num_layers):
...
@@ -46,10 +53,11 @@ class MLP(torch.nn.Module):
            self.weights.append(w)
            name = 'weight_{}'.format(i)
            setattr(self, name, w)
            if self.bias:
                b = torch.nn.Parameter(torch.empty(mlp_sizes[i + 1]))
                self.biases.append(b)
                name = 'bias_{}'.format(i)
                setattr(self, name, b)

        self.reset_parameters()
...
@@ -58,13 +66,14 @@ class MLP(torch.nn.Module):
            dimsum = weight.size(0) + weight.size(1)
            std = math.sqrt(2. / float(dimsum))
            nn.init.normal_(weight, 0., std)
        if self.bias:
            for bias in self.biases:
                std = math.sqrt(1. / float(bias.size(0)))
                nn.init.normal_(bias, 0., std)

    def forward(self, input):
        return mlp_function(self.bias, self.activation, input, *self.weights, *self.biases)

    def extra_repr(self):
        s = F"MLP sizes: {self.mlp_sizes}, Bias={self.bias}, activation={self.activation}"
        return s
csrc/mlp.cpp

...
@@ -19,7 +19,9 @@ int mlp_fp(
    int* output_features,
    T** BPtr,
    T* Y,
    T* reserved_space,
    int use_bias,
    int activation);

template <typename T>
int mlp_bp(
...
@@ -35,11 +37,18 @@ int mlp_bp(
    T* work_space,
    T* dX,
    T** dwPtr,
    T** dbPtr,
    bool requires_grad,
    int use_bias,
    int activation);

std::vector<at::Tensor> mlp_forward(int use_bias, int activation, std::vector<at::Tensor> inputs) {
  auto num_layers = inputs.size() - 1;
  if (use_bias) {
    // inputs contains (input, weights, biases)
    num_layers /= 2;
  }
  auto batch_size = inputs[0].size(0);
  auto input_features = inputs[0].size(1);
...
@@ -60,7 +69,9 @@ std::vector<at::Tensor> mlp_forward(std::vector<at::Tensor> inputs) {
    std::vector<scalar_t*> b_ptr;
    for (int i = 0; i < num_layers; i++) {
      w_ptr.push_back(inputs[i + 1].data_ptr<scalar_t>());
      if (use_bias) {
        b_ptr.push_back(inputs[i + 1 + num_layers].data_ptr<scalar_t>());
      }
    }
    auto result = mlp_fp<scalar_t>(
        inputs[0].data_ptr<scalar_t>(),
...
@@ -71,37 +82,48 @@ std::vector<at::Tensor> mlp_forward(std::vector<at::Tensor> inputs) {
        output_features.data(),
        b_ptr.data(),
        out.data_ptr<scalar_t>(),
        reserved_space.data_ptr<scalar_t>(),
        use_bias,
        activation);
  });

  return {out, reserved_space};
}

std::vector<at::Tensor> mlp_backward(
    int use_bias,
    int activation,
    at::Tensor grad_o,
    std::vector<at::Tensor> fprop_outputs,
    std::vector<at::Tensor> inputs) {
  auto num_layers = inputs.size() - 1;
  if (use_bias) {
    // inputs contains (input, weights, biases)
    num_layers /= 2;
  }
  auto batch_size = inputs[0].size(0);
  auto input_features = inputs[0].size(1);

  // TODO: not creating empty tensor for it?
  bool requires_grad = inputs[0].requires_grad();

  std::vector<int> output_features;
  for (int i = 0; i < num_layers; i++) {
    output_features.push_back(inputs[i + 1].size(0));
  }
  // create outputs, length of inputs
  // TODO: not create bias if not needed
  std::vector<at::Tensor> outputs;
  for (int i = 0; i < inputs.size(); i++) {
    outputs.push_back(at::empty(inputs[i].sizes(), inputs[i].type()));  // clone for testing now
  }

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(inputs[0].type(), "mlp_backward", [&] {
    std::vector<scalar_t*> w_ptr;
    std::vector<scalar_t*> b_ptr;
    for (int i = 0; i < num_layers; i++) {
      w_ptr.push_back(inputs[i + 1].data_ptr<scalar_t>());
      b_ptr.push_back(inputs[i + 1 + num_layers].data_ptr<scalar_t>());
    }
    std::vector<scalar_t*> outputs_ptr;
    for (int i = 0; i < inputs.size(); i++) {
...
@@ -127,7 +149,10 @@ std::vector<at::Tensor> mlp_backward(
        work_space.data_ptr<scalar_t>(),
        outputs_ptr[0],
        outputs_ptr.data() + 1,
        outputs_ptr.data() + 1 + num_layers,
        requires_grad,
        use_bias,
        activation);
  });
  return outputs;
...
csrc/mlp_cuda.cu

This diff is collapsed.
csrc/multi_tensor_axpby_kernel.cu

...
@@ -13,6 +13,17 @@
#define BLOCK_SIZE 512
#define ILP 4

template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
  return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}

template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}

template<typename x_t, typename y_t, typename out_t>
struct AxpbyFunctor
{
...
@@ -43,46 +54,74 @@ struct AxpbyFunctor
    n -= chunk_idx*chunk_size;

    bool finite = true;
    x_t r_x[ILP];
    y_t r_y[ILP];
    out_t r_out[ILP];

    // to make things simple, we put aligned case in a different code path
    if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x) && is_aligned(y) && is_aligned(out))
    {
      for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
      {
        // load
        load_store(r_x, x, 0, i_start);
        load_store(r_y, y, 0, i_start);
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          r_out[ii] = a*static_cast<float>(r_x[ii]) + b*static_cast<float>(r_y[ii]);
          if(arg_to_check == -1)
            finite = finite && (isfinite(r_x[ii]) && isfinite(r_y[ii]));
          if(arg_to_check == 0)
            finite = finite && isfinite(r_x[ii]);
          if(arg_to_check == 1)
            finite = finite && isfinite(r_y[ii]);
        }
        // store
        load_store(out, r_out, i_start, 0);
      }
    }
    else
    {
      // Non-divergent exit condition for __syncthreads, not necessary here
      for(int i_start = 0;
          i_start < n && i_start < chunk_size;
          i_start += blockDim.x*ILP)
      {
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          r_x[ii] = 0;
          r_y[ii] = 0;
          int i = i_start + threadIdx.x + ii*blockDim.x;
          if(i < n && i < chunk_size)
          {
            r_x[ii] = x[i];
            r_y[ii] = y[i];
          }
        }
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          r_out[ii] = a*static_cast<float>(r_x[ii]) + b*static_cast<float>(r_y[ii]);
          if(arg_to_check == -1)
            finite = finite && (isfinite(r_x[ii]) && isfinite(r_y[ii]));
          if(arg_to_check == 0)
            finite = finite && isfinite(r_x[ii]);
          if(arg_to_check == 1)
            finite = finite && isfinite(r_y[ii]);
        }
        // see note in multi_tensor_scale_kernel.cu
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          int i = i_start + threadIdx.x + ii*blockDim.x;
          if(i < n && i < chunk_size)
            out[i] = r_out[ii];
        }
      }
    }
    if(!finite)
      *noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
  }
};
...
csrc/multi_tensor_l2norm_kernel.cu

...
@@ -13,6 +13,17 @@
#define BLOCK_SIZE 512
#define ILP 4

template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
  return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}

template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}

template<typename x_t>
struct L2NormFunctor
{
...
@@ -41,22 +52,44 @@ struct L2NormFunctor
    __shared__ float s_vals[512];

    float vals[ILP]; // = {0}; // this probably works too but I want to be sure...
    x_t r_x[ILP];
    for(int i = 0; i < ILP; i++)
    {
      vals[i] = 0.f;
      r_x[i] = 0;
    }

    // to make things simple, we put aligned case in a different code path
    if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x))
    {
      for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
      {
        // load
        load_store(r_x, x, 0, i_start);
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          float next = static_cast<float>(r_x[ii]);
          vals[ii] += next*next;
        }
      }
    }
    else
    {
      for(int i_start = 0;
          i_start < n && i_start < chunk_size;
          i_start += blockDim.x*ILP)
      {
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          int i = i_start + threadIdx.x + ii*blockDim.x;
          if(i < n && i < chunk_size)
          {
            float next = static_cast<float>(x[i]);
            vals[ii] += next*next;
          }
        }
      }
    }

    float val = 0.f;
    for(int i = 0; i < ILP; i++)
...
@@ -104,22 +137,44 @@ struct MaxNormFunctor
    __shared__ float s_vals[512];

    float vals[ILP]; // = {0}; // this probably works too but I want to be sure...
    x_t r_x[ILP];
    for(int i = 0; i < ILP; i++)
    {
      vals[i] = 0.f;
      r_x[i] = 0;
    }

    // to make things simple, we put aligned case in a different code path
    if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x))
    {
      for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
      {
        // load
        load_store(r_x, x, 0, i_start);
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          float next = static_cast<float>(r_x[ii]);
          vals[ii] = fmaxf(fabsf(vals[ii]), fabsf(next));
        }
      }
    }
    else
    {
      for(int i_start = 0;
          i_start < n && i_start < chunk_size;
          i_start += blockDim.x*ILP)
      {
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          int i = i_start + threadIdx.x + ii*blockDim.x;
          if(i < n && i < chunk_size)
          {
            float next = static_cast<float>(x[i]);
            vals[ii] = fmaxf(fabsf(vals[ii]), fabsf(next));
          }
        }
      }
    }

    float val = 0.f;
    for(int i = 0; i < ILP; i++)
...
csrc/multi_tensor_lamb.cu

...
@@ -13,6 +13,17 @@
#define BLOCK_SIZE 512
#define ILP 4

template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
  return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}

template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}

typedef enum{
  MOMENT_MODE_0 = 0, // L2 regularization mode
  MOMENT_MODE_1 = 1  // Decoupled weight decay mode
...
@@ -68,71 +79,149 @@ struct LAMBStage1Functor
    n -= chunk_idx*chunk_size;

    MATH_T r_g[ILP];
    MATH_T r_p[ILP];
    MATH_T r_m[ILP];
    MATH_T r_v[ILP];
    // to make things simple, we put aligned case in a different code path
    if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(g) && is_aligned(p) && is_aligned(m) && is_aligned(v))
    {
      T l_g[ILP];
      T l_p[ILP];
      T l_m[ILP];
      T l_v[ILP];
      for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
      {
        // load
        load_store(l_g, g, 0, i_start);
        if (decay != 0)
          load_store(l_p, p, 0, i_start);
        load_store(l_m, m, 0, i_start);
        load_store(l_v, v, 0, i_start);
        // unpack
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          r_g[ii] = l_g[ii];
          if (decay == 0) {
            r_p[ii] = MATH_T(0);
          }
          else {
            r_p[ii] = l_p[ii];
          }
          r_m[ii] = l_m[ii];
          r_v[ii] = l_v[ii];
        }
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          if (mode == MOMENT_MODE_0) {
            MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
            // L2 on scaled grad
            scaled_grad = scaled_grad + decay*r_p[ii];
            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
            r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
            r_p[ii] = next_m_unbiased / denom;
          }
          else {
            MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
            r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
            r_p[ii] = (next_m_unbiased/denom) + (decay*r_p[ii]);
          }
        }
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          l_p[ii] = r_p[ii];
          l_m[ii] = r_m[ii];
          l_v[ii] = r_v[ii];
        }
        // store
        load_store(g, l_p, i_start, 0);
        load_store(m, l_m, i_start, 0);
        load_store(v, l_v, i_start, 0);
      }
    }
    else
    {
      // see note in multi_tensor_scale_kernel.cu
      for(int i_start = 0;
          i_start < n && i_start < chunk_size;
          i_start += blockDim.x*ILP)
      {
        MATH_T r_g[ILP];
        MATH_T r_p[ILP];
        MATH_T r_m[ILP];
        MATH_T r_v[ILP];
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          int i = i_start + threadIdx.x + ii*blockDim.x;
          if(i < n && i < chunk_size)
          {
            r_g[ii] = g[i];
            // special ?optimization? for lamb stage 1
            if (decay == 0) {
              r_p[ii] = MATH_T(0);
            }
            else {
              r_p[ii] = p[i];
            }
            r_m[ii] = m[i];
            r_v[ii] = v[i];
          }
          else {
            r_g[ii] = MATH_T(0);
            r_p[ii] = MATH_T(0);
            r_m[ii] = MATH_T(0);
            r_v[ii] = MATH_T(0);
          }
        }
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          if (mode == MOMENT_MODE_0) {
            MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
            // L2 on scaled grad
            scaled_grad = scaled_grad + decay*r_p[ii];
            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
            r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
            r_p[ii] = next_m_unbiased / denom;
          }
          else {
            MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
            r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
            r_p[ii] = (next_m_unbiased/denom) + (decay*r_p[ii]);
          }
        }
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          int i = i_start + threadIdx.x + ii*blockDim.x;
          if(i < n && i < chunk_size)
          {
            g[i] = r_p[ii];
            m[i] = r_m[ii];
            v[i] = r_v[ii];
          }
        }
      }
    }
...
@@ -181,34 +270,58 @@ struct LAMBStage2Functor
    n -= chunk_idx*chunk_size;

    // to make things simple, we put aligned case in a different code path
    if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(p) && is_aligned(update))
    {
      T r_p[ILP];
      T r_update[ILP];
      for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
      {
        // load
        load_store(r_p, p, 0, i_start);
        load_store(r_update, update, 0, i_start);
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          r_p[ii] = static_cast<MATH_T>(r_p[ii]) - (ratio * static_cast<MATH_T>(r_update[ii]));
        }
        load_store(p, r_p, i_start, 0);
      }
    }
    else
    {
      for(int i_start = 0;
          i_start < n && i_start < chunk_size;
          i_start += blockDim.x*ILP)
      {
        MATH_T r_p[ILP];
        MATH_T r_update[ILP];
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          int i = i_start + threadIdx.x + ii*blockDim.x;
          if(i < n && i < chunk_size)
          {
            r_p[ii] = p[i];
            r_update[ii] = update[i];
          }
        }
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          r_p[ii] = r_p[ii] - (ratio * r_update[ii]);
        }
#pragma unroll
        for(int ii = 0; ii < ILP; ii++)
        {
          int i = i_start + threadIdx.x + ii*blockDim.x;
          if(i < n && i < chunk_size)
          {
            p[i] = r_p[ii];
          }
        }
      }
    }
...
csrc/multi_tensor_scale_kernel.cu

...
@@ -15,6 +15,17 @@
#define BLOCK_SIZE 512
#define ILP 4

template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
  return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}

template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}

template<typename in_t, typename out_t>
struct ScaleFunctor
{
...
@@ -34,44 +45,68 @@ struct ScaleFunctor
      in_t* in = (in_t*)tl.addresses[0][tensor_loc];
      in += chunk_idx*chunk_size;

      out_t* out = (out_t*)tl.addresses[1][tensor_loc];
      out += chunk_idx*chunk_size;

      n -= chunk_idx*chunk_size;

      bool finite = true;
      in_t r_in[ILP];
      out_t r_out[ILP];

      // to make things simple, we put aligned case in a different code path
      if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(in) && is_aligned(out))
      {
        for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
        {
          // load
          load_store(r_in, in, 0, i_start);
#pragma unroll
          for(int ii = 0; ii < ILP; ii++)
          {
            r_out[ii] = static_cast<float>(r_in[ii]) * scale;
            finite = finite && isfinite(r_in[ii]);
          }
          // store
          load_store(out, r_out, i_start, 0);
        }
      }
      else
      {
        // Non-divergent exit condition for __syncthreads, not necessary here
        for(int i_start = 0;
            i_start < n && i_start < chunk_size;
            i_start += blockDim.x*ILP)
        {
#pragma unroll
          for(int ii = 0; ii < ILP; ii++)
          {
            r_in[ii] = 0;
            int i = i_start + threadIdx.x + ii*blockDim.x;
            if(i < n && i < chunk_size)
              r_in[ii] = in[i];
          }
          // note for clarification to future michael:
          // From a pure memory dependency perspective, there's likely no point unrolling
          // the write loop, since writes just fire off once their LDGs arrive.
          // Put another way, the STGs are dependent on the LDGs, but not on each other.
          // There is still compute ILP benefit from unrolling the loop though.
#pragma unroll
          for(int ii = 0; ii < ILP; ii++)
          {
            r_out[ii] = static_cast<float>(r_in[ii]) * scale;
            finite = finite && isfinite(r_in[ii]);
          }
#pragma unroll
          for(int ii = 0; ii < ILP; ii++)
          {
            int i = i_start + threadIdx.x + ii*blockDim.x;
            if(i < n && i < chunk_size)
              out[i] = r_out[ii];
          }
        }
      }
      if(!finite)
        *noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
  }
};
...
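The "Blindly fire off a write" comment kept in ScaleFunctor (and in AxpbyFunctor above) describes a deliberately benign race: every thread that encounters a non-finite value writes 1 to the shared noop flag, and because all racing writers store the same value the outcome does not depend on ordering. The standalone sketch below is not taken from the commit; the kernel name check_finite and its signature are assumptions used only to show the pattern.

#include <math.h>

// Sketch of the noop-flag pattern: scan an array and raise a global flag if any
// element is Inf/NaN. Concurrent writes of the same value 1 are harmless here.
__global__ void check_finite(const float* data, int n, int* noop_flag)
{
  bool finite = true;
  for(int i = threadIdx.x + blockIdx.x*blockDim.x; i < n; i += blockDim.x*gridDim.x)
    finite = finite && isfinite(data[i]);

  if(!finite)
    *noop_flag = 1; // Blindly fire off a write; all racers store the same value.
}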
tests/L0/run_mlp/test_mlp.py

...
@@ -51,6 +51,116 @@ class TestMLP(unittest.TestCase):
            ref_mlp[0].bias.grad.detach().cpu().numpy(),
            atol=1e-7, rtol=1e-5)

    def test_no_bias(self):
        for use_activation in ['none', 'relu', 'sigmoid']:
            mlp = MLP(mlp_sizes, bias=False, activation=use_activation).cuda()

            mlp_layers = []
            for i in range(mlp.num_layers):
                linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1], bias=False)
                mlp.weights[i].data.copy_(linear.weight)
                mlp_layers.append(linear)
                if use_activation == 'relu':
                    mlp_layers.append(nn.ReLU(inplace=True))
                if use_activation == 'sigmoid':
                    mlp_layers.append(nn.Sigmoid())

            ref_mlp = nn.Sequential(*mlp_layers).cuda()

            test_input = torch.empty(
                batch_size, mlp_sizes[0], device="cuda").uniform_(-1., 1.).requires_grad_()
            ref_input = test_input.clone().detach().requires_grad_()
            mlp_out = mlp(test_input)
            ref_out = ref_mlp(ref_input)
            np.testing.assert_allclose(
                mlp_out.detach().cpu().numpy(),
                ref_out.detach().cpu().numpy(),
                atol=1e-7, rtol=1e-5)

            # Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
            mlp_out.mean().mul(10.).backward()
            ref_out.mean().mul(10.).backward()
            np.testing.assert_allclose(
                test_input.grad.detach().cpu().numpy(),
                ref_input.grad.detach().cpu().numpy(),
                atol=0, rtol=100)
            np.testing.assert_allclose(
                mlp.weights[0].grad.detach().cpu().numpy(),
                ref_mlp[0].weight.grad.detach().cpu().numpy(),
                atol=1e-7, rtol=100)

    def test_with_bias(self):
        for use_activation in ['none', 'relu', 'sigmoid']:
            mlp = MLP(mlp_sizes, bias=True, activation=use_activation).cuda()

            mlp_layers = []
            for i in range(mlp.num_layers):
                linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1], bias=True)
                mlp.weights[i].data.copy_(linear.weight)
                mlp.biases[i].data.copy_(linear.bias)
                mlp_layers.append(linear)
                if use_activation == 'relu':
                    mlp_layers.append(nn.ReLU(inplace=True))
                if use_activation == 'sigmoid':
                    mlp_layers.append(nn.Sigmoid())

            ref_mlp = nn.Sequential(*mlp_layers).cuda()

            test_input = torch.empty(
                batch_size, mlp_sizes[0], device="cuda").uniform_(-1., 1.).requires_grad_()
            ref_input = test_input.clone().detach().requires_grad_()
            mlp_out = mlp(test_input)
            ref_out = ref_mlp(ref_input)
            np.testing.assert_allclose(
                mlp_out.detach().cpu().numpy(),
                ref_out.detach().cpu().numpy(),
                atol=1e-7, rtol=1e-5)

            # Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
            mlp_out.mean().mul(10.).backward()
            ref_out.mean().mul(10.).backward()
            np.testing.assert_allclose(
                test_input.grad.detach().cpu().numpy(),
                ref_input.grad.detach().cpu().numpy(),
                atol=0, rtol=1)
            np.testing.assert_allclose(
                mlp.weights[0].grad.detach().cpu().numpy(),
                ref_mlp[0].weight.grad.detach().cpu().numpy(),
                atol=1e-7, rtol=1)
            np.testing.assert_allclose(
                mlp.biases[0].grad.detach().cpu().numpy(),
                ref_mlp[0].bias.grad.detach().cpu().numpy(),
                atol=1e-7, rtol=1e-5)

    def test_no_grad(self):
        mlp = MLP(mlp_sizes).cuda()

        mlp_layers = []
        for i in range(mlp.num_layers):
            linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1])
            mlp.weights[i].data.copy_(linear.weight)
            mlp.biases[i].data.copy_(linear.bias)
            mlp_layers.append(linear)
            mlp_layers.append(nn.ReLU(inplace=True))

        ref_mlp = nn.Sequential(*mlp_layers).cuda()

        test_input = torch.empty(
            batch_size, mlp_sizes[0], device="cuda").uniform_(-1., 1.)
        ref_input = test_input.clone().detach()
        mlp_out = mlp(test_input)
        ref_out = ref_mlp(ref_input)
        np.testing.assert_allclose(
            mlp_out.detach().cpu().numpy(),
            ref_out.detach().cpu().numpy(),
            atol=1e-7, rtol=1e-5)

        # Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
        mlp_out.mean().mul(10.).backward()
        ref_out.mean().mul(10.).backward()
        np.testing.assert_allclose(
            mlp.weights[0].grad.detach().cpu().numpy(),
            ref_mlp[0].weight.grad.detach().cpu().numpy(),
            atol=1e-7, rtol=1e-5)

    def test_performance_half(self):
        mlp = MLP(mlp_sizes).cuda().half()
...