OpenDAS / FastMoE · Commits
"src/vscode:/vscode.git/clone" did not exist on "f8ed456e79c726a490b5223d9ecf4bcbc1811648"
Commit bf4388c0, authored Jan 09, 2021 by Rick Ho

Merge branch 'master' into laekov/batching

Parents: 284f1424, ef83c893
Showing 8 changed files with 168 additions and 49 deletions (+168 -49)
pytorch/cuda/cuda_stream_manager.cpp   +10   -3
pytorch/cuda/cuda_stream_manager.h     +35  -28
pytorch/cuda/moe.py                    +30   -4
pytorch/cuda/moe_cuda_kernel.cu        +13   -4
pytorch/cuda/moe_test.py                +5   -4
pytorch/cuda/setup.py                   +1   -1
pytorch/mem_transformer.py             +72   -5
pytorch/run_enwik8_base.sh              +2   -0
pytorch/cuda/cuda_stream_manager.cpp  (view file @ bf4388c0)

+/* TODO: make it ke xue
 #include <cuda_runtime.h>
 #include <cassert>
+#include <thread>
 #include "cuda_stream_manager.h"

-CudaStreamManager* smgr = NULL;
+thread_local CudaStreamManager smgr;

-CudaStreamManager* getCudaStreamManager(const size_t num_expert) {
+CudaStreamManager* getCudaStreamManager(const size_t num_expert, const int device) {
     if (!smgr) {
-        smgr = new CudaStreamManager(num_expert);
+        smgr = new CudaStreamManager(num_expert
+                , device
+                );
     }
+<<<<<<< HEAD
     return smgr;
 }
@@ -20,3 +25,5 @@ void CudaStreamManager::sync(int i) {
         cudaStreamSynchronize(streams[i]);
     }
 }
+}
+*/
pytorch/cuda/cuda_stream_manager.h  (view file @ bf4388c0)

@@ -5,42 +5,49 @@
 #include <cublas_v2.h>
 #include <helper_cuda.h>
+#include <cstdio>

+#define MAX_STREAMS 16

-class CudaStreamManager {
-public:
-    CudaStreamManager(const size_t num_expert_) : num_expert(num_expert_) {
-        streams = new cudaStream_t[num_expert];
-        checkCudaErrors(cublasCreate(&handle));
-        for (size_t i = 0; i < num_expert; ++i) {
-            checkCudaErrors(cudaStreamCreate(streams + i));
-        }
-    }
-    ~CudaStreamManager() {
-        for (size_t i = 0; i < num_expert; ++i) {
-            checkCudaErrors(cudaStreamDestroy(*(streams + i)));
-        }
-        checkCudaErrors(cublasDestroy(handle));
-        delete [] streams;
-    }
-    const size_t num_expert;
-    cublasHandle_t handle;
-    cudaStream_t* streams;
-    void sync(int=-1);
-};
-
-CudaStreamManager* getCudaStreamManager(const size_t num_expert);
+struct CudaStreamManager {
+    CudaStreamManager() : num_expert(0), device(0), streams(NULL) {
+        int current_device;
+        checkCudaErrors(cudaGetDevice(&current_device));
+#ifdef MOE_DEBUG
+        printf("constructor at device %d\n", current_device);
+#endif
+    }
+    void setup(const size_t num_expert, const int device) {
+#ifdef MOE_DEBUG
+        printf("setup at device %d\n", device);
+#endif
+        this->num_expert = num_expert;
+        this->device = device;
+        checkCudaErrors(cudaSetDevice(device));
+        streams = new cudaStream_t[MAX_STREAMS];
+        handles = new cublasHandle_t[MAX_STREAMS];
+        for (size_t i = 0; i < MAX_STREAMS; ++i) {
+            checkCudaErrors(cublasCreate(handles + i));
+            checkCudaErrors(cudaStreamCreate(streams + i));
+            checkCudaErrors(cublasSetStream(handles[i], streams[i]));
+        }
+    }
+    ~CudaStreamManager() {
+#ifdef MOE_DEBUG
+        printf("destructor at device %d\n", device);
+#endif
+        for (size_t i = 0; i < MAX_STREAMS; ++i) {
+            checkCudaErrors(cudaStreamDestroy(streams[i]));
+            checkCudaErrors(cublasDestroy(handles[i]));
+        }
+        delete [] streams;
+    }
+    size_t num_expert;
+    int device;
+    cublasHandle_t* handles;
+    cudaStream_t* streams;
+    inline cudaStream_t& getStream(int idx) {
+        return streams[idx % MAX_STREAMS];
+    }
+    inline cublasHandle_t& getHandle(int idx) {
+        return handles[idx % MAX_STREAMS];
+    }
+    void sync(int=-1);
+};
+
+// CudaStreamManager* getCudaStreamManager(const size_t num_expert, const int device);
 #endif  // CUDA_STREAM_MANAGER
pytorch/cuda/moe.py  (view file @ bf4388c0)

@@ -5,8 +5,6 @@ import torch
 import moe_cuda
 
-torch.manual_seed(42)
-torch.cuda.manual_seed(42)
 
 class MOEFunction(Function):
     @staticmethod
@@ -47,7 +45,7 @@ class MOELayer(nn.Module):
             self.weight.data[i] = linear.weight.data
 
     def forward(self, inp, gate):
-        return MOEFunction.apply(inp, gate, self.weight)
+        return MOEFunction.apply(inp, gate.int(), self.weight)
 
 class MOELayer_raw(nn.Module):
@@ -89,6 +87,8 @@ def test_module(moe, linear, inp, gate):
 def test():
+    torch.manual_seed(42)
+    torch.cuda.manual_seed(42)
     batch_size = 4
     num_expert = 2
     in_feat = 6
@@ -112,5 +112,31 @@ def test():
         err = (mo - ro).abs().sum()
         print('{} abs err {}'.format(name, err))
 
+
+def test_dp():
+    torch.manual_seed(42)
+    torch.cuda.manual_seed(42)
+    batch_size = 6
+    num_expert = 4
+    in_feat = 2
+    out_feat = 3
+
+    inp = torch.rand(batch_size, in_feat).cuda()
+    gate = torch.randint(low=0, high=num_expert, size=(batch_size,),
+            requires_grad=False).int().cuda()
+
+    print("data parallel of a nn.Linear model")
+    linear = nn.Linear(in_feat, in_feat).cuda()
+    linear_dp = torch.nn.DataParallel(linear, device_ids=[0, 1, 2])
+    output = linear_dp(inp)
+    print("successful!")
+
+    print("data parallel of our MoE model")
+    moe = MOELayer(num_expert, in_feat, out_feat).cuda()
+    moe_dp = torch.nn.DataParallel(moe, device_ids=[0, 1, 2])
+    for i in range(5):
+        output = moe_dp(inp, gate)
+
+
 if __name__ == '__main__':
-    test()
+    # test()
+    test_dp()
\ No newline at end of file
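For reference, the operation that MOEFunction.apply(inp, gate.int(), self.weight) hands to the moe_cuda extension can be written in a few lines of plain PyTorch: every token is multiplied by the weight matrix of the expert selected by its gate index. The sketch below is illustrative only and assumes the weight layout num_expert x out_feat x in_feat suggested by moe_cuda_backward in the kernel file; it is not the MOELayer_raw code itself.

    import torch

    def moe_reference(inp, gate, weight):
        # inp:    (batch_size, in_feat)
        # gate:   (batch_size,) integer expert index per token
        # weight: (num_expert, out_feat, in_feat)
        out = inp.new_zeros(inp.size(0), weight.size(1))
        for i in range(inp.size(0)):
            # apply the selected expert's matrix to this token
            out[i] = weight[int(gate[i])] @ inp[i]
        return out

    # the same thing, batched:
    # out = torch.bmm(weight[gate.long()], inp.unsqueeze(-1)).squeeze(-1)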
pytorch/cuda/moe_cuda_kernel.cu  (view file @ bf4388c0)

@@ -9,6 +9,7 @@
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <helper_cuda.h>
+#include <c10/cuda/CUDAGuard.h>
 #include "timer.hh"
@@ -20,6 +21,8 @@
 // #define MOE_BREAKDOWN
 // #define MOE_DEBUG
 
+thread_local CudaStreamManager smgr;
+
 template <typename scalar_t>
 __global__
 void generate_ptr_offset_kernel(size_t n, const scalar_t* base, size_t stride,
@@ -103,7 +106,6 @@ void moe_cuda_forward_impl(
     expert_ptr[0] = 0;
     for (int i = 1; i < num_expert; ++i) {
         expert_ptr[i] = expert_ptr[i - 1] + expert_count[i - 1];
     }
     int *pos = new int[batch_size];
     int *d_pos;
@@ -197,8 +199,6 @@ void moe_cuda_grad_weight(
     const size_t out_feat,
     const size_t num_expert) {
-    auto h = getCudaStreamManager(num_expert);
     int *gate_host = new int[batch_size];
     scalar_t alpha = 1, beta = 1;
     checkCudaErrors(cudaMemcpy(gate_host, gate, batch_size * sizeof(int), cudaMemcpyDeviceToHost));
@@ -220,7 +220,7 @@ void moe_cuda_grad_weight(
             out_feat));
     }
     for (size_t i = 0; i < num_expert; ++i) {
-        checkCudaErrors(cudaStreamSynchronize(*(h->streams + i)));
+        checkCudaErrors(cudaStreamSynchronize(*(smgr.streams + i)));
     }
     delete [] gate_host;
 }
@@ -238,6 +238,10 @@ std::vector<torch::Tensor> moe_cuda_forward(
 #ifdef MOE_DEBUG
     printf("[forward] b=%ld, expert=%ld, in_feat (d_model)=%ld, out_feat (d_ffn)=%ld\n", batch_size, num_expert, in_feat, out_feat);
 #endif
+    const int device = device_of(input).value().index();
+    if (smgr.streams == NULL) {
+        smgr.setup(num_expert, device);
+    }
     auto output = input.new_zeros({batch_size, out_feat});
     AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_forward_cuda", ([&] {
@@ -266,9 +270,14 @@ std::vector<torch::Tensor> moe_cuda_backward(
     const auto num_expert = weight.size(0);
     const auto out_feat = weight.size(1);
     const auto in_feat = weight.size(2);
 #ifdef MOE_DEBUG
     printf("[backward] b=%ld, expert=%ld, in_feat (d_model)=%ld, out_feat (d_ffn)=%ld\n", batch_size, num_expert, in_feat, out_feat);
 #endif
+    const int device = device_of(input).value().index();
+    if (smgr.streams == NULL) {
+        smgr.setup(num_expert, device);
+    }
     auto grad_input = grad_output.new_zeros({batch_size, in_feat});  // batch_size x in_feat
     auto grad_weight = grad_output.new_zeros({num_expert, out_feat, in_feat});  // num_expert x out_feat x in_feat
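The expert_ptr bookkeeping in moe_cuda_forward_impl above reads like a counting sort of tokens by expert: expert_count holds how many tokens route to each expert, the prefix sum expert_ptr gives each expert's offset into the reordered batch, and the pos array presumably records where each token lands so that every expert sees a contiguous slice. A hedged Python sketch of that indexing logic (names follow the kernel; the real code works on raw host and device arrays):

    def batch_by_expert(gate, num_expert):
        # gate: per-token expert indices, e.g. [1, 0, 1, 2]
        expert_count = [0] * num_expert
        for g in gate:
            expert_count[g] += 1

        # expert_ptr[i] = expert_ptr[i - 1] + expert_count[i - 1], as in the hunk above
        expert_ptr = [0] * num_expert
        for i in range(1, num_expert):
            expert_ptr[i] = expert_ptr[i - 1] + expert_count[i - 1]

        # scatter index: pos[k] is the slot of token k once tokens are grouped by expert
        cursor = list(expert_ptr)
        pos = [0] * len(gate)
        for k, g in enumerate(gate):
            pos[k] = cursor[g]
            cursor[g] += 1
        return expert_count, expert_ptr, pos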
pytorch/cuda/moe_test.py  (view file @ bf4388c0)

@@ -10,10 +10,11 @@ def perf():
     hidden_feat = int(sys.argv[3])
     num_expert = int(sys.argv[4])
 
-    inp = torch.rand(batch_size, io_feat).cuda()
-    gate = torch.randint(low=0, high=num_expert, size=(batch_size,), requires_grad=False).int().cuda()
-    moe = MOELayer(num_expert, io_feat, hidden_feat, io_feat).cuda()
+    inp = torch.rand(batch_size, in_feat).cuda("cuda:1")
+    gate = torch.randint(low=0, high=num_expert, size=(batch_size,),
+            requires_grad=False).int().cuda("cuda:1")
+    moe = MOELayer(num_expert, in_feat, out_feat).cuda("cuda:1")
 
     o = moe(inp, gate)
     o = moe(inp, gate)
@@ -27,7 +28,7 @@ def perf():
     maxt = 0.
     sqtot = 0.
     for i in range(n_runs):
-        gate = torch.randint(low=0, high=num_expert, size=(batch_size,), requires_grad=False).int().cuda()
+        gate = torch.randint(low=0, high=num_expert, size=(batch_size,), requires_grad=False).int().cuda("cuda:1")
         ts = time.time()
        o = moe(inp, gate)
         te = time.time()
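One caveat about the timing loop in perf(): as far as the hunk above shows, moe(inp, gate) is timed with time.time() alone, but CUDA kernels launch asynchronously, so the host clock may stop before the kernels actually finish. A sketch of a tighter measurement using torch.cuda.synchronize (a standard PyTorch call; the moe call and tensors are the ones from the test above):

    import time
    import torch

    def timed_runs(moe, inp, gate, n_runs=32):
        times = []
        for _ in range(n_runs):
            torch.cuda.synchronize(inp.device)   # drain pending work before starting the clock
            ts = time.time()
            o = moe(inp, gate)
            torch.cuda.synchronize(inp.device)   # wait for the MoE kernels to finish
            times.append(time.time() - ts)
        return sum(times) / len(times), max(times)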
pytorch/cuda/setup.py  (view file @ bf4388c0)

@@ -11,7 +11,7 @@ setup(
             name='moe_cuda',
             sources=[
                 'moe.cpp',
-                'cuda_stream_manager.cpp',
+                # 'cuda_stream_manager.cpp',
                 'moe_cuda_kernel.cu',
                 ],
             extra_compile_args={'cxx': ['-I{}'.format(CUDA_HELPER)],
pytorch/mem_transformer.py  (view file @ bf4388c0)

@@ -9,6 +9,8 @@ import torch.nn as nn
 import torch.nn.functional as F
 # import torch_sparse
 
+from cuda.moe import MOELayer
+
 sys.path.append('utils')
 from proj_adaptive_softmax import ProjectedAdaptiveLogSoftmax
 from log_uniform_sampler import LogUniformSampler, sample_logits
@@ -31,9 +33,74 @@ class PositionalEmbedding(nn.Module):
         else:
             return pos_emb[:,None,:]
 
-class MoEPositionwiseFF(nn.Module):
+class CustomizedMoEPositionwiseFF(nn.Module):
+    def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, top_k=2, num_expert=32):
+        super(CustomizedMoEPositionwiseFF, self).__init__()
+        print("CustomizedMoEPositionwiseFF num_expert=%d top_k=%d" % (num_expert, top_k))
+        self.top_k = top_k
+        assert num_expert >= top_k
+        self.d_model = d_model
+        self.d_inner = d_inner
+        self.dropout = dropout
+
+        self.gate = nn.Linear(d_model, num_expert)
+        self.moe1 = MOELayer(num_expert=num_expert, in_feat=d_model + 1, out_feat=d_inner)
+        self.moe2 = MOELayer(num_expert=num_expert, in_feat=d_inner + 1, out_feat=d_model)
+
+        self.layer_norm = nn.LayerNorm(d_model)
+        self.pre_lnorm = pre_lnorm
+        self.dropout = nn.Dropout(dropout)
+        self.reset_parameter()
+
+    def reset_parameter(self):
+        pass
+
+    def forward(self, inp):
+        residual = inp
+        if self.pre_lnorm:
+            inp = self.layer_norm(inp)
+
+        gate = self.gate(inp)
+        gate_top_k_val, gate_top_k_idx = torch.topk(gate, k=self.top_k, dim=-1,
+                largest=True, sorted=False)  # [.. x top_k]
+        gate_top_k_val = gate_top_k_val.view(-1, self.top_k)
+        gate_score = F.softmax(gate_top_k_val, dim=-1).unsqueeze(1)  # (BxL) x 1 x top_k
+        gate_top_k_idx = gate_top_k_idx.view(-1, self.top_k)
+
+        core_out = []
+        inp = inp.view(-1, self.d_model)
+        inp = F.pad(inp, pad=(0, 1), mode='constant', value=1.0)
+        for i in range(self.top_k):
+            gate_idx = gate_top_k_idx[:, i].contiguous()
+            x = self.moe1(inp, gate_idx)
+            x = self.dropout(F.relu(x))
+            x = F.pad(x, pad=(0, 1), mode='constant', value=1.0)
+            x = self.moe2(x, gate_idx)
+            x = self.dropout(x)  # (BxL) x d_model
+            core_out.append(x.unsqueeze(1))  # (BxL) x 1 x d_model
+
+        core_out = torch.cat(core_out, dim=1)  # (BxL) x top_k x d_model
+        core_out = torch.bmm(gate_score, core_out)  # (BxL) x 1 x d_model
+        core_out = core_out.view(residual.size(0), residual.size(1), self.d_model)
+
+        output = core_out + residual
+        if not self.pre_lnorm:
+            output = self.layer_norm(output)
+        return output
+
+
+class MoEPositionwiseFFRaw(nn.Module):
     def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, top_k=64):
-        super(MoEPositionwiseFF, self).__init__()
+        super(MoEPositionwiseFFRaw, self).__init__()
         print("MoEPositionwiseFF")
         self.top_k = top_k
@@ -820,7 +887,7 @@ class DecoderLayer(nn.Module):
         self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs)
         # self.dec_attn = ExtendedMultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs)
-        self.pos_ff = MultiHeadHierarchicalMoEPositionwiseFF(d_model, d_inner, dropout,
+        self.pos_ff = CustomizedMoEPositionwiseFF(d_model, d_inner, dropout,
             pre_lnorm=kwargs.get('pre_lnorm'))
 
     def forward(self, dec_inp, dec_attn_mask=None, mems=None):
@@ -840,7 +907,7 @@ class RelLearnableDecoderLayer(nn.Module):
         self.dec_attn = RelLearnableMultiHeadAttn(n_head, d_model, d_head, dropout,
                                                   **kwargs)
-        self.pos_ff = MultiHeadHierarchicalMoEPositionwiseFF(d_model, d_inner, dropout,
+        self.pos_ff = CustomizedMoEPositionwiseFF(d_model, d_inner, dropout,
             pre_lnorm=kwargs.get('pre_lnorm'))
 
     def forward(self, dec_inp, r_emb, r_w_bias, r_bias, dec_attn_mask=None, mems=None):
@@ -861,7 +928,7 @@ class RelPartialLearnableDecoderLayer(nn.Module):
         self.dec_attn = RelPartialLearnableMultiHeadAttn(n_head, d_model,
                             d_head, dropout, **kwargs)
-        self.pos_ff = MultiHeadHierarchicalMoEPositionwiseFF(d_model, d_inner, dropout,
+        self.pos_ff = CustomizedMoEPositionwiseFF(d_model, d_inner, dropout,
             pre_lnorm=kwargs.get('pre_lnorm'))
 
     def forward(self, dec_inp, r, r_w_bias, r_r_bias, dec_attn_mask=None, mems=None):
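Two details of CustomizedMoEPositionwiseFF are easy to miss in the diff. First, the MOELayer instances are built with in_feat = d_model + 1 and d_inner + 1 while the inputs are padded with a constant 1.0 column (F.pad(..., value=1.0)), which looks like the usual trick of folding a bias row into each expert's weight matrix. Second, the top-k expert outputs are mixed with a softmax over the selected gate logits via a batched matmul. The sketch below isolates that mixing step, with ordinary per-token callables standing in for the CUDA MOELayer; the helper name and the experts list are hypothetical, for illustration only.

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    def topk_mixture(x, gate_logits, experts, top_k=2):
        # x: (N, d_model); gate_logits: (N, num_expert); experts[e] maps (d_model,) -> (d_model,)
        val, idx = torch.topk(gate_logits, k=top_k, dim=-1, largest=True, sorted=False)
        score = F.softmax(val, dim=-1).unsqueeze(1)            # (N, 1, top_k)

        outs = []
        for i in range(top_k):
            sel = idx[:, i].tolist()                           # expert id chosen for each token
            y = torch.stack([experts[e](x[n]) for n, e in enumerate(sel)])
            outs.append(y.unsqueeze(1))                        # (N, 1, d_model)

        out = torch.cat(outs, dim=1)                           # (N, top_k, d_model)
        return torch.bmm(score, out).squeeze(1)                # weighted sum over the k experts

    # e.g. experts = [nn.Linear(d_model, d_model) for _ in range(num_expert)]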
pytorch/run_enwik8_base.sh  (view file @ bf4388c0)

 #!/bin/bash
+
+export LD_LIBRARY_PATH=/home/jiezhong/miniconda3/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
 
 if [[ $1 == 'train' ]]; then
     echo 'Run training...'
     python train.py \