OpenDAS / deepspeed · Commits

Unverified commit a148bd33, authored Sep 21, 2020 by RezaYazdaniAminabadi, committed by GitHub on Sep 21, 2020

Add configurable intermediate size to transformer kernels (#423)
parent a825f996

Showing 5 changed files with 46 additions and 27 deletions (+46 −27)
csrc/includes/context.h                     +1  −1
csrc/transformer/ds_transformer_cuda.cpp    +13 −6
deepspeed/ops/transformer/transformer.py    +19 −12
tests/unit/test_cuda_backward.py            +6  −4
tests/unit/test_cuda_forward.py             +7  −4
csrc/includes/context.h
@@ -69,7 +69,7 @@ public:
         if (!_workspace) {
             assert(_workspace == nullptr);
             cudaMalloc(&_workspace, size);
-        } else if (_workSpaceSize != size) {
+        } else if (_workSpaceSize < size) {
             cudaFree(_workspace);
             cudaMalloc(&_workspace, size);
         }
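The one-character change above switches the shared workspace to a grow-only policy: the buffer is freed and reallocated only when the requested size exceeds the current allocation, so requests of different sizes (now possible, since the intermediate size is configurable) can reuse one allocation. A minimal Python sketch of that policy, purely illustrative and not part of this commit (bytearray stands in for cudaMalloc):

    class Workspace:
        """Grow-only buffer: reallocate only when the request exceeds capacity."""

        def __init__(self):
            self._buf = None
            self._capacity = 0

        def request(self, size):
            if self._buf is None or self._capacity < size:
                self._buf = bytearray(size)  # stands in for cudaFree + cudaMalloc
                self._capacity = size
            return self._buf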
csrc/transformer/ds_transformer_cuda.cpp
@@ -20,13 +20,14 @@ template <typename T>
 size_t get_workspace_size(int maxBatchSize,
                           int seq_len,
                           int hidden_size,
+                          int intermediate_size,
                           int heads,
                           bool training,
                           bool gelu_checkpoint)
 {
     size_t workSpacesize = 4 * (size_t(maxBatchSize) * seq_len * hidden_size);
     if (training) {
-        workSpacesize += (std::max((4 * size_t(maxBatchSize) * seq_len * hidden_size),
+        workSpacesize += (std::max((size_t(maxBatchSize) * seq_len * intermediate_size),
                                    2 * (size_t(maxBatchSize) * heads * seq_len * seq_len)));
         if (gelu_checkpoint) workSpacesize += 2 * (size_t(maxBatchSize) * seq_len * hidden_size);
     }
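For reference, the same sizing logic transcribed into Python (a sketch; the helper name is illustrative, and the values are raw element counts whose byte size depends on the element type T). The training term now scales with the configured intermediate size instead of the hard-coded 4 * hidden_size:

    def workspace_elements(max_batch, seq_len, hidden_size, intermediate_size,
                           heads, training, gelu_checkpoint):
        # Mirrors get_workspace_size above; returns an element count, not bytes.
        size = 4 * max_batch * seq_len * hidden_size
        if training:
            size += max(max_batch * seq_len * intermediate_size,
                        2 * max_batch * heads * seq_len * seq_len)
            if gelu_checkpoint:
                size += 2 * max_batch * seq_len * hidden_size
        return size

    # e.g. batch 8, seq 512, hidden 1024, intermediate 4096, 16 heads, training:
    # 4*8*512*1024 + max(8*512*4096, 2*8*16*512*512)
    # = 16,777,216 + 67,108,864 = 83,886,080 elements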
@@ -92,12 +93,12 @@ BertTransformerLayer<T>::BertTransformerLayer(int layer_id,
                                 false,
                                 !normalize_invertible)),
       _ff1(typename FeedForward<T>::Config(batch_size * seq_length,
-                                           4 * hidden_size,
+                                           _intermediate_size,
                                            hidden_size,
                                            gemm_algos[1])),
       _ff2(typename FeedForward<T>::Config(batch_size * seq_length,
                                            hidden_size,
-                                           4 * hidden_size,
+                                           _intermediate_size,
                                            gemm_algos[2])),
       _softmax(typename Softmax<T>::Config(batch_size, num_heads, seq_length)),
       _gelu(typename Gelu<T>::Config(_batch_size, _seq_length, _intermediate_size)),

@@ -143,8 +144,13 @@ BertTransformerLayer<T>::~BertTransformerLayer()
 template <typename T>
 void BertTransformerLayer<T>::Initialize()
 {
-    Context::Instance().GenWorkSpace(get_workspace_size<T>(
-        _batch_size, _seq_length, _hidden_size, _heads, _training, _gelu_checkpoint));
+    Context::Instance().GenWorkSpace(get_workspace_size<T>(_batch_size,
+                                                           _seq_length,
+                                                           _hidden_size,
+                                                           _intermediate_size,
+                                                           _heads,
+                                                           _training,
+                                                           _gelu_checkpoint));
 
     if (std::is_same<T, __half>::value) cublasSetMathMode(_cublasHandle, CUBLAS_TENSOR_OP_MATH);
 }

@@ -343,7 +349,8 @@ void BertTransformerLayer<T>::Backward(int bsz,
     T* buf_2 = buf_1 + small_buf_size;
     T* buf_3 = buf_2 + small_buf_size;
-    T* ff2_buf = buf_3 + (_gelu_checkpoint ? 3 : 1) * small_buf_size;
+    T* ff2_buf = (_gelu_checkpoint ? buf_2 + (bsz * _seq_length * _intermediate_size)
+                                   : buf_3 + small_buf_size);
     T* ctx_bufB_ptr_recomp = ff2_buf + (_seq_length * _seq_length * bsz * _heads);
 
     cudaStream_t streams[2] = {_stream, _stream};
deepspeed/ops/transformer/transformer.py (mode 100644 → 100755)
@@ -18,6 +18,7 @@ class TransformerConfig():
                  batch_size,
                  max_seq_length,
                  hidden_size,
+                 intermediate_size,
                  heads,
                  attn_dropout_ratio,
                  hidden_dropout_ratio,

@@ -26,6 +27,7 @@ class TransformerConfig():
         self.layer_id = -1
         self.batch_size = batch_size
         self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
         self.max_seq_length = max_seq_length
         self.heads = heads
         self.attn_dropout_ratio = attn_dropout_ratio

@@ -44,6 +46,8 @@ class DeepSpeedTransformerConfig(TransformerConfig):
         hidden_size: The hidden size of the transformer layer
 
+        intermediate_size: The intermediate size of the feed-forward part of transformer layer
+
         heads: The number of heads in the self-attention of the transformer layer
 
         attn_dropout_ratio: The ratio of dropout for the attention's output

@@ -88,6 +92,7 @@ class DeepSpeedTransformerConfig(TransformerConfig):
                  batch_size=-1,
                  max_seq_length=-1,
                  hidden_size=-1,
+                 intermediate_size=-1,
                  heads=-1,
                  attn_dropout_ratio=-1,
                  hidden_dropout_ratio=-1,

@@ -103,14 +108,16 @@ class DeepSpeedTransformerConfig(TransformerConfig):
                  attn_dropout_checkpoint=False,
                  stochastic_mode=False):
         super(DeepSpeedTransformerConfig,
-              self).__init__(batch_size,
-                             max_seq_length,
-                             hidden_size,
-                             heads,
-                             attn_dropout_ratio,
-                             hidden_dropout_ratio,
-                             num_hidden_layers,
-                             initializer_range)
+              self).__init__(
+                  batch_size,
+                  max_seq_length,
+                  hidden_size,
+                  (intermediate_size if intermediate_size > 0 else 4 * hidden_size),
+                  heads,
+                  attn_dropout_ratio,
+                  hidden_dropout_ratio,
+                  num_hidden_layers,
+                  initializer_range)
         self.fp16 = fp16
         self.pre_layer_norm = pre_layer_norm
         self.local_rank = local_rank
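As the constructor change above shows, intermediate_size defaults to -1 and falls back to 4 * hidden_size when not given, so existing configs keep the previous FFN width. A minimal usage sketch (the package-level import path is assumed from how the unit tests use these classes; values are illustrative):

    from deepspeed import DeepSpeedTransformerConfig

    # Explicit intermediate size: the FFN weights become
    # (intermediate_size x hidden_size) and (hidden_size x intermediate_size).
    cfg = DeepSpeedTransformerConfig(batch_size=8,
                                     max_seq_length=128,
                                     hidden_size=1024,
                                     intermediate_size=3072,
                                     heads=16,
                                     attn_dropout_ratio=0.1,
                                     hidden_dropout_ratio=0.1,
                                     num_hidden_layers=24,
                                     initializer_range=0.02)
    assert cfg.intermediate_size == 3072

    # Omitting it keeps the old behaviour of 4 * hidden_size.
    cfg_default = DeepSpeedTransformerConfig(hidden_size=1024, heads=16)
    assert cfg_default.intermediate_size == 4 * 1024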
@@ -432,12 +439,12 @@ class DeepSpeedTransformerLayer(nn.Module):
         self.attn_nw = nn.Parameter(torch.Tensor(self.config.hidden_size))
         self.attn_nb = nn.Parameter(torch.Tensor(self.config.hidden_size))
         self.inter_w = nn.Parameter(
-            torch.Tensor(4 * self.config.hidden_size,
+            torch.Tensor(self.config.intermediate_size,
                          self.config.hidden_size))
-        self.inter_b = nn.Parameter(torch.Tensor(4 * self.config.hidden_size))
+        self.inter_b = nn.Parameter(torch.Tensor(self.config.intermediate_size))
         self.output_w = nn.Parameter(
             torch.Tensor(self.config.hidden_size,
-                         4 * self.config.hidden_size))
+                         self.config.intermediate_size))
         self.output_b = nn.Parameter(torch.Tensor(self.config.hidden_size))
         self.norm_w = nn.Parameter(torch.Tensor(self.config.hidden_size))
         self.norm_b = nn.Parameter(torch.Tensor(self.config.hidden_size))

@@ -485,7 +492,7 @@ class DeepSpeedTransformerLayer(nn.Module):
             self.config.batch_size,
             self.config.hidden_size,
             self.config.heads,
-            4 * self.config.hidden_size,
+            self.config.intermediate_size,
             self.config.max_seq_length,
             self.config.attn_dropout_ratio,
             self.config.hidden_dropout_ratio,
tests/unit/test_cuda_backward.py
@@ -146,7 +146,7 @@ def create_models(ds_config):
         hidden_size=ds_config.hidden_size,
         num_hidden_layers=ds_config.num_hidden_layers,
         num_attention_heads=ds_config.heads,
-        intermediate_size=4 * ds_config.hidden_size,
+        intermediate_size=ds_config.intermediate_size,
         hidden_act="gelu",
         hidden_dropout_prob=ds_config.hidden_dropout_ratio,
         attention_probs_dropout_prob=ds_config.attn_dropout_ratio,

@@ -166,12 +166,12 @@ def create_models(ds_config):
     weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size)))
     weights[4].data.fill_(1.0)
     weights.append(
-        nn.Parameter(torch.Tensor(4 * ds_config.hidden_size,
-                                  ds_config.hidden_size)))
+        nn.Parameter(torch.Tensor(ds_config.intermediate_size,
+                                  ds_config.hidden_size)))
     weights[5].data.normal_(mean=0.0, std=ds_config.initializer_range)
     weights.append(
-        nn.Parameter(torch.Tensor(ds_config.hidden_size,
-                                  4 * ds_config.hidden_size)))
+        nn.Parameter(torch.Tensor(ds_config.hidden_size,
+                                  ds_config.intermediate_size)))
     weights[6].data.normal_(mean=0.0, std=ds_config.initializer_range)
     weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size)))
     weights[7].data.fill_(1.0)

@@ -181,7 +181,7 @@ def create_models(ds_config):
     for i in range(4):
         biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size)))
         biases[i + 1].data.zero_()
-    biases.append(nn.Parameter(torch.Tensor(4 * ds_config.hidden_size)))
+    biases.append(nn.Parameter(torch.Tensor(ds_config.intermediate_size)))
     biases[5].data.zero_()
     biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size)))
     biases[6].data.zero_()

@@ -278,6 +278,7 @@ def test_backward(batch_size,
     ds_config.layer_id = None
     ds_config.batch_size = batch_size
     ds_config.hidden_size = hidden_size
+    ds_config.intermediate_size = hidden_size
     ds_config.max_seq_length = seq_len
     ds_config.heads = heads
     ds_config.attn_dropout_ratio = 0.0

@@ -314,6 +315,7 @@ def test_backward(batch_size,
     # ds_config.layer_id = None
     # ds_config.batch_size = batch_size
     # ds_config.hidden_size = hidden_size
+    # ds_config.intermediate_size = 4 * hidden_size
     # ds_config.max_seq_length = seq_len
     # ds_config.heads = heads
     # ds_config.attn_dropout_ratio = 0.0
tests/unit/test_cuda_forward.py
@@ -113,7 +113,7 @@ def create_models(ds_config):
         num_hidden_layers=ds_config.num_hidden_layers,
         num_attention_heads=ds_config.heads,
         batch_size=ds_config.batch_size,
-        intermediate_size=4 * ds_config.hidden_size,
+        intermediate_size=ds_config.intermediate_size,
         hidden_act="gelu",
         hidden_dropout_prob=ds_config.hidden_dropout_ratio,
         attention_probs_dropout_prob=ds_config.attn_dropout_ratio,

@@ -134,12 +134,12 @@ def create_models(ds_config):
     weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size)))
     weights[4].data.fill_(1.0)
     weights.append(
-        nn.Parameter(torch.Tensor(4 * ds_config.hidden_size,
-                                  ds_config.hidden_size)))
+        nn.Parameter(torch.Tensor(ds_config.intermediate_size,
+                                  ds_config.hidden_size)))
     weights[5].data.normal_(mean=0.0, std=ds_config.initializer_range)
     weights.append(
-        nn.Parameter(torch.Tensor(ds_config.hidden_size,
-                                  4 * ds_config.hidden_size)))
+        nn.Parameter(torch.Tensor(ds_config.hidden_size,
+                                  ds_config.intermediate_size)))
     weights[6].data.normal_(mean=0.0, std=ds_config.initializer_range)
     weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size)))
     weights[7].data.fill_(1.0)

@@ -149,7 +149,7 @@ def create_models(ds_config):
     for i in range(4):
         biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size)))
         biases[i + 1].data.zero_()
-    biases.append(nn.Parameter(torch.Tensor(4 * ds_config.hidden_size)))
+    biases.append(nn.Parameter(torch.Tensor(ds_config.intermediate_size)))
     biases[5].data.zero_()
     biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size)))
     biases[6].data.zero_()

@@ -246,6 +246,7 @@ def test_forward(batch_size,
     ds_config.layer_id = None
     ds_config.batch_size = batch_size
     ds_config.hidden_size = hidden_size
+    ds_config.intermediate_size = 4 * hidden_size
     ds_config.max_seq_length = seq_len
     ds_config.heads = heads
     ds_config.attn_dropout_ratio = 0.0

@@ -282,6 +283,7 @@ def test_forward_with_small_bsz(batch_size,
     ds_config.layer_id = None
     ds_config.batch_size = batch_size
     ds_config.hidden_size = hidden_size
+    ds_config.intermediate_size = 4 * hidden_size
     ds_config.max_seq_length = seq_len
     ds_config.heads = heads
     ds_config.attn_dropout_ratio = 0.0

@@ -316,6 +318,7 @@ def test_forward_stochastic(batch_size,
     ds_config.layer_id = None
     ds_config.batch_size = batch_size
     ds_config.hidden_size = hidden_size
+    ds_config.intermediate_size = 4 * hidden_size
     ds_config.max_seq_length = seq_len
     ds_config.heads = heads
     ds_config.attn_dropout_ratio = 0.0