Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Megatron-LM
Commits
d6485684
Commit
d6485684
authored
Dec 26, 2019
by
Mohammad Shoeybi
Browse files
fixed gpt-2 dataloader
parent
b8e0129f
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
4 additions
and
1 deletion
+4
-1
pretrain_gpt2.py
pretrain_gpt2.py
+4
-1
No files found.
pretrain_gpt2.py
View file @
d6485684
...
@@ -115,7 +115,7 @@ def get_train_val_test_data(args):
...
@@ -115,7 +115,7 @@ def get_train_val_test_data(args):
if
args
.
data_loader
==
'numpy'
:
if
args
.
data_loader
==
'numpy'
:
(
train_data
,
val_data
,
test_data
),
num_tokens
,
\
(
train_data
,
val_data
,
test_data
),
num_tokens
,
\
eod_token
=
make_gpt2_dataloaders
(
args
)
eod_token
=
make_gpt2_dataloaders
(
args
)
elif
args
.
data_loader
==
'raw'
or
args
.
data_loader
==
'
tfrecords
'
elif
args
.
data_loader
==
'raw'
or
args
.
data_loader
==
'
lazy
'
data_config
=
configure_data
()
data_config
=
configure_data
()
data_config
.
set_defaults
(
data_set_type
=
'GPT2'
,
transpose
=
False
)
data_config
.
set_defaults
(
data_set_type
=
'GPT2'
,
transpose
=
False
)
(
train_data
,
val_data
,
test_data
),
tokenizer
=
data_config
.
apply
(
(
train_data
,
val_data
,
test_data
),
tokenizer
=
data_config
.
apply
(
...
@@ -123,6 +123,9 @@ def get_train_val_test_data(args):
...
@@ -123,6 +123,9 @@ def get_train_val_test_data(args):
num_tokens
=
tokenizer
.
num_tokens
num_tokens
=
tokenizer
.
num_tokens
eod_token
=
tokenizer
.
get_command
(
'eos'
).
Id
eod_token
=
tokenizer
.
get_command
(
'eos'
).
Id
assert
eod_token
==
tokenizer
.
get_command
(
'pad'
).
Id
assert
eod_token
==
tokenizer
.
get_command
(
'pad'
).
Id
else
:
print
(
"Unsupported data loader for GPT2."
)
exit
(
1
)
# pad.
# pad.
num_tokens
=
vocab_size_with_padding
(
num_tokens
,
args
)
num_tokens
=
vocab_size_with_padding
(
num_tokens
,
args
)
print_rank_0
(
'> found end-of-document token: {}'
.
format
(
eod_token
))
print_rank_0
(
'> found end-of-document token: {}'
.
format
(
eod_token
))
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment