OpenDAS / Megatron-LM · Commits

Commit ba2264ab
Authored Apr 02, 2020 by Raul Puri

    verified zeroshot tasks works

Parent: 159fcfeb
Showing 4 changed files with 17 additions and 34 deletions:

  tasks/run_gpt2_eval.py              +9  -27
  tasks/zeroshot_gpt2/datasets.py     +3  -3
  tasks/zeroshot_gpt2/detokenizer.py  +2  -2
  tasks/zeroshot_gpt2/evaluate.py     +3  -2
scripts/run_gpt2_eval.py → tasks/run_gpt2_eval.py

@@ -30,19 +30,8 @@ parser.add_argument('--cloze-eval', action='store_true',
                     help='Run lambada cloze eval instead of perplexity eval.')
 parser.add_argument('--easy-lambada', action='store_true',
                     help='use easier formulation of lambada')
-parser.add_argument('--webtext-eval', action='store_true',
-                    help='Run webtext PPL eval instead of wikitext PPL eval.')
-parser.add_argument('--eval-iters', default=5000, type=int,
-                    help='number of iterations to run webtext evaluation')
 parser.add_argument('--model-parallel-size', type=int, default=1,
                     help='model parallel size to use')
-parser.add_argument('--load-openai', action='store_true',
-                    help='Load weights from saved openai/hf checkpoints')
-parser.add_argument('--cache-dir', type=str, default='cache',
-                    help='directory to cache gpt2 tokenizers')
-parser.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
-                    help='Pad the vocab size to be divisible by this value.'
-                    'This is added for computational efficieny reasons.')
 args = parser.parse_args()
 
 multinode_args = ''
@@ -54,43 +43,36 @@ CMD = ' --model-parallel-size {model_par} \
        --hidden-size {hidden} \
        --log-interval 100 \
        --load {model} \
-       --eval-batch-size {batch} \
+       --batch-size {batch} \
        --num-attention-heads {natt} \
        --seq-length 1024 \
        --max-position-embeddings 1024 \
        --tokenizer-type GPT2BPETokenizer \
-       --text-key text \
        --distributed-backend nccl \
        --hidden-dropout 0.1 \
        --attention-dropout 0.1 \
        --fp16 \
+       --lr 1 --no-load-optim --no-load-rng --epochs 0 \
        --overlapping-eval 32 \
-       --make-vocab-size-divisible-by {make_vocab_size_divisible_by} \
-       --cache-dir {cache} '.format(model_par=args.model_parallel_size,
+       --merge-file /home/universal-lm-data.cosmos549/repos/megatron_latest/vocab_cache/merges.txt \
+       --vocab-file /home/universal-lm-data.cosmos549/repos/megatron_latest/vocab_cache/vocab.json '.format(model_par=args.model_parallel_size,
                                     nlayers=args.num_layers,
                                     hidden=args.hidden_size,
                                     model=args.model_path,
                                     batch=args.batch_size,
-                                    natt=args.num_attention_heads,
-                                    make_vocab_size_divisible_by=args.make_vocab_size_divisible_by,
-                                    cache=args.cache_dir)
+                                    natt=args.num_attention_heads,)
 
-if args.load_openai:
-    CMD += ' --load-openai '
 if args.cloze_eval:
     CMD += ' --valid-data {} '.format(args.data_path)
-    CMD += ' --cloze-eval '
+    CMD += ' --task LAMBADA '
     if not args.easy_lambada:
         CMD += ' --strict-lambada '
-    CMD = 'evaluate_gpt2.py' + CMD
+    CMD = 'main.py' + CMD
     print('Running Lambada Eval Command:', flush=True)
-elif args.webtext_eval:
-    CMD += '--train-iters 0 --eval-iters {} --test-data {} --loose-json '.format(args.eval_iters, args.data_path)
-    CMD = 'pretrain_gpt2.py' + CMD
-    print('Running Webtext Eval Command:', flush=True)
 else:
     CMD += ' --valid-data {} '.format(args.data_path)
-    CMD = 'evaluate_gpt2.py' + CMD
+    CMD += ' --task WIKITEXT103 '
+    CMD = 'main.py' + CMD
     print('Running PPL Eval Command:', flush=True)
 
 CMD = 'python3 ' + multinode_args + CMD
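With the webtext and OpenAI-checkpoint paths removed, the wrapper now builds a single Megatron command string and routes both evaluations through main.py via the --task flag (LAMBADA for the cloze eval, WIKITEXT103 for perplexity). Below is a minimal sketch of that dispatch pattern; every path, model size, and the final launch comment are hypothetical placeholders, not values taken from this commit.

# Minimal sketch of the dispatch pattern above, not the script itself;
# all paths and sizes are hypothetical placeholders.
task = 'LAMBADA'   # or 'WIKITEXT103'

cmd = (' --model-parallel-size {model_par} '
       '--num-layers {nlayers} '
       '--hidden-size {hidden} '
       '--num-attention-heads {natt} '
       '--batch-size {batch} '
       '--load {model} ').format(model_par=1, nlayers=24, hidden=1024,
                                 natt=16, batch=8,
                                 model='/path/to/checkpoint')
cmd += ' --valid-data /path/to/eval_data '
cmd += ' --task {} '.format(task)
cmd = 'python3 main.py' + cmd

print('Running Eval Command:', flush=True)
print(cmd)
# The real wrapper formats the template from its own argparse options, prepends
# multinode_args for distributed launches, and then hands the string to the shell.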
tasks/zeroshot_gpt2/datasets.py

@@ -132,7 +132,7 @@ def _build_lambada_dataset():
     tokenizer = get_tokenizer()
 
     assert len(args.valid_data) == 1
-    val_dataset = _LambadaDataset(args.valid_data, tokenizer.eod, tokenizer,
+    val_dataset = _LambadaDataset(args.valid_data[0], tokenizer.eod, tokenizer,
                                   args.seq_length, args.strict_lambada)
     print_rank_0(' > found {} samples.'.format(len(val_dataset)))
@@ -145,10 +145,10 @@ def _build_wikitext103_dataset():
     tokenizer = get_tokenizer()
 
     assert len(args.valid_data) == 1
-    with open(args.valid_data, "rb") as reader:
+    with open(args.valid_data[0], "rb") as reader:
         entire_data = reader.read().decode('utf-8')
     num_original_tokens = len(entire_data.strip().split(" "))
-    entire_data = get_detokenizer(args.valid_data)(entire_data)
+    entire_data = get_detokenizer(args.valid_data[0])(entire_data)
     tokenized_data = tokenizer.tokenize(entire_data)
     num_tokenized_tokens = len(tokenized_data)
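All three changes in this file index args.valid_data[0]: the --valid-data option arrives as a list of paths, so the single-file LAMBADA and WikiText-103 builders must unpack its only element (the assert above each change guarantees there is exactly one). A small self-contained sketch of that behaviour, using a hypothetical nargs-style option rather than Megatron's actual argument definition:

# Illustrative only: shows why a single --valid-data path arrives as a list.
import argparse

parser = argparse.ArgumentParser()
# Hypothetical definition; Megatron's own arguments module defines the real flag.
parser.add_argument('--valid-data', nargs='*', default=None)
args = parser.parse_args(['--valid-data', '/path/to/lambada_test.jsonl'])

assert len(args.valid_data) == 1
path = args.valid_data[0]   # unpack the single path, as the commit now does
print(path)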
tasks/zeroshot_gpt2/detokenizer.py

@@ -19,7 +19,7 @@ import re
 def ptb_detokenizer(string):
     string = string.replace(" '", "'")
     string = string.replace(" \n", "\n")
     string = string.replace("\n ", "\n")
     string = string.replace(" n't", "n't")
@@ -75,7 +75,7 @@ _DETOKENIZERS = {
 def get_detokenizer(path):
-    for key in DETOKENIZERS.keys():
+    for key in _DETOKENIZERS.keys():
         if key in path:
             print(key)
             return _DETOKENIZERS[key]
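The functional fix here is the registry lookup: the old loop referenced an undefined DETOKENIZERS name, while the module-level table is called _DETOKENIZERS. A self-contained sketch of that registry pattern follows; the ptb rules mirror the lines above, but the wikitext entry and the fallback are illustrative assumptions, not the module's full table.

# Sketch of the detokenizer-registry pattern; the dataset keys, the wikitext rule,
# and the fallback below are assumptions for illustration.
import re

def ptb_detokenizer(string):
    string = string.replace(" '", "'")
    string = string.replace(" \n", "\n")
    string = string.replace("\n ", "\n")
    return string.replace(" n't", "n't")

def wikitext_detokenizer(string):
    # hypothetical stand-in for the real wikitext rules
    return re.sub(r" @-@ ", "-", string)

_DETOKENIZERS = {
    'ptb': ptb_detokenizer,
    'wikitext': wikitext_detokenizer,
}

def get_detokenizer(path):
    # Match a registry key against the dataset path, as the fixed loop does.
    for key in _DETOKENIZERS.keys():
        if key in path:
            return _DETOKENIZERS[key]
    return lambda s: s  # assumed fallback; the original module may not define one

print(get_detokenizer('/data/wikitext-103/wiki.test.tokens')("the dog @-@ house"))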
tasks/zeroshot_gpt2/evaluate.py

@@ -29,7 +29,7 @@ from megatron.training import get_model
 from megatron.utils import get_ltor_masks_and_position_ids
 from tasks.finetune_utils import build_data_loader
-from .dataset import build_dataset
+from .datasets import build_dataset
 
 
 def get_model_provider(eval_metric):
@@ -71,7 +71,8 @@ def process_batch(batch):
         tokenizer.eod,
         args.reset_position_ids,
         args.reset_attention_mask,
-        args.eod_mask_loss)
+        args.eod_mask_loss,
+        args.fp16)
 
     return tokens, labels, attention_mask, position_ids, loss_mask
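Two fixes land in this file: the import now points at the actual module name (datasets, plural), and process_batch passes args.fp16 through to get_ltor_masks_and_position_ids, presumably so the returned attention mask is built in half precision when --fp16 is set. The helper below is a rough, hypothetical illustration of what such a function produces (causal mask, loss mask, position ids); it is not Megatron's implementation.

# Rough sketch of a "left-to-right masks and position ids" helper;
# an illustration only, not megatron.utils.get_ltor_masks_and_position_ids.
import torch

def ltor_masks_and_position_ids(tokens, eod_token, eod_mask_loss=False, fp16=False):
    batch, seq = tokens.size()
    # Lower-triangular causal mask: position i may attend to positions <= i.
    attention_mask = torch.tril(torch.ones(1, seq, seq)).expand(batch, -1, -1)
    if fp16:
        attention_mask = attention_mask.half()
    position_ids = torch.arange(seq, dtype=torch.long).unsqueeze(0).expand(batch, -1)
    loss_mask = torch.ones(batch, seq)
    if eod_mask_loss:
        loss_mask[tokens == eod_token] = 0.0  # do not score end-of-document tokens
    return attention_mask, loss_mask, position_ids

tokens = torch.randint(0, 50257, (2, 8))
mask, loss_mask, pos = ltor_masks_and_position_ids(tokens, eod_token=50256, fp16=False)
print(mask.shape, loss_mask.shape, pos.shape)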