Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
dcuai
dlexamples
Commits
31258341
Commit
31258341
authored
Jan 14, 2023
by
unknown
Browse files
修改首页readme
parents
b682a8e2
5c10a883
Changes
226
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1801 additions
and
0 deletions
+1801
-0
Deepspeed/BingBertGlue/turing/logger.py
Deepspeed/BingBertGlue/turing/logger.py
+21
-0
Deepspeed/BingBertGlue/turing/loss.py
Deepspeed/BingBertGlue/turing/loss.py
+60
-0
Deepspeed/BingBertGlue/turing/models.py
Deepspeed/BingBertGlue/turing/models.py
+163
-0
Deepspeed/BingBertGlue/turing/sources.py
Deepspeed/BingBertGlue/turing/sources.py
+509
-0
Deepspeed/BingBertGlue/turing/text.py
Deepspeed/BingBertGlue/turing/text.py
+11
-0
Deepspeed/BingBertGlue/turing/utils.py
Deepspeed/BingBertGlue/turing/utils.py
+169
-0
Deepspeed/BingBertSquad/1-bit_adam/mpi_ethernet/deepspeed_onebitadam_bsz96_config.json
..._adam/mpi_ethernet/deepspeed_onebitadam_bsz96_config.json
+20
-0
Deepspeed/BingBertSquad/1-bit_adam/mpi_ethernet/run_squad_deepspeed_onebitadam.sh
...1-bit_adam/mpi_ethernet/run_squad_deepspeed_onebitadam.sh
+60
-0
Deepspeed/BingBertSquad/1-bit_adam/mpi_ethernet/run_squad_mpi_onebitadam.sh
...Squad/1-bit_adam/mpi_ethernet/run_squad_mpi_onebitadam.sh
+60
-0
Deepspeed/BingBertSquad/1-bit_adam/mpi_infiniband/deepspeed_onebitadam_bsz96_config.json
...dam/mpi_infiniband/deepspeed_onebitadam_bsz96_config.json
+20
-0
Deepspeed/BingBertSquad/1-bit_adam/mpi_infiniband/run_squad_deepspeed_onebitadam.sh
...bit_adam/mpi_infiniband/run_squad_deepspeed_onebitadam.sh
+59
-0
Deepspeed/BingBertSquad/1-bit_adam/mpi_infiniband/run_squad_mpi_onebitadam.sh
...uad/1-bit_adam/mpi_infiniband/run_squad_mpi_onebitadam.sh
+59
-0
Deepspeed/BingBertSquad/1-bit_adam/nccl/deepspeed_onebitadam_bsz96_config.json
...ad/1-bit_adam/nccl/deepspeed_onebitadam_bsz96_config.json
+20
-0
Deepspeed/BingBertSquad/1-bit_adam/nccl/run_squad_deepspeed_onebitadam.sh
...rtSquad/1-bit_adam/nccl/run_squad_deepspeed_onebitadam.sh
+56
-0
Deepspeed/BingBertSquad/NOTICE.txt
Deepspeed/BingBertSquad/NOTICE.txt
+36
-0
Deepspeed/BingBertSquad/ckpt/bert-large-uncased-whole-word-masking-config.json
...ad/ckpt/bert-large-uncased-whole-word-masking-config.json
+19
-0
Deepspeed/BingBertSquad/convert_bert_ckpt_to_deepspeed.py
Deepspeed/BingBertSquad/convert_bert_ckpt_to_deepspeed.py
+340
-0
Deepspeed/BingBertSquad/deepspeed_bsz24_config.json
Deepspeed/BingBertSquad/deepspeed_bsz24_config.json
+18
-0
Deepspeed/BingBertSquad/evaluate-v1.1.py
Deepspeed/BingBertSquad/evaluate-v1.1.py
+16
-0
Deepspeed/BingBertSquad/evaluate.py
Deepspeed/BingBertSquad/evaluate.py
+85
-0
No files found.
Too many changes to show.
To preserve performance only
226 of 226+
files are displayed.
Plain diff
Email patch
Deepspeed/BingBertGlue/turing/logger.py
0 → 100644
View file @
31258341
import
logging
import
torch.distributed
as
dist
# Configure root logging once at import time; every Logger instance below
# shares this module-level logger.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class Logger():
    """Thin wrapper over the module logger that, in distributed (cuda) mode,
    only emits ``info`` messages from rank 0 to avoid duplicated output."""

    def __init__(self, cuda=False):
        self.logger = logging.getLogger(__name__)
        self.cuda = cuda

    def info(self, message, *args, **kwargs):
        # In cuda/distributed mode, log only on rank 0; otherwise always log.
        # Short-circuit keeps dist.get_rank() from being called when cuda=False.
        if not self.cuda or dist.get_rank() == 0:
            self.logger.info(message, *args, **kwargs)

    def error(self, message, *args, **kwargs):
        # Errors are logged from every rank.
        self.logger.error(message, *args, **kwargs)
Deepspeed/BingBertGlue/turing/loss.py
0 → 100644
View file @
31258341
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
torch.autograd
import
Variable
class FocalLoss(nn.Module):
    r"""
        This criterion is a implemenation of Focal Loss, which is proposed in
        Focal Loss for Dense Object Detection.

            Loss(x, class) = - \alpha (1-softmax(x)[class])^gamma \log(softmax(x)[class])

        The losses are averaged across observations for each minibatch.

        Args:
            alpha(1D Tensor, Variable) : the scalar factor for this criterion
            gamma(float, double) : gamma > 0; reduces the relative loss for well-classified examples (p > .5),
                                   putting more focus on hard, misclassified examples
            size_average(bool): By default, the losses are averaged over observations for each minibatch.
                                However, if the field size_average is set to False, the losses are
                                instead summed for each minibatch.
    """

    def __init__(self, class_num, alpha=None, gamma=2, size_average=True):
        super(FocalLoss, self).__init__()
        if alpha is None:
            # Uniform class weighting when no alpha is supplied.
            self.alpha = torch.ones(class_num, 1)
        else:
            if isinstance(alpha, Variable):
                self.alpha = alpha
            else:
                self.alpha = Variable(alpha)
        self.gamma = gamma
        self.class_num = class_num
        self.size_average = size_average

    def forward(self, inputs, targets):
        """Compute the focal loss.

        Args:
            inputs: (N, C) unnormalized class scores (logits).
            targets: (N,) integer class indices.

        Returns:
            Scalar loss tensor (mean or sum over the batch).
        """
        N = inputs.size(0)
        C = inputs.size(1)
        # FIX: pass dim=1 explicitly; the implicit-dim form is deprecated and
        # warns on modern PyTorch.  For 2-D inputs dim=1 is what the implicit
        # resolution picked, so behavior is unchanged.
        P = F.softmax(inputs, dim=1)

        # One-hot mask selecting each sample's target class.
        class_mask = inputs.data.new(N, C).fill_(0)
        ids = targets.view(-1, 1)
        class_mask.scatter_(1, ids.data, 1.)

        if inputs.is_cuda and not self.alpha.is_cuda:
            self.alpha = self.alpha.cuda()
        alpha = self.alpha[ids.data.view(-1)]

        # Probability assigned to the true class, shape (N, 1).
        probs = (P * class_mask).sum(1).view(-1, 1)
        log_p = probs.log()

        # -alpha * (1 - p)^gamma * log(p)
        batch_loss = -alpha * (torch.pow((1 - probs), self.gamma)) * log_p

        if self.size_average:
            loss = batch_loss.mean()
        else:
            loss = batch_loss.sum()
        return loss
Deepspeed/BingBertGlue/turing/models.py
0 → 100644
View file @
31258341
import
torch
import
torch.nn
as
nn
from
torch.nn
import
CrossEntropyLoss
,
MSELoss
from
turing.utils
import
TorchTuple
from
pytorch_pretrained_bert.modeling
import
BertModel
from
pytorch_pretrained_bert.modeling
import
BertPreTrainingHeads
,
PreTrainedBertModel
,
BertPreTrainingHeads
from
pytorch_pretrained_bert.file_utils
import
PYTORCH_PRETRAINED_BERT_CACHE
class BertPretrainingLoss(PreTrainedBertModel):
    """BERT pre-training heads (masked-LM + next-sentence) over a shared encoder.

    Returns the summed MLM + NSP loss when both label tensors are supplied,
    otherwise the raw prediction scores.
    """

    def __init__(self, bert_encoder, config):
        super(BertPretrainingLoss, self).__init__(config)
        self.bert = bert_encoder
        # The prediction head shares the encoder's input embedding matrix.
        self.cls = BertPreTrainingHeads(
            config, self.bert.embeddings.word_embeddings.weight)
        self.cls.apply(self.init_bert_weights)

    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                masked_lm_labels=None,
                next_sentence_label=None):
        sequence_output, pooled_output = self.bert(
            input_ids, token_type_ids, attention_mask,
            output_all_encoded_layers=False)
        prediction_scores, seq_relationship_score = self.cls(
            sequence_output, pooled_output)

        # Inference path: no labels -> return raw scores.
        if masked_lm_labels is None or next_sentence_label is None:
            return prediction_scores, seq_relationship_score

        # ignore_index=-1 skips positions that carry no MLM label.
        loss_fct = CrossEntropyLoss(ignore_index=-1)
        next_sentence_loss = loss_fct(
            seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
        masked_lm_loss = loss_fct(
            prediction_scores.view(-1, self.config.vocab_size),
            masked_lm_labels.view(-1))
        return masked_lm_loss + next_sentence_loss
class BertClassificationLoss(PreTrainedBertModel):
    """Linear classification head over the pooled BERT output.

    Training uses BCEWithLogitsLoss, so ``labels`` are float targets
    (binary / multi-label style), not class indices.
    """

    def __init__(self, bert_encoder, config, num_labels: int = 1):
        super(BertClassificationLoss, self).__init__(config)
        self.bert = bert_encoder
        self.num_labels = num_labels
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.classifier.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None,
                labels=None):
        _, pooled_output = self.bert(
            input_ids, token_type_ids, attention_mask,
            output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        scores = self.classifier(pooled_output)

        # Inference path: return the raw logits.
        if labels is None:
            return scores

        loss_fct = nn.BCEWithLogitsLoss()
        return loss_fct(scores.view(-1, self.num_labels), labels.view(-1, 1))
class BertRegressionLoss(PreTrainedBertModel):
    """Single-output regression head over the pooled BERT output (MSE loss)."""

    def __init__(self, bert_encoder, config):
        super(BertRegressionLoss, self).__init__(config)
        self.bert = bert_encoder
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)
        self.classifier.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None,
                labels=None):
        _, pooled_output = self.bert(
            input_ids, token_type_ids, attention_mask,
            output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        # Inference path: return the raw regression outputs.
        if labels is None:
            return logits

        loss_fct = MSELoss()
        return loss_fct(logits.view(-1, 1), labels.view(-1, 1))
class BertMultiTask:
    """Owns the BERT network used for pre-training or fine-tuning and exposes
    convenience wrappers (save/load, device movement, train/eval mode)."""

    def __init__(self, args):
        self.config = args.config

        if not args.use_pretrain:
            # Train from scratch with the NVIDIA pre-LN implementation.
            if args.progressive_layer_drop:
                print("BertConfigPreLnLayerDrop")
                from nvidia.modelingpreln_layerdrop import BertForPreTrainingPreLN, BertConfig
            else:
                from nvidia.modelingpreln import BertForPreTrainingPreLN, BertConfig

            bert_config = BertConfig(**self.config["bert_model_config"])
            bert_config.vocab_size = len(args.tokenizer.vocab)
            # Padding for divisibility by 8
            if bert_config.vocab_size % 8 != 0:
                bert_config.vocab_size += 8 - (bert_config.vocab_size % 8)
            print("VOCAB SIZE:", bert_config.vocab_size)
            self.network = BertForPreTrainingPreLN(bert_config, args)
        # Use pretrained bert weights
        else:
            self.bert_encoder = BertModel.from_pretrained(
                self.config['bert_model_file'],
                cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
                'distributed_{}'.format(args.local_rank))
            bert_config = self.bert_encoder.config

        self.device = None

    def set_device(self, device):
        self.device = device

    def save(self, filename: str):
        # Unwrap the DataParallel/DeepSpeed ``module`` before saving.
        network = self.network.module
        return torch.save(network.state_dict(), filename)

    def load(self, model_state_dict: str):
        # map_location keeps the load on CPU regardless of the saving device.
        return self.network.module.load_state_dict(
            torch.load(model_state_dict,
                       map_location=lambda storage, loc: storage))

    def move_batch(self, batch: TorchTuple, non_blocking=False):
        return batch.to(self.device, non_blocking)

    def eval(self):
        self.network.eval()

    def train(self):
        self.network.train()

    def save_bert(self, filename: str):
        return torch.save(self.bert_encoder.state_dict(), filename)

    def to(self, device):
        assert isinstance(device, torch.device)
        self.network.to(device)

    def half(self):
        self.network.half()
Deepspeed/BingBertGlue/turing/sources.py
0 → 100644
View file @
31258341
from
tqdm
import
tqdm
from
typing
import
Tuple
from
random
import
shuffle
import
pickle
import
random
import
numpy
as
np
from
pathlib
import
Path
from
pytorch_pretrained_bert.tokenization
import
BertTokenizer
def truncate_input_sequence(tokens_a, tokens_b, max_num_tokens):
    """Truncate the pair in place until len(tokens_a) + len(tokens_b) fits
    within max_num_tokens, always shortening the longer of the two lists."""
    while len(tokens_a) + len(tokens_b) > max_num_tokens:
        trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        assert len(trunc_tokens) >= 1

        # We want to sometimes truncate from the front and sometimes from the
        # back to add more randomness and avoid biases.
        if random.random() < 0.5:
            del trunc_tokens[0]
        else:
            trunc_tokens.pop()
class TokenInstance:
    """ This TokenInstance is a obect to have the basic units of data that should be
    extracted from the raw text file and can be consumed by any BERT like model.
    """

    def __init__(self, tokens_a, tokens_b, is_next, lang="en"):
        self.tokens_a = tokens_a
        self.tokens_b = tokens_b
        self.is_next = is_next  # 0 is if in continuation, 1 if is random
        self.lang = lang

    def get_values(self):
        """Return the (tokens_a, tokens_b, is_next) triple."""
        return (self.tokens_a, self.tokens_b, self.is_next)

    def get_lang(self):
        """Return the language tag for this instance."""
        return self.lang
class QueryPassageDataset:
    """Loads tab-separated query/passage lines from a file into a shuffled
    in-memory list, reading at most ``readin`` lines."""

    def __init__(self, path, readin=20000000):
        all_pairs = []
        with open(path, encoding="utf-8") as fd:
            for i, line in enumerate(tqdm(fd)):
                line = line.replace('\n', '')
                qpl_tuple: Tuple[str, str, str] = line.split('\t')
                all_pairs.append(qpl_tuple)
                if i > readin:
                    break
        shuffle(all_pairs)
        self.all_pairs = all_pairs
        self.len = len(self.all_pairs)

    def __len__(self):
        return self.len
class QueryPassageFineTuningDataset:
    """Loads fine-tuning triples from a tab-separated file, keeping columns
    0, 2 and 4 of each line, shuffled in memory."""

    def __init__(self, path, readin=20000000):
        all_pairs = []
        with open(path, encoding="utf-8") as fd:
            for i, line in enumerate(tqdm(fd)):
                line = line.replace('\n', '')
                entities = line.split('\t')
                # NOTE(review): column meaning (query/passage/label) inferred
                # from the class name — confirm against the data files.
                qpl_tuple: Tuple[str, str, str] = (entities[0], entities[2],
                                                   entities[4])
                all_pairs.append(qpl_tuple)
                if i > readin:
                    break
        shuffle(all_pairs)
        self.all_pairs = all_pairs
        self.len = len(self.all_pairs)

    def __len__(self):
        return self.len
class QueryInstanceDataset:
    """Loads tab-separated instance lines into a shuffled in-memory list,
    reading at most ``readin`` lines (same layout as QueryPassageDataset)."""

    def __init__(self, path, readin=20000000):
        all_pairs = []
        with open(path, encoding="utf-8") as fd:
            for i, line in enumerate(tqdm(fd)):
                line = line.replace('\n', '')
                qpl_tuple: Tuple[str, str, str] = line.split('\t')
                all_pairs.append(qpl_tuple)
                if i > readin:
                    break
        shuffle(all_pairs)
        self.all_pairs = all_pairs
        self.len = len(self.all_pairs)

    def __len__(self):
        return self.len
class PretrainingDataCreator:
    """Builds shuffled BERT pre-training instances (sentence pairs with a
    next-sentence label) from a text file whose documents are single lines
    with segments delimited by ``<sep>``.

    Each document is visited ``dupe_factor`` times so the random split points
    produce different instances on each pass.
    """

    def __init__(self, path, tokenizer: BertTokenizer, max_seq_length,
                 readin: int = 2000000, dupe_factor: int = 5,
                 small_seq_prob: float = 0.1):
        self.dupe_factor = dupe_factor
        self.max_seq_length = max_seq_length
        self.small_seq_prob = small_seq_prob

        documents = []
        instances = []
        with open(path, encoding='utf-8') as fd:
            for i, line in enumerate(tqdm(fd)):
                line = line.replace('\n', '')
                # Expected format (Q,T,U,S,D)
                # query, title, url, snippet, document = line.split('\t')
                # ! remove this following line later
                document = line
                # Skip documents with too few segments to form useful pairs.
                if len(document.split("<sep>")) <= 3:
                    continue
                lines = document.split("<sep>")
                document = [tokenizer.tokenize(seq) for seq in lines]
                documents.append(document)

        documents = [x for x in documents if x]
        self.documents = documents

        for _ in range(self.dupe_factor):
            for index in range(len(self.documents)):
                instances.extend(self.create_training_instance(index))

        shuffle(instances)
        self.instances = instances
        self.len = len(self.instances)
        # Drop the tokenized corpus once instances are materialized.
        self.documents = None
        documents = None

    def __len__(self):
        return self.len

    def __getstate__(self):
        state = self.__dict__.copy()
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)

    def save(self, filename):
        """Pickle this creator (including its instances) to ``filename``."""
        with open(filename, 'wb') as outfile:
            pickle.dump(self, outfile)

    @staticmethod
    def load(filename):
        """Load a previously pickled creator from ``filename``."""
        with open(filename, 'rb') as f:
            return pickle.load(f)

    def create_training_instance(self, index):
        """Turn document ``index`` into a list of TokenInstance pairs for the
        masked-LM / next-sentence-prediction objectives."""
        document = self.documents[index]

        # Need to add [CLS] + 2*[SEP] tokens.
        max_num_tokens = self.max_seq_length - 3

        # We want to maximize the inp sequence but also want inputs similar
        # to our generic task inputs which will be compartively smaller
        # than the data on which we intend to pre-train.
        target_seq_length = max_num_tokens
        if random.random() < self.small_seq_prob:
            target_seq_length = random.randint(5, max_num_tokens)

        # Need to make the sequences split for NSP task for interesting
        # rather than choosing some arbitrary point. If not the NSP
        # task might become way too easy.
        instances = []
        current_chunk = []
        current_length = 0
        i = 0
        while i < len(document):
            segment = document[i]
            current_chunk.append(segment)
            current_length += len(segment)
            if i == len(document) - 1 or current_length >= target_seq_length:
                if current_chunk:
                    # `a_end` is how many segments from `current_chunk` go into
                    # the `A` (first) sentence.
                    a_end = 1
                    if len(current_chunk) >= 2:
                        a_end = random.randint(1, len(current_chunk) - 1)

                    tokens_a = []
                    for j in range(a_end):
                        tokens_a.extend(current_chunk[j])

                    tokens_b = []
                    is_random_next = False
                    # Random Next: with prob 0.5 (or when there is nothing
                    # left for B) take B from a different random document.
                    if len(current_chunk) == 1 or random.random() < 0.5:
                        is_random_next = True
                        target_b_length = target_seq_length - len(tokens_a)

                        # Pick a random document other than this one
                        # (bounded number of retries).
                        for _ in range(10):
                            random_doc_index = random.randint(
                                0, len(self.documents) - 1)
                            if random_doc_index != index:
                                break

                        random_doc = self.documents[random_doc_index]
                        random_start = random.randint(0, len(random_doc) - 1)
                        for j in range(random_start, len(random_doc)):
                            tokens_b.extend(random_doc[j])
                            if len(tokens_b) >= target_b_length:
                                break

                        # We didn't actually use these segments so we "put
                        # them back" so they don't go to waste.
                        num_unused_segments = len(current_chunk) - a_end
                        i -= num_unused_segments
                    # Actual Next
                    else:
                        is_random_next = False
                        for j in range(a_end, len(current_chunk)):
                            tokens_b.extend(current_chunk[j])

                    truncate_input_sequence(tokens_a, tokens_b, max_num_tokens)
                    assert len(tokens_a) >= 1
                    assert len(tokens_b) >= 1

                    instances.append(
                        TokenInstance(tokens_a, tokens_b, int(is_random_next)))

                current_chunk = []
                current_length = 0
            i += 1
        return instances
class CleanBodyDataCreator(PretrainingDataCreator):
    """PretrainingDataCreator variant for "clean body" dumps: lines of
    ``url \t cleanbody \t rand_int`` where the body uses #TAB#/#NULL#/#HASH#
    escapes, #R##N# paragraph breaks and #N# line breaks.

    Only paragraphs with at least 200 tokens are kept as documents.
    """

    def __init__(self, path, tokenizer: BertTokenizer,
                 max_seq_length: int = 512, readin: int = 2000000,
                 dupe_factor: int = 5, small_seq_prob: float = 0.1):
        self.dupe_factor = dupe_factor
        self.max_seq_length = max_seq_length
        self.small_seq_prob = small_seq_prob

        documents = []
        instances = []
        with open(path, encoding='utf-8') as fd:
            for i, line in enumerate(tqdm(fd)):
                line = line.replace('\n', '')
                url, cleanbody, rand_int = line.rstrip("\n").split("\t")
                # Undo the corpus escaping scheme.
                cleanbody = cleanbody.replace("#TAB#", " ") \
                                     .replace("#NULL#", "") \
                                     .replace("#HASH#", "#")
                cleanbody_parts = cleanbody.split("#R##N#")
                for document in cleanbody_parts:
                    lines = document.split("#N#")
                    document = []
                    document_len = 0
                    for seq in lines:
                        tok_seq = tokenizer.tokenize(seq)
                        if len(tok_seq) != 0:
                            document.append(tok_seq)
                            document_len += len(tok_seq)
                    # Keep only reasonably long paragraphs.
                    if document_len >= 200:
                        documents.append(document)

        documents = [x for x in documents if x]
        self.documents = documents

        for _ in range(self.dupe_factor):
            for index in range(len(self.documents)):
                instances.extend(self.create_training_instance(index))

        shuffle(instances)
        self.instances = instances
        self.len = len(self.instances)
        self.documents = None
        documents = None
class WikiNBookCorpusPretrainingDataCreator(PretrainingDataCreator):
    """PretrainingDataCreator variant for Wikipedia/BookCorpus style text:
    one sentence per line, documents separated by blank lines.  Lines with
    two or fewer whitespace-separated words are discarded.
    """

    def __init__(self, path, tokenizer: BertTokenizer,
                 max_seq_length: int = 512, readin: int = 2000000,
                 dupe_factor: int = 6, small_seq_prob: float = 0.1):
        self.dupe_factor = dupe_factor
        self.max_seq_length = max_seq_length
        self.small_seq_prob = small_seq_prob

        documents = []
        instances = []
        with open(path, encoding='utf-8') as fd:
            document = []
            for i, line in enumerate(tqdm(fd)):
                line = line.replace('\n', '')
                if len(line) == 0:
                    # This is end of document
                    documents.append(document)
                    document = []
                # Keep only lines with more than two words.
                if len(line.split(' ')) > 2:
                    document.append(tokenizer.tokenize(line))
            # Flush the trailing document (file may not end with a blank line).
            if len(document) > 0:
                documents.append(document)

        documents = [x for x in documents if x]
        # FIX: guard the debug print — the original printed documents[0]
        # unconditionally, which raises IndexError on an empty corpus.
        if documents:
            print(documents[0])
        print(len(documents))
        self.documents = documents

        for _ in range(self.dupe_factor):
            for index in range(len(self.documents)):
                instances.extend(self.create_training_instance(index))

        shuffle(instances)
        self.instances = instances
        self.len = len(self.instances)
        self.documents = None
        documents = None
class WikiPretrainingDataCreator(PretrainingDataCreator):
    """PretrainingDataCreator variant for a Wikipedia dump where a new
    article starts with a line beginning ``[[`` (article title markup).
    Lines with two or fewer whitespace-separated words are discarded.
    """

    def __init__(self, path, tokenizer: BertTokenizer,
                 max_seq_length: int = 512, readin: int = 2000000,
                 dupe_factor: int = 6, small_seq_prob: float = 0.1):
        self.dupe_factor = dupe_factor
        self.max_seq_length = max_seq_length
        self.small_seq_prob = small_seq_prob

        documents = []
        instances = []
        with open(path, encoding='utf-8') as fd:
            document = []
            for i, line in enumerate(tqdm(fd)):
                line = line.replace('\n', '')
                # A "[[Title]]" line marks the start of a new article; flush
                # the document collected so far.
                if len(line) > 0 and line[:2] == "[[":
                    documents.append(document)
                    document = []
                # Keep only lines with more than two words.
                if len(line.split(' ')) > 2:
                    document.append(tokenizer.tokenize(line))
            # Flush the trailing document.
            if len(document) > 0:
                documents.append(document)

        documents = [x for x in documents if x]
        self.documents = documents

        for _ in range(self.dupe_factor):
            for index in range(len(self.documents)):
                instances.extend(self.create_training_instance(index))

        shuffle(instances)
        self.instances = instances
        self.len = len(self.instances)
        self.documents = None
        documents = None
class NumpyByteInstances:
    """Lazy, sequence-like view that decodes TokenInstance objects out of the
    flat numpy byte arrays owned by a NumpyPretrainingDataCreator.

    Two storage layouts are supported: "separator" datasets delimit tokens
    with a 0x1f byte inside each instance; "no-separator" datasets carry
    explicit per-token offset arrays instead.
    """

    # Byte value of the ASCII "unit separator" (0x1f) used as the in-instance
    # token delimiter for separator-style datasets.
    TOKEN_SEP_VAL = int.from_bytes(b'\x1f', byteorder='big')

    def __init__(self, data_creator):
        self.data_creator = data_creator
        # FIX: the no-separator accessor is a method of THIS class; the
        # original bound self.data_creator.nosep_getitem_fixed, which raised
        # AttributeError for datasets written without separators.
        self.getitem_fixed = (self.sep_getitem_fixed
                              if self.data_creator.use_separators
                              else self.nosep_getitem_fixed)
        # if self.data_creator.multilingual:
        #     self.__getitem__ = self.getitem_multilingual
        # else:
        #     self.__getitem__ = self.getitem_monolingual

    def getitem_multilingual(self, i):
        tokens_a, tokens_b, is_next = self.getitem_fixed(i)
        return TokenInstance(tokens_a, tokens_b, is_next,
                             lang=self.data_creator.lang[i])

    def getitem_monolingual(self, i):
        return TokenInstance(*self.getitem_fixed(i))

    def __getitem__(self, i):
        if self.data_creator.multilingual:
            return self.getitem_multilingual(i)
        else:
            return self.getitem_monolingual(i)

    def nosep_getitem_fixed(self, i):
        """Decode instance ``i`` using the explicit token-offset arrays."""
        if i > self.data_creator.len:
            raise IndexError
        if i < 0:
            i += self.data_creator.len
        instance_start, instance_end = \
            self.data_creator.instance_offsets[i:i + 2]
        tok_offsets_start, tok_offsets_end = \
            self.data_creator.instance_token_offsets[i:i + 2]
        token_offsets = self.data_creator.token_offsets[
            tok_offsets_start:tok_offsets_end]
        tokens_split = self.data_creator.tokens_split[i]
        token_arrs = np.split(
            self.data_creator.data[instance_start:instance_end],
            token_offsets)
        # FIX: tobytes() replaces the tostring() alias removed in NumPy 2.0
        # (identical behavior on older versions).
        tokens = [t.tobytes().decode('utf8') for t in token_arrs]
        return (tokens[:tokens_split], tokens[tokens_split:],
                self.data_creator.is_next[i])

    def sep_getitem_fixed(self, i):
        """Decode instance ``i`` by splitting on the 0x1f separator byte."""
        if i > self.data_creator.len:
            raise IndexError
        if i < 0:
            i += self.data_creator.len
        instance_start, instance_end = \
            self.data_creator.instance_offsets[i:i + 2]
        instance_data = self.data_creator.data[instance_start:instance_end]
        tokens_split = self.data_creator.tokens_split[i]
        # split on the token separator
        token_arrs = np.split(
            instance_data,
            np.where(instance_data == NumpyByteInstances.TOKEN_SEP_VAL)[0])
        # ignore first byte, which will be separator, for tokens after the first
        tokens = [(t[1:] if i > 0 else t).tobytes().decode('utf8')
                  for i, t in enumerate(token_arrs)]
        return (tokens[:tokens_split], tokens[tokens_split:],
                self.data_creator.is_next[i])

    def __len__(self):
        return self.data_creator.len
class NumpyPretrainingDataCreator:
    """Loads a directory of .npy arrays produced by the pretraining data
    pipeline and exposes them as a NumpyByteInstances sequence.

    The presence of ``instance_token_offsets.npy`` selects the no-separator
    layout; the presence of ``lang.npy`` marks a multilingual dataset.
    """

    def __init__(self, path, mmap=False):
        path = Path(path)
        self.path = path
        # Memory-map the arrays read-only when requested instead of loading
        # them fully into RAM.
        mmap_mode = 'r' if mmap else None

        self.data = np.load(str(path / 'data.npy'), mmap_mode=mmap_mode)
        self.is_next = np.load(str(path / 'is_next.npy'),
                               mmap_mode=mmap_mode)
        self.tokens_split = np.load(str(path / 'tokens_split.npy'),
                                    mmap_mode=mmap_mode)
        self.instance_offsets = np.load(str(path / 'instance_offsets.npy'),
                                        mmap_mode=mmap_mode)

        if (path / 'instance_token_offsets.npy').is_file():
            self.use_separators = False
            self.instance_token_offsets = np.load(
                str(path / 'instance_token_offsets.npy'),
                mmap_mode=mmap_mode)
            self.token_offsets = np.load(str(path / 'token_offsets.npy'),
                                         mmap_mode=mmap_mode)
        else:
            self.use_separators = True
            self.instance_token_offsets = None
            self.token_offsets = None

        if (path / 'lang.npy').is_file():
            self.multilingual = True
            self.lang = np.load(str(path / 'lang.npy'), mmap_mode=mmap_mode)
        else:
            self.multilingual = False
            self.lang = None

        self.instances = NumpyByteInstances(self)
        self.len = len(self.is_next)

    def __len__(self):
        return self.len

    @classmethod
    def load(cls, path):
        """Alternate constructor kept for API symmetry with the pickle-based
        creators."""
        return cls(path)
Deepspeed/BingBertGlue/turing/text.py
0 → 100644
View file @
31258341
import
torch
# Padding token id shared by the text utilities below.
PAD = 0


def mask(x):
    """Return a boolean mask that is True wherever ``x`` is not padding."""
    return x != PAD


def torch_long(x):
    """Convert ``x`` to a torch.LongTensor (int64)."""
    return torch.LongTensor(x)
Deepspeed/BingBertGlue/turing/utils.py
0 → 100644
View file @
31258341
import
sys
as
_sys
from
typing
import
List
from
collections
import
_iskeyword
# type: ignore
from
tensorboardX
import
SummaryWriter
import
os
# Subdirectory (under the base path) where tensorboard event files are kept.
SUMMARY_WRITER_DIR_NAME = 'runs'


def get_sample_writer(name, base=".."):
    """Returns a tensorboard summary writer
    """
    return SummaryWriter(
        log_dir=os.path.join(base, SUMMARY_WRITER_DIR_NAME, name))
class TorchTuple(tuple):
    """Tuple base class for generated named-batch types that can be moved
    between devices.  Subclasses generated by ``namedtorchbatch`` override
    ``to``; this base implementation is deliberately abstract."""

    def to(self, device, non_blocking=False):
        raise NotImplementedError("")
# Code template for namedtorchbatch(), derived from the CPython 3.6-era
# collections.namedtuple template, extended with a TorchTuple base class and
# a to() method that moves every torch.Tensor field to a device.
# NOTE(review): this string's internal indentation was reconstructed from the
# upstream namedtuple template — verify against the original file.
_class_template = """\
from builtins import property as _property, tuple as _tuple
from operator import itemgetter as _itemgetter
from collections import OrderedDict
from turing.utils import TorchTuple
import torch

class {typename}(TorchTuple):
    '{typename}({arg_list})'

    __slots__ = ()

    _fields = {field_names!r}

    def __new__(_cls, {arg_list}):
        'Create new instance of {typename}({arg_list})'
        return _tuple.__new__(_cls, ({arg_list}))

    @classmethod
    def _make(cls, iterable, new=tuple.__new__, len=len):
        'Make a new {typename} object from a sequence or iterable'
        result = new(cls, iterable)
        if len(result) != {num_fields:d}:
            raise TypeError('Expected {num_fields:d} arguments, got %d' % len(result))
        return result

    def _replace(_self, **kwds):
        'Return a new {typename} object replacing specified fields with new values'
        result = _self._make(map(kwds.pop, {field_names!r}, _self))
        if kwds:
            raise ValueError('Got unexpected field names: %r' % list(kwds))
        return result

    def __repr__(self):
        'Return a nicely formatted representation string'
        return self.__class__.__name__ + '({repr_fmt})' % self

    @property
    def __dict__(self):
        'A new OrderedDict mapping field names to their values'
        return OrderedDict(zip(self._fields, self))

    def _asdict(self):
        '''Return a new OrderedDict which maps field names to their values.
        This method is obsolete. Use vars(nt) or nt.__dict__ instead.
        '''
        return self.__dict__

    def __getnewargs__(self):
        'Return self as a plain tuple. Used by copy and pickle.'
        return tuple(self)

    def __getstate__(self):
        'Exclude the OrderedDict from pickling'
        return None

    def to(self, device, non_blocking=False):
        _dict = self.__dict__.copy()
        new_dict = dict()
        for key, value in _dict.items():
            if isinstance(value, torch.Tensor):
                if device.type != 'cpu' and non_blocking and torch.cuda.is_available():
                    new_dict[key] = value.cuda(device, non_blocking=non_blocking)
                else:
                    new_dict[key] = value.to(device)
            else:
                new_dict[key] = value
        return {typename}(**new_dict)

{field_defs}
"""

# Per-field fragment used when building the repr format string.
_repr_template = '{name}=%r'

# Per-field property definition inserted at {field_defs}.
_field_template = '''\
    {name} = _property(_itemgetter({index:d}), doc='Alias for field number {index:d}')
'''
def namedtorchbatch(typename: str,
                    field_names: List[str],
                    verbose: bool = False,
                    rename: bool = False):
    """Returns a new subclass of tuple with named fields leveraging use of torch tensors.

    Behaves like :func:`collections.namedtuple`, except the generated class
    derives from ``TorchTuple`` and gains a ``to(device, non_blocking=False)``
    method that returns a copy with every ``torch.Tensor`` field moved to the
    requested device (see ``_class_template``).

    Args:
        typename: name of the generated class.
        field_names: sequence of field names, or a single comma/space
            separated string.
        verbose: if True, print the generated class source.
        rename: if True, silently replace invalid or duplicate field names
            with positional names of the form ``_%d``.
    """
    # Validate the field names. At the user's option, either generate an error
    # message or automatically replace the field name with a valid name.
    if isinstance(field_names, str):
        # Accept "a, b c" style strings, like collections.namedtuple.
        field_names = field_names.replace(',', ' ').split()
    field_names = list(map(str, field_names))
    if rename:
        seen: set = set()
        for index, name in enumerate(field_names):
            # Non-identifiers, keywords, underscore-prefixed and duplicate
            # names are all replaced by their position.
            if (not name.isidentifier() or _iskeyword(name)
                    or name.startswith('_') or name in seen):
                field_names[index] = '_%d' % index
            seen.add(name)
    for name in [typename] + field_names:
        if not name.isidentifier():
            raise ValueError('Type names and field names must be valid '
                             'identifiers: %r' % name)
        if _iskeyword(name):
            raise ValueError('Type names and field names cannot be a '
                             'keyword: %r' % name)
    seen = set()
    for name in field_names:
        if name.startswith('_') and not rename:
            raise ValueError('Field names cannot start with an underscore: '
                             '%r' % name)
        if name in seen:
            raise ValueError('Encountered duplicate field name: %r' % name)
        seen.add(name)

    # Fill-in the class template
    class_definition = _class_template.format(
        typename=typename,
        field_names=tuple(field_names),
        num_fields=len(field_names),
        # repr of the tuple minus quotes and parens yields "a, b, c",
        # a usable argument list for the template.
        arg_list=repr(tuple(field_names)).replace("'", "")[1:-1],
        repr_fmt=', '.join(
            _repr_template.format(name=name) for name in field_names),
        field_defs='\n'.join(
            _field_template.format(index=index, name=name)
            for index, name in enumerate(field_names)))

    # Execute the template string in a temporary namespace and support
    # tracing utilities by setting a value for frame.f_globals['__name__']
    namespace = dict(__name__='namedtuple_%s' % typename)
    exec(class_definition, namespace)
    result = namespace[typename]
    result._source = class_definition  # type: ignore
    if verbose:
        print(result._source)  # type: ignore

    # For pickling to work, the __module__ variable needs to be set to the frame
    # where the named tuple is created. Bypass this step in enviroments where
    # sys._getframe is not defined (Jython for example) or sys._getframe is not
    # defined for arguments greater than 0 (IronPython).
    try:
        result.__module__ = _sys._getframe(1).f_globals.get('__name__', '__main__')
    except (AttributeError, ValueError):
        pass

    return result
Deepspeed/BingBertSquad/1-bit_adam/mpi_ethernet/deepspeed_onebitadam_bsz96_config.json
0 → 100644
View file @
31258341
{
"train_batch_size"
:
96
,
"train_micro_batch_size_per_gpu"
:
3
,
"steps_per_print"
:
100
,
"optimizer"
:
{
"type"
:
"OnebitAdam"
,
"params"
:
{
"lr"
:
3e-5
,
"freeze_step"
:
400
,
"weight_decay"
:
0.0
,
"bias_correction"
:
false
,
"cuda_aware"
:
false
,
"comm_backend_name"
:
"mpi"
}
},
"gradient_clipping"
:
1.0
,
"fp16"
:
{
"enabled"
:
true
}
}
Deepspeed/BingBertSquad/1-bit_adam/mpi_ethernet/run_squad_deepspeed_onebitadam.sh
0 → 100644
View file @
31258341
#!/bin/bash
# If you are able to install pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs),
# we highly recommend you to use the NCCL-based 1-bit Adam
# which has better performance and ease of use
# (see scripts in DeepSpeedExamples/BingBertSquad/1-bit_adam/nccl
# and read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/)

# Cluster geometry.
NUM_NODES=4
NGPU_PER_NODE=8

# Checkpoint / data locations, relative to this script's directory.
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR="$1"
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1

# Refuse to run (and in particular to `rm -rf`) without an explicit output dir;
# the expansion is quoted so paths with spaces cannot be mis-split.
if [ -z "${OUTPUT_DIR}" ]; then
    echo "Usage: $0 <output_dir>" >&2
    exit 1
fi
sudo rm -rf "${OUTPUT_DIR}"

NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
# Gradient accumulation makes PER_GPU_BATCH_SIZE fit within MAX_GPU_BATCH_SIZE.
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
    GRAD_ACCUM_STEPS=1
else
    GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json

# NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 are used to disable infiniband. Remove it if needed.
# (Note: the original ended the argument list with a dangling `\`; removed.)
NCCL_TREE_THRESHOLD=0 NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 deepspeed --launcher=openmpi ../../nvidia_run_squad_deepspeed.py \
    --bert_model bert-large-uncased \
    --do_train \
    --do_lower_case \
    --predict_batch_size 3 \
    --do_predict \
    --train_file "$SQUAD_DIR/train-v1.1.json" \
    --predict_file "$SQUAD_DIR/dev-v1.1.json" \
    --train_batch_size $PER_GPU_BATCH_SIZE \
    --learning_rate ${LR} \
    --num_train_epochs 2.0 \
    --max_seq_length 384 \
    --doc_stride 128 \
    --output_dir "$OUTPUT_DIR" \
    --job_name ${JOB_NAME} \
    --gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
    --fp16 \
    --deepspeed \
    --deepspeed_mpi \
    --deepspeed_transformer_kernel \
    --deepspeed_config ${config_json} \
    --dropout ${DROPOUT} \
    --model_file "$MODEL_FILE" \
    --seed ${SEED} \
    --ckpt_type HF \
    --origin_bert_config_file ${ORIGIN_CONFIG_FILE}
Deepspeed/BingBertSquad/1-bit_adam/mpi_ethernet/run_squad_mpi_onebitadam.sh
0 → 100644
View file @
31258341
#!/bin/bash
# If you are able to install pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs),
# we highly recommend you to use the NCCL-based 1-bit Adam
# which has better performance and ease of use
# (see scripts in DeepSpeedExamples/BingBertSquad/1-bit_adam/nccl
# and read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/)

NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR="$1"
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1

# Guard the destructive cleanup: quoted expansion, and bail out when no
# output directory was supplied.
if [ -z "${OUTPUT_DIR}" ]; then
    echo "Usage: $0 <output_dir>" >&2
    exit 1
fi
sudo rm -rf "${OUTPUT_DIR}"

NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
# Gradient accumulation keeps the per-step micro batch within MAX_GPU_BATCH_SIZE.
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
    GRAD_ACCUM_STEPS=1
else
    GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json

# NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 are used to disable infiniband. Remove it if needed.
# (Dangling trailing `\` after the last argument removed.)
mpirun -n $NGPU -npernode $NGPU_PER_NODE -hostfile /job/hostfile -x UCX_TLS=tcp --mca btl ^openib --mca btl_tcp_if_include eth0 -x NCCL_TREE_THRESHOLD=0 -x NCCL_IB_DISABLE=1 -x NCCL_SOCKET_IFNAME=eth0 python ../../nvidia_run_squad_deepspeed.py \
    --bert_model bert-large-uncased \
    --do_train \
    --do_lower_case \
    --predict_batch_size 3 \
    --do_predict \
    --train_file "$SQUAD_DIR/train-v1.1.json" \
    --predict_file "$SQUAD_DIR/dev-v1.1.json" \
    --train_batch_size $PER_GPU_BATCH_SIZE \
    --learning_rate ${LR} \
    --num_train_epochs 2.0 \
    --max_seq_length 384 \
    --doc_stride 128 \
    --output_dir "$OUTPUT_DIR" \
    --job_name ${JOB_NAME} \
    --gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
    --fp16 \
    --deepspeed \
    --deepspeed_mpi \
    --deepspeed_transformer_kernel \
    --deepspeed_config ${config_json} \
    --dropout ${DROPOUT} \
    --model_file "$MODEL_FILE" \
    --seed ${SEED} \
    --ckpt_type HF \
    --origin_bert_config_file ${ORIGIN_CONFIG_FILE}
Deepspeed/BingBertSquad/1-bit_adam/mpi_infiniband/deepspeed_onebitadam_bsz96_config.json
0 → 100644
View file @
31258341
{
"train_batch_size"
:
96
,
"train_micro_batch_size_per_gpu"
:
3
,
"steps_per_print"
:
100
,
"optimizer"
:
{
"type"
:
"OnebitAdam"
,
"params"
:
{
"lr"
:
3e-5
,
"freeze_step"
:
400
,
"weight_decay"
:
0.0
,
"bias_correction"
:
false
,
"cuda_aware"
:
true
,
"comm_backend_name"
:
"mpi"
}
},
"gradient_clipping"
:
1.0
,
"fp16"
:
{
"enabled"
:
true
}
}
Deepspeed/BingBertSquad/1-bit_adam/mpi_infiniband/run_squad_deepspeed_onebitadam.sh
0 → 100644
View file @
31258341
#!/bin/bash
# If you are able to install pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs),
# we highly recommend you to use the NCCL-based 1-bit Adam
# which has better performance and ease of use
# (see scripts in DeepSpeedExamples/BingBertSquad/1-bit_adam/nccl
# and read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/)

NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR="$1"
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1

# Guard the destructive cleanup: quoted expansion, bail out without an arg.
if [ -z "${OUTPUT_DIR}" ]; then
    echo "Usage: $0 <output_dir>" >&2
    exit 1
fi
sudo rm -rf "${OUTPUT_DIR}"

NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
# Gradient accumulation keeps the micro batch within MAX_GPU_BATCH_SIZE.
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
    GRAD_ACCUM_STEPS=1
else
    GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json

# Infiniband variant: cuda_aware MPI is enabled via the DeepSpeed config.
# (Dangling trailing `\` after the last argument removed.)
NCCL_TREE_THRESHOLD=0 deepspeed --launcher=mvapich ../../nvidia_run_squad_deepspeed.py \
    --bert_model bert-large-uncased \
    --do_train \
    --do_lower_case \
    --predict_batch_size 3 \
    --do_predict \
    --train_file "$SQUAD_DIR/train-v1.1.json" \
    --predict_file "$SQUAD_DIR/dev-v1.1.json" \
    --train_batch_size $PER_GPU_BATCH_SIZE \
    --learning_rate ${LR} \
    --num_train_epochs 2.0 \
    --max_seq_length 384 \
    --doc_stride 128 \
    --output_dir "$OUTPUT_DIR" \
    --job_name ${JOB_NAME} \
    --gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
    --fp16 \
    --deepspeed \
    --deepspeed_mpi \
    --deepspeed_transformer_kernel \
    --deepspeed_config ${config_json} \
    --dropout ${DROPOUT} \
    --model_file "$MODEL_FILE" \
    --seed ${SEED} \
    --ckpt_type HF \
    --origin_bert_config_file ${ORIGIN_CONFIG_FILE}
Deepspeed/BingBertSquad/1-bit_adam/mpi_infiniband/run_squad_mpi_onebitadam.sh
0 → 100644
View file @
31258341
#!/bin/bash
# If you are able to install pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs),
# we highly recommend you to use the NCCL-based 1-bit Adam
# which has better performance and ease of use
# (see scripts in DeepSpeedExamples/BingBertSquad/1-bit_adam/nccl
# and read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/)

NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR="$1"
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1

# Guard the destructive cleanup: quoted expansion, bail out without an arg.
if [ -z "${OUTPUT_DIR}" ]; then
    echo "Usage: $0 <output_dir>" >&2
    exit 1
fi
sudo rm -rf "${OUTPUT_DIR}"

NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
# Gradient accumulation keeps the micro batch within MAX_GPU_BATCH_SIZE.
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
    GRAD_ACCUM_STEPS=1
else
    GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json

# MVAPICH2-GDR launch over infiniband.
# (Dangling trailing `\` after the last argument removed.)
mpirun -n $NGPU -ppn $NGPU_PER_NODE -f /tmp/deepspeed_mvapich_hostfile -env MV2_SUPPORT_DL=1 -env MV2_USE_GDR=0 -env MV2_USE_CUDA=1 -env MV2_USE_GDRCOPY=0 -env MV2_SMP_USE_CMA=0 -env MV2_DEBUG_SHOW_BACKTRACE=1 python ../../nvidia_run_squad_deepspeed.py \
    --bert_model bert-large-uncased \
    --do_train \
    --do_lower_case \
    --predict_batch_size 3 \
    --do_predict \
    --train_file "$SQUAD_DIR/train-v1.1.json" \
    --predict_file "$SQUAD_DIR/dev-v1.1.json" \
    --train_batch_size $PER_GPU_BATCH_SIZE \
    --learning_rate ${LR} \
    --num_train_epochs 2.0 \
    --max_seq_length 384 \
    --doc_stride 128 \
    --output_dir "$OUTPUT_DIR" \
    --job_name ${JOB_NAME} \
    --gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
    --fp16 \
    --deepspeed \
    --deepspeed_mpi \
    --deepspeed_transformer_kernel \
    --deepspeed_config ${config_json} \
    --dropout ${DROPOUT} \
    --model_file "$MODEL_FILE" \
    --seed ${SEED} \
    --ckpt_type HF \
    --origin_bert_config_file ${ORIGIN_CONFIG_FILE}
Deepspeed/BingBertSquad/1-bit_adam/nccl/deepspeed_onebitadam_bsz96_config.json
0 → 100644
View file @
31258341
{
"train_batch_size"
:
96
,
"train_micro_batch_size_per_gpu"
:
3
,
"steps_per_print"
:
100
,
"optimizer"
:
{
"type"
:
"OnebitAdam"
,
"params"
:
{
"lr"
:
3e-5
,
"freeze_step"
:
400
,
"weight_decay"
:
0.0
,
"bias_correction"
:
false
,
"cuda_aware"
:
false
,
"comm_backend_name"
:
"nccl"
}
},
"gradient_clipping"
:
1.0
,
"fp16"
:
{
"enabled"
:
true
}
}
Deepspeed/BingBertSquad/1-bit_adam/nccl/run_squad_deepspeed_onebitadam.sh
0 → 100644
View file @
31258341
#!/bin/bash
# This script requires pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs).
# Read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/

NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR="$1"
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1

# Guard the destructive cleanup: quoted expansion, bail out without an arg.
if [ -z "${OUTPUT_DIR}" ]; then
    echo "Usage: $0 <output_dir>" >&2
    exit 1
fi
sudo rm -rf "${OUTPUT_DIR}"

NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
# Gradient accumulation keeps the micro batch within MAX_GPU_BATCH_SIZE.
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
    GRAD_ACCUM_STEPS=1
else
    GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json

# NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 are used to disable infiniband. Remove it if needed.
# (NCCL backend: no --deepspeed_mpi flag.  Dangling trailing `\` removed.)
NCCL_TREE_THRESHOLD=0 NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 deepspeed ../../nvidia_run_squad_deepspeed.py \
    --bert_model bert-large-uncased \
    --do_train \
    --do_lower_case \
    --predict_batch_size 3 \
    --do_predict \
    --train_file "$SQUAD_DIR/train-v1.1.json" \
    --predict_file "$SQUAD_DIR/dev-v1.1.json" \
    --train_batch_size $PER_GPU_BATCH_SIZE \
    --learning_rate ${LR} \
    --num_train_epochs 2.0 \
    --max_seq_length 384 \
    --doc_stride 128 \
    --output_dir "$OUTPUT_DIR" \
    --job_name ${JOB_NAME} \
    --gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
    --fp16 \
    --deepspeed \
    --deepspeed_transformer_kernel \
    --deepspeed_config ${config_json} \
    --dropout ${DROPOUT} \
    --model_file "$MODEL_FILE" \
    --seed ${SEED} \
    --ckpt_type HF \
    --origin_bert_config_file ${ORIGIN_CONFIG_FILE}
Deepspeed/BingBertSquad/NOTICE.txt
0 → 100644
View file @
31258341
NOTICES AND INFORMATION
Do Not Translate or Localize
This software incorporates material from third parties. Microsoft makes certain
open source code available at https://3rdpartysource.microsoft.com, or you may
send a check or money order for US $5.00, including the product name, the open
source component name, and version number, to:
Source Code Compliance Team
Microsoft Corporation
One Microsoft Way
Redmond, WA 98052
USA
Notwithstanding any other terms, you may reverse engineer this software to the
extent required to debug changes to any libraries licensed under the GNU Lesser
General Public License.
Component. BingBertSquad
Open Source License/Copyright Notice.
Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Deepspeed/BingBertSquad/ckpt/bert-large-uncased-whole-word-masking-config.json
0 → 100644
View file @
31258341
{
"architectures"
:
[
"BertForMaskedLM"
],
"attention_probs_dropout_prob"
:
0.1
,
"hidden_act"
:
"gelu"
,
"hidden_dropout_prob"
:
0.1
,
"hidden_size"
:
1024
,
"initializer_range"
:
0.02
,
"intermediate_size"
:
4096
,
"layer_norm_eps"
:
1e-12
,
"max_position_embeddings"
:
512
,
"model_type"
:
"bert"
,
"num_attention_heads"
:
16
,
"num_hidden_layers"
:
24
,
"pad_token_id"
:
0
,
"type_vocab_size"
:
2
,
"vocab_size"
:
30522
}
Deepspeed/BingBertSquad/convert_bert_ckpt_to_deepspeed.py
0 → 100644
View file @
31258341
# coding=utf-8
# This script references to below file from HuggingFace:
# https://github.com/huggingface/transformers/blob/d541938/src/transformers/modeling_bert.py
#
# It converts Tensorflow and Huggingface checkpoint files to DeepSpeed.
import
os
import
argparse
import
logging
import
torch
import
re
import
numpy
as
np
# Module-wide logging at INFO so every converted weight is reported.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def set_data(param, array):
    """Copy a numpy `array` into `param.data`, validating shape equality first.

    Args:
        param: torch parameter/tensor-like object with `.shape` and `.data`.
        array: numpy array whose values replace `param.data`.

    Raises:
        AssertionError: when the shapes differ; both shapes are attached as
            the exception's args, matching the original behavior.
    """
    # Explicit raise instead of a bare `assert` so the check survives
    # `python -O` (asserts are stripped under optimization).  The exception
    # type and its args tuple are kept identical to the original code.
    if param.shape != array.shape:
        raise AssertionError(param.shape, array.shape)
    param.data = torch.from_numpy(array)
def load_tf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff):
    """ Load tf checkpoints in DeepSpeed model.

    Walks every variable in the TensorFlow checkpoint at `ckpt_path`, maps its
    scoped name onto an attribute path of `model`, and copies the values in via
    `set_data`.  Separate TF Q/K/V kernels/biases are concatenated into the
    DeepSpeed transformer kernel's fused `attn_qkvw`/`attn_qkvb` parameters,
    and the word-embedding matrix is zero-padded by `voc_size_diff` rows
    (DeepSpeed aligns the vocabulary size).  Returns `model`.
    """
    try:
        import re
        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in DeepSpeed, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(ckpt_path)
    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    # Pending Q/K/V pieces, keyed "qw"/"kw"/"vw"/"qb"/"kb"/"vb", merged once
    # all three of a kind have been seen.
    qkv = {}
    for name_str, array in zip(names, arrays):
        name = name_str.split("/")
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
        # which are not required for using pretrained model
        if any(n in [
                "adam_v", "adam_m", "AdamWeightDecayOptimizer",
                "AdamWeightDecayOptimizer_1", "global_step"
        ] for n in name):
            logger.info("Skipping {}".format("/".join(name)))
            continue
        pointer = model  # attribute cursor walked along the scope path
        key = None       # set when this variable is a Q/K/V piece
        skipping = False
        for m_name in name:
            # "layer_3" style scopes split into name + index.
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "output_weights":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "squad":
                pointer = getattr(pointer, "classifier")
            # Special in deepspeed.
            elif name_str.find("bert/pooler/dense") >= 0 and scope_names[0] == "dense":
                pointer = getattr(pointer, "dense_act")
            elif name_str.find("bert/embeddings/LayerNorm/gamma") >= 0 and scope_names[0] == "gamma":
                pointer = getattr(pointer, "weight")
            elif name_str.find("bert/embeddings/LayerNorm/beta") >= 0 and scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info("Skipping {}".format("/".join(name)))
                    skipping = True
                    break
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]

                # For transofrmer kernel layers.
                if scope_names[0] == 'layer':
                    if name_str.find("attention/self/query/kernel") > 0:
                        key = "qw"
                    elif name_str.find("attention/self/query/bias") > 0:
                        key = "qb"
                    elif name_str.find("attention/self/key/kernel") > 0:
                        key = "kw"
                    elif name_str.find("attention/self/key/bias") > 0:
                        key = "kb"
                    elif name_str.find("attention/self/value/kernel") > 0:
                        key = "vw"
                    elif name_str.find("attention/self/value/bias") > 0:
                        key = "vb"
                    elif name_str.find("attention/output/dense/kernel") > 0:
                        pointer = getattr(pointer, "attn_ow")
                    elif name_str.find("attention/output/dense/bias") > 0:
                        pointer = getattr(pointer, "attn_ob")
                    elif name_str.find("attention/output/LayerNorm/gamma") > 0:
                        pointer = getattr(pointer, "attn_nw")
                    elif name_str.find("attention/output/LayerNorm/beta") > 0:
                        pointer = getattr(pointer, "attn_nb")
                    elif name_str.find("intermediate/dense/kernel") > 0:
                        pointer = getattr(pointer, "inter_w")
                    elif name_str.find("intermediate/dense/bias") > 0:
                        pointer = getattr(pointer, "inter_b")
                    elif name_str.find("output/dense/kernel") > 0 and name_str.find("attention") < 0:
                        pointer = getattr(pointer, "output_w")
                    elif name_str.find("output/dense/bias") > 0 and name_str.find("attention") < 0:
                        pointer = getattr(pointer, "output_b")
                    elif name_str.find("output/LayerNorm/gamma") > 0 and name_str.find("attention") < 0:
                        pointer = getattr(pointer, "norm_w")
                    elif name_str.find("output/LayerNorm/beta") > 0 and name_str.find("attention") < 0:
                        pointer = getattr(pointer, "norm_b")
                    else:
                        raise ValueError(f"unexpect scope name {name_str} in transformer layer.")
                    break
        if skipping:
            continue
        if m_name[-11:] == "_embeddings":
            pointer = getattr(pointer, "weight")
        elif "kernel" in name:
            # TF stores dense kernels transposed relative to torch weights.
            array = np.transpose(array)
        if key is not None:
            qkv[key] = array
        if all(k in qkv for k in ("qw", "kw", "vw")):
            array = np.concatenate((qkv["qw"], qkv["kw"], qkv["vw"]), axis=0)
            pointer = getattr(pointer, "attn_qkvw")
            qkv.pop("qw")
            qkv.pop("kw")
            qkv.pop("vw")
        elif all(k in qkv for k in ("qb", "kb", "vb")):
            array = np.concatenate((qkv["qb"], qkv["kb"], qkv["vb"]), axis=0)
            pointer = getattr(pointer, "attn_qkvb")
            qkv.pop("qb")
            qkv.pop("kb")
            qkv.pop("vb")
        elif key is not None:
            # For Q/K/V weight/bias in TF, do nothing if not all ready to merge.
            continue
        # DeepSpeed BERT model has voc_size 8 aligned.
        if voc_size_diff > 0 and name_str.find("embeddings/word_embeddings") >= 0:
            z = np.zeros((voc_size_diff, array.shape[1]), dtype=array.dtype)
            array = np.concatenate((array, z), axis=0)
        set_data(pointer, array)
        logger.info("Initialize DeepSpeed weight {}".format(name))

    return model
def load_hf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff):
    """ Load huggingface checkpoints and convert to a deepspeed model.

    Iterates the state dict stored at `ckpt_path`, maps each dotted parameter
    name onto an attribute path of `model` (which uses the DeepSpeed
    transformer kernel), merges separate Q/K/V weights/biases into the fused
    `attn_qkvw`/`attn_qkvb` parameters, pads the word embedding by
    `voc_size_diff` rows, and copies values in via `set_data`.
    Returns `model`.
    """
    hf_path = os.path.abspath(ckpt_path)
    logger.info("Converting Huggingface checkpoint from {}".format(hf_path))
    # Load weights from Huggingface model
    ckpt = torch.load(hf_path, map_location=torch.device("cpu"))

    # Pending Q/K/V pieces awaiting merge into the fused kernel parameters.
    qkv = {}
    for name_str in ckpt.keys():
        array = ckpt[name_str].numpy()
        logger.info("Loading Huggingface weight {} with shape {}".format(name_str, array.shape))
        name = name_str.split(".")
        pointer = model   # attribute cursor walked along the dotted path
        key = None        # set when this parameter is a Q/K/V piece
        is_layer = False  # True right after seeing the "layer" component
        skipping = False
        for m_name in name:
            # Special in deepspeed.
            if name_str.find("bert.pooler.dense") >= 0 and m_name == "dense":
                pointer = getattr(pointer, "dense_act")
            elif is_layer:
                # Next component is the numeric layer index; handled below.
                pass
            else:
                try:
                    pointer = getattr(pointer, m_name)
                except AttributeError:
                    logger.info("Skipping {}".format(".".join(name)))
                    skipping = True
                    break
            if m_name == "layer":
                is_layer = True
                continue
            if m_name.isnumeric() and is_layer:
                num = int(m_name)
                pointer = pointer[num]
                is_layer = False

                # For transofrmer kernel layers.
                if name_str.find("attention.self.query.weight") > 0:
                    key = "qw"
                elif name_str.find("attention.self.query.bias") > 0:
                    key = "qb"
                elif name_str.find("attention.self.key.weight") > 0:
                    key = "kw"
                elif name_str.find("attention.self.key.bias") > 0:
                    key = "kb"
                elif name_str.find("attention.self.value.weight") > 0:
                    key = "vw"
                elif name_str.find("attention.self.value.bias") > 0:
                    key = "vb"
                elif name_str.find("attention.output.dense.weight") > 0:
                    pointer = getattr(pointer, "attn_ow")
                elif name_str.find("attention.output.dense.bias") > 0:
                    pointer = getattr(pointer, "attn_ob")
                elif name_str.find("attention.output.LayerNorm.weight") > 0:
                    pointer = getattr(pointer, "attn_nw")
                elif name_str.find("attention.output.LayerNorm.bias") > 0:
                    pointer = getattr(pointer, "attn_nb")
                elif name_str.find("intermediate.dense.weight") > 0:
                    pointer = getattr(pointer, "inter_w")
                elif name_str.find("intermediate.dense.bias") > 0:
                    pointer = getattr(pointer, "inter_b")
                elif name_str.find("output.dense.weight") > 0 and name_str.find("attention") < 0:
                    pointer = getattr(pointer, "output_w")
                elif name_str.find("output.dense.bias") > 0 and name_str.find("attention") < 0:
                    pointer = getattr(pointer, "output_b")
                elif name_str.find("output.LayerNorm.weight") > 0 and name_str.find("attention") < 0:
                    pointer = getattr(pointer, "norm_w")
                elif name_str.find("output.LayerNorm.bias") > 0 and name_str.find("attention") < 0:
                    pointer = getattr(pointer, "norm_b")
                else:
                    raise ValueError(f"unexpect scope name {name_str} in transformer layer.")
                break
        if skipping:
            continue
        if key is not None:
            qkv[key] = array
        if all(k in qkv for k in ("qw", "kw", "vw")):
            array = np.concatenate((qkv["qw"], qkv["kw"], qkv["vw"]), axis=0)
            pointer = getattr(pointer, "attn_qkvw")
            qkv.pop("qw")
            qkv.pop("kw")
            qkv.pop("vw")
        elif all(k in qkv for k in ("qb", "kb", "vb")):
            array = np.concatenate((qkv["qb"], qkv["kb"], qkv["vb"]), axis=0)
            pointer = getattr(pointer, "attn_qkvb")
            qkv.pop("qb")
            qkv.pop("kb")
            qkv.pop("vb")
        elif key is not None:
            # For Q/K/V weight/bias in HF, do nothing if not all ready to merge.
            continue
        # DeepSpeed BERT model has voc_size 8 aligned.
        if voc_size_diff > 0 and name_str.find("embeddings.word_embeddings") >= 0:
            z = np.zeros((voc_size_diff, array.shape[1]), dtype=array.dtype)
            array = np.concatenate((array, z), axis=0)
        set_data(pointer, array)
        logger.info("Initialize DeepSpeed weight {}".format(name))
    return model
def load_hf_weights_in_bert_torch(model, ckpt_path, voc_size_diff):
    """ Load huggingface checkpoints and convert to a deepspeed model.

    Non-kernel (plain torch) variant: each dotted parameter name in the state
    dict maps directly onto an attribute path of `model`, except that the
    intermediate/pooler "dense" layers are named "dense_act" in DeepSpeed.
    The word-embedding matrix is zero-padded by `voc_size_diff` rows because
    the DeepSpeed BERT model has its voc_size 8 aligned.

    Args:
        model: DeepSpeed BERT model (transformer kernel disabled).
        ckpt_path: path to a Huggingface torch checkpoint (state dict).
        voc_size_diff: number of zero rows to append to word embeddings.

    Returns:
        The initialized model.
    """
    hf_path = os.path.abspath(ckpt_path)
    logger.info("Converting Huggingface checkpoint from {}".format(hf_path))
    # Load weights from Huggingface model
    ckpt = torch.load(hf_path, map_location=torch.device("cpu"))

    # NOTE: the original carried unused `qkv`/`key`/`is_layer` state copied
    # from the kernel variant; there is no Q/K/V fusion here, so it is removed.
    for name_str in ckpt.keys():
        array = ckpt[name_str].numpy()
        logger.info("Loading Huggingface weight {} with shape {}".format(name_str, array.shape))
        name = name_str.split(".")
        pointer = model  # attribute cursor walked along the dotted path
        skipping = False
        for m_name in name:
            # Special in deepspeed.
            if name_str.find("intermediate.dense") >= 0 and m_name == "dense":
                pointer = getattr(pointer, "dense_act")
            elif name_str.find("pooler.dense") >= 0 and m_name == "dense":
                pointer = getattr(pointer, "dense_act")
            else:
                try:
                    pointer = getattr(pointer, m_name)
                except AttributeError:
                    logger.info("Skipping {}".format(".".join(name)))
                    skipping = True
                    break
        if skipping:
            continue
        # DeepSpeed BERT model has voc_size 8 aligned.
        if voc_size_diff > 0 and name_str.find("embeddings.word_embeddings") >= 0:
            z = np.zeros((voc_size_diff, array.shape[1]), dtype=array.dtype)
            array = np.concatenate((array, z), axis=0)
        set_data(pointer, array)
        logger.info("Initialize DeepSpeed weight {}".format(name))
    return model
def convert_ckpt_to_deepspeed(model, ckpt_type, ckpt_path, vocab_diff, kernel_enabled):
    """Populate `model` in place with weights from a HF or TF checkpoint.

    Args:
        model: DeepSpeed BERT model to initialize.
        ckpt_type: "HF" (Huggingface torch state dict) or "TF" (TensorFlow).
        ckpt_path: path to the checkpoint.
        vocab_diff: rows of zero padding appended to word embeddings
            (DeepSpeed aligns the vocabulary size to a multiple of 8).
        kernel_enabled: whether the DeepSpeed transformer kernel is in use.

    Returns:
        The initialized model (also mutated in place).

    Raises:
        ValueError: for an unknown `ckpt_type`, or for a TF checkpoint when
            the transformer kernel is disabled (unsupported combination).
    """
    # Load weights from checkpoint
    if ckpt_type == "HF":
        if kernel_enabled:
            load_hf_weights_in_bert_kernel(model, ckpt_path, vocab_diff)
        else:
            load_hf_weights_in_bert_torch(model, ckpt_path, vocab_diff)
    elif ckpt_type == "TF":
        if kernel_enabled:
            load_tf_weights_in_bert_kernel(model, ckpt_path, vocab_diff)
        else:
            raise ValueError(
                "--deepspeed_transformer_kernel is required for loading TF checkpoint.")
    else:
        # Name the offending value (original raised a placeholder-free f-string).
        raise ValueError(f"Invalid ckpt_type: {ckpt_type!r}")
    return model
Deepspeed/BingBertSquad/deepspeed_bsz24_config.json
0 → 100644
View file @
31258341
{
"train_batch_size"
:
24
,
"train_micro_batch_size_per_gpu"
:
3
,
"steps_per_print"
:
10
,
"optimizer"
:
{
"type"
:
"Adam"
,
"params"
:
{
"lr"
:
3e-5
,
"weight_decay"
:
0.0
,
"bias_correction"
:
false
}
},
"gradient_clipping"
:
1.0
,
"fp16"
:
{
"enabled"
:
true
}
}
Deepspeed/BingBertSquad/evaluate-v1.1.py
0 → 100644
View file @
31258341
import argparse
import json

# Alias the local SQuAD evaluation module under a name that does not shadow
# the builtin `eval` (the original used `import evaluate as eval`).
import evaluate as squad_eval

if __name__ == '__main__':
    expected_version = '1.1'
    parser = argparse.ArgumentParser(
        description='Evaluation for SQuAD ' + expected_version)
    parser.add_argument('dataset_file', help='Dataset file')
    parser.add_argument('prediction_file', help='Prediction File')
    args = parser.parse_args()
    # Print the {'exact_match': ..., 'f1': ...} result as JSON on stdout.
    print(
        json.dumps(
            squad_eval.evaluate(expected_version, args.dataset_file,
                                args.prediction_file)))
Deepspeed/BingBertSquad/evaluate.py
0 → 100644
View file @
31258341
""" Official evaluation script for v1.1 of the SQuAD dataset. """
from
__future__
import
print_function
from
collections
import
Counter
import
string
import
re
import
argparse
import
json
import
sys
def normalize_answer(s):
    """Canonicalize an answer string for comparison.

    Lowercases, strips punctuation, removes the articles a/an/the, and
    collapses runs of whitespace to single spaces.
    """
    text = s.lower()
    punctuation = set(string.punctuation)
    text = ''.join(c for c in text if c not in punctuation)
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    return ' '.join(text.split())
def f1_score(prediction, ground_truth):
    """Token-level F1 between a prediction and one ground-truth answer."""
    pred_tokens = normalize_answer(prediction).split()
    truth_tokens = normalize_answer(ground_truth).split()
    overlap = Counter(pred_tokens) & Counter(truth_tokens)
    num_same = sum(overlap.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_tokens)
    recall = 1.0 * num_same / len(truth_tokens)
    return (2 * precision * recall) / (precision + recall)
def exact_match_score(prediction, ground_truth):
    """True iff prediction and ground truth normalize to the same string."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Best `metric_fn` score of `prediction` against any ground truth."""
    return max(metric_fn(prediction, gt) for gt in ground_truths)
def evaluate(expected_version, ds_file, pred_file):
    """Score SQuAD `pred_file` predictions against the `ds_file` dataset.

    Warns (on stderr, without aborting) when the dataset version differs from
    `expected_version` and when a question has no prediction; unanswered
    questions score 0 but still count toward the total.

    Returns:
        dict with percentage metrics: {'exact_match': float, 'f1': float}.
    """
    with open(ds_file) as dataset_file:
        dataset_json = json.load(dataset_file)
        if (dataset_json['version'] != expected_version):
            print('Evaluation expects v-' + expected_version +
                  ', but got dataset with v-' + dataset_json['version'],
                  file=sys.stderr)
        dataset = dataset_json['data']
    with open(pred_file) as prediction_file:
        predictions = json.load(prediction_file)
    f1 = exact_match = total = 0
    # Dataset layout: articles -> paragraphs -> question/answer sets.
    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                total += 1
                if qa['id'] not in predictions:
                    message = 'Unanswered question ' + qa['id'] + \
                        ' will receive score 0.'
                    print(message, file=sys.stderr)
                    continue
                # A question may have several acceptable answers; take the
                # best score over all of them.
                ground_truths = list(map(lambda x: x['text'], qa['answers']))
                prediction = predictions[qa['id']]
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths)
    # Convert accumulated scores to percentages.
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total
    return {'exact_match': exact_match, 'f1': f1}
Prev
1
2
3
4
5
6
…
12
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment