dcuai / dlexamples · Commit 316d3f90
Authored Jul 14, 2022 by Pan,Huiwen
Commit message: Add DeepSpeed framework test models (original: 增加ds框架测试模型)
Parent: aebde649
Changes: 227 files; showing 20 changed files with 1972 additions and 0 deletions (+1972 / -0)
Changed files:
- Deepspeed/BingBertGlue/turing/file_utils.py (+256 / -0)
- Deepspeed/BingBertGlue/turing/logger.py (+21 / -0)
- Deepspeed/BingBertGlue/turing/loss.py (+60 / -0)
- Deepspeed/BingBertGlue/turing/models.py (+163 / -0)
- Deepspeed/BingBertGlue/turing/sources.py (+509 / -0)
- Deepspeed/BingBertGlue/turing/text.py (+11 / -0)
- Deepspeed/BingBertGlue/turing/utils.py (+169 / -0)
- Deepspeed/BingBertSquad/1-bit_adam/mpi_ethernet/deepspeed_onebitadam_bsz96_config.json (+20 / -0)
- Deepspeed/BingBertSquad/1-bit_adam/mpi_ethernet/run_squad_deepspeed_onebitadam.sh (+60 / -0)
- Deepspeed/BingBertSquad/1-bit_adam/mpi_ethernet/run_squad_mpi_onebitadam.sh (+60 / -0)
- Deepspeed/BingBertSquad/1-bit_adam/mpi_infiniband/deepspeed_onebitadam_bsz96_config.json (+20 / -0)
- Deepspeed/BingBertSquad/1-bit_adam/mpi_infiniband/run_squad_deepspeed_onebitadam.sh (+59 / -0)
- Deepspeed/BingBertSquad/1-bit_adam/mpi_infiniband/run_squad_mpi_onebitadam.sh (+59 / -0)
- Deepspeed/BingBertSquad/1-bit_adam/nccl/deepspeed_onebitadam_bsz96_config.json (+20 / -0)
- Deepspeed/BingBertSquad/1-bit_adam/nccl/run_squad_deepspeed_onebitadam.sh (+56 / -0)
- Deepspeed/BingBertSquad/NOTICE.txt (+36 / -0)
- Deepspeed/BingBertSquad/ckpt/bert-large-uncased-whole-word-masking-config.json (+19 / -0)
- Deepspeed/BingBertSquad/convert_bert_ckpt_to_deepspeed.py (+340 / -0)
- Deepspeed/BingBertSquad/deepspeed_bsz24_config.json (+18 / -0)
- Deepspeed/BingBertSquad/evaluate-v1.1.py (+16 / -0)
Deepspeed/BingBertGlue/turing/file_utils.py (new file, mode 100644):

```python
"""
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import json
import logging
import os
import shutil
import tempfile
from functools import wraps
from hashlib import sha256
import sys
from io import open

import boto3
import requests
from botocore.exceptions import ClientError
from tqdm import tqdm

try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse

try:
    from pathlib import Path
    PYTORCH_PRETRAINED_BERT_CACHE = Path(
        os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
                  Path.home() / '.pytorch_pretrained_bert'))
except AttributeError:
    PYTORCH_PRETRAINED_BERT_CACHE = os.getenv(
        'PYTORCH_PRETRAINED_BERT_CACHE',
        os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name


def url_to_filename(url, etag=None):
    """
    Convert `url` into a hashed filename in a repeatable way.
    If `etag` is specified, append its hash to the url's, delimited
    by a period.
    """
    url_bytes = url.encode('utf-8')
    url_hash = sha256(url_bytes)
    filename = url_hash.hexdigest()

    if etag:
        etag_bytes = etag.encode('utf-8')
        etag_hash = sha256(etag_bytes)
        filename += '.' + etag_hash.hexdigest()

    return filename


def filename_to_url(filename, cache_dir=None):
    """
    Return the url and etag (which may be ``None``) stored for `filename`.
    Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    cache_path = os.path.join(cache_dir, filename)
    if not os.path.exists(cache_path):
        raise EnvironmentError("file {} not found".format(cache_path))

    meta_path = cache_path + '.json'
    if not os.path.exists(meta_path):
        raise EnvironmentError("file {} not found".format(meta_path))

    with open(meta_path, encoding="utf-8") as meta_file:
        metadata = json.load(meta_file)
    url = metadata['url']
    etag = metadata['etag']

    return url, etag


def cached_path(url_or_filename, cache_dir=None):
    """
    Given something that might be a URL (or might be a local path),
    determine which. If it's a URL, download the file and cache it, and
    return the path to the cached file. If it's already a local path,
    make sure the file exists and then return the path.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
    if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
        url_or_filename = str(url_or_filename)
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    parsed = urlparse(url_or_filename)

    if parsed.scheme in ('http', 'https', 's3'):
        # URL, so get it from the cache (downloading if necessary)
        return get_from_cache(url_or_filename, cache_dir)
    elif os.path.exists(url_or_filename):
        # File, and it exists.
        return url_or_filename
    elif parsed.scheme == '':
        # File, but it doesn't exist.
        raise EnvironmentError("file {} not found".format(url_or_filename))
    else:
        # Something unknown
        raise ValueError(
            "unable to parse {} as a URL or as a local path".format(
                url_or_filename))


def split_s3_path(url):
    """Split a full s3 path into the bucket name and path."""
    parsed = urlparse(url)
    if not parsed.netloc or not parsed.path:
        raise ValueError("bad s3 path {}".format(url))
    bucket_name = parsed.netloc
    s3_path = parsed.path
    # Remove '/' at beginning of path.
    if s3_path.startswith("/"):
        s3_path = s3_path[1:]
    return bucket_name, s3_path


def s3_request(func):
    """
    Wrapper function for s3 requests in order to create more helpful error
    messages.
    """
    @wraps(func)
    def wrapper(url, *args, **kwargs):
        try:
            return func(url, *args, **kwargs)
        except ClientError as exc:
            if int(exc.response["Error"]["Code"]) == 404:
                raise EnvironmentError("file {} not found".format(url))
            else:
                raise

    return wrapper


@s3_request
def s3_etag(url):
    """Check ETag on S3 object."""
    s3_resource = boto3.resource("s3")
    bucket_name, s3_path = split_s3_path(url)
    s3_object = s3_resource.Object(bucket_name, s3_path)
    return s3_object.e_tag


@s3_request
def s3_get(url, temp_file):
    """Pull a file directly from S3."""
    s3_resource = boto3.resource("s3")
    bucket_name, s3_path = split_s3_path(url)
    s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)


def http_get(url, temp_file):
    req = requests.get(url, stream=True)
    content_length = req.headers.get('Content-Length')
    total = int(content_length) if content_length is not None else None
    progress = tqdm(unit="B", total=total)
    for chunk in req.iter_content(chunk_size=1024):
        if chunk:  # filter out keep-alive new chunks
            progress.update(len(chunk))
            temp_file.write(chunk)
    progress.close()


def get_from_cache(url, cache_dir=None):
    """
    Given a URL, look for the corresponding dataset in the local cache.
    If it's not there, download it. Then return the path to the cached file.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    # Get eTag to add to filename, if it exists.
    if url.startswith("s3://"):
        etag = s3_etag(url)
    else:
        response = requests.head(url, allow_redirects=True)
        if response.status_code != 200:
            raise IOError(
                "HEAD request failed for url {} with status code {}".format(
                    url, response.status_code))
        etag = response.headers.get("ETag")

    filename = url_to_filename(url, etag)

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)

    if not os.path.exists(cache_path):
        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        with tempfile.NamedTemporaryFile() as temp_file:
            logger.info("%s not found in cache, downloading to %s", url,
                        temp_file.name)

            # GET file object
            if url.startswith("s3://"):
                s3_get(url, temp_file)
            else:
                http_get(url, temp_file)

            # we are copying the file before closing it, so flush to avoid truncation
            temp_file.flush()
            # shutil.copyfileobj() starts at the current position, so go to the start
            temp_file.seek(0)

            logger.info("copying %s to cache at %s", temp_file.name,
                        cache_path)
            with open(cache_path, 'wb') as cache_file:
                shutil.copyfileobj(temp_file, cache_file)

            logger.info("creating metadata file for %s", cache_path)
            meta = {'url': url, 'etag': etag}
            meta_path = cache_path + '.json'
            with open(meta_path, 'w', encoding="utf-8") as meta_file:
                json.dump(meta, meta_file)

            logger.info("removing temp file %s", temp_file.name)

    return cache_path


def read_set_from_file(filename):
    '''
    Extract a de-duped collection (set) of text from a file.
    Expected file format is one item per line.
    '''
    collection = set()
    with open(filename, 'r', encoding='utf-8') as file_:
        for line in file_:
            collection.add(line.rstrip())
    return collection


def get_file_extension(path, dot=True, lower=True):
    ext = os.path.splitext(path)[1]
    ext = ext if dot else ext[1:]
    return ext.lower() if lower else ext
```
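As a usage sketch (not part of this commit), `cached_path` resolves a URL through the local cache and `url_to_filename` shows the hashed cache-file naming; the URL below is illustrative only:

```python
# Minimal sketch, assuming turing/ is on the import path; the URL is illustrative.
from turing.file_utils import cached_path, url_to_filename

local_path = cached_path("https://example.com/vocab.txt")  # downloads on first call, cached after
print(local_path)  # ~/.pytorch_pretrained_bert/<sha256(url)>.<sha256(etag)>
print(url_to_filename("https://example.com/vocab.txt", etag='"abc123"'))
```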
Deepspeed/BingBertGlue/turing/logger.py (new file, mode 100644):

```python
import logging
import torch.distributed as dist

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO)

logger = logging.getLogger(__name__)


class Logger():
    def __init__(self, cuda=False):
        self.logger = logging.getLogger(__name__)
        self.cuda = cuda

    def info(self, message, *args, **kwargs):
        if (self.cuda and dist.get_rank() == 0) or not self.cuda:
            self.logger.info(message, *args, **kwargs)

    def error(self, message, *args, **kwargs):
        self.logger.error(message, *args, **kwargs)
```
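A usage sketch (not part of this commit) for the rank-aware logger; with `cuda=True` the `info` call is emitted only on distributed rank 0:

```python
# Sketch: assumes the default process group is already initialized when cuda=True,
# e.g. via dist.init_process_group(backend="nccl").
import torch.distributed as dist
from turing.logger import Logger

log = Logger(cuda=dist.is_initialized())
log.info("step %d, loss %.4f", 100, 1.2345)  # rank 0 only when cuda=True
log.error("errors are logged on every rank")
```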
Deepspeed/BingBertGlue/turing/loss.py (new file, mode 100644):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable


class FocalLoss(nn.Module):
    r"""
    This criterion is an implementation of Focal Loss, which is proposed in
    Focal Loss for Dense Object Detection.

        Loss(x, class) = - \alpha (1-softmax(x)[class])^gamma \log(softmax(x)[class])

    The losses are averaged across observations for each minibatch.

    Args:
        alpha(1D Tensor, Variable) : the scalar factor for this criterion
        gamma(float, double) : gamma > 0; reduces the relative loss for well-classified
            examples (p > .5), putting more focus on hard, misclassified examples
        size_average(bool): By default, the losses are averaged over observations for
            each minibatch. However, if the field size_average is set to False, the
            losses are instead summed for each minibatch.
    """
    def __init__(self, class_num, alpha=None, gamma=2, size_average=True):
        super(FocalLoss, self).__init__()
        if alpha is None:
            self.alpha = torch.ones(class_num, 1)
        else:
            if isinstance(alpha, Variable):
                self.alpha = alpha
            else:
                self.alpha = Variable(alpha)
        self.gamma = gamma
        self.class_num = class_num
        self.size_average = size_average

    def forward(self, inputs, targets):
        N = inputs.size(0)
        C = inputs.size(1)
        P = F.softmax(inputs)

        class_mask = inputs.data.new(N, C).fill_(0)
        # class_mask = Variable(class_mask)
        ids = targets.view(-1, 1)
        class_mask.scatter_(1, ids.data, 1.)

        if inputs.is_cuda and not self.alpha.is_cuda:
            self.alpha = self.alpha.cuda()
        alpha = self.alpha[ids.data.view(-1)]

        probs = (P * class_mask).sum(1).view(-1, 1)

        log_p = probs.log()

        batch_loss = -alpha * (torch.pow((1 - probs), self.gamma)) * log_p

        if self.size_average:
            loss = batch_loss.mean()
        else:
            loss = batch_loss.sum()
        return loss
```
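A usage sketch (not part of this commit) exercising `FocalLoss` on random logits for a 5-class problem:

```python
import torch
from turing.loss import FocalLoss

criterion = FocalLoss(class_num=5, gamma=2)
logits = torch.randn(8, 5)           # (batch, num_classes)
targets = torch.randint(0, 5, (8,))  # integer class labels
loss = criterion(logits, targets)
print(loss.item())
```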
Deepspeed/BingBertGlue/turing/models.py (new file, mode 100644):

```python
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss

from turing.utils import TorchTuple

from pytorch_pretrained_bert.modeling import BertModel
from pytorch_pretrained_bert.modeling import BertPreTrainingHeads, PreTrainedBertModel
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE


class BertPretrainingLoss(PreTrainedBertModel):
    def __init__(self, bert_encoder, config):
        super(BertPretrainingLoss, self).__init__(config)
        self.bert = bert_encoder
        self.cls = BertPreTrainingHeads(
            config, self.bert.embeddings.word_embeddings.weight)
        self.cls.apply(self.init_bert_weights)

    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                masked_lm_labels=None,
                next_sentence_label=None):
        sequence_output, pooled_output = self.bert(
            input_ids,
            token_type_ids,
            attention_mask,
            output_all_encoded_layers=False)
        prediction_scores, seq_relationship_score = self.cls(
            sequence_output, pooled_output)

        if masked_lm_labels is not None and next_sentence_label is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
                                          next_sentence_label.view(-1))
            masked_lm_loss = loss_fct(
                prediction_scores.view(-1, self.config.vocab_size),
                masked_lm_labels.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss
            return total_loss
        else:
            return prediction_scores, seq_relationship_score


class BertClassificationLoss(PreTrainedBertModel):
    def __init__(self, bert_encoder, config, num_labels: int = 1):
        super(BertClassificationLoss, self).__init__(config)
        self.bert = bert_encoder
        self.num_labels = num_labels
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.classifier.apply(self.init_bert_weights)

    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                labels=None):
        _, pooled_output = self.bert(
            input_ids,
            token_type_ids,
            attention_mask,
            output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        scores = self.classifier(pooled_output)

        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(scores.view(-1, self.num_labels),
                            labels.view(-1, 1))
            return loss
        else:
            return scores


class BertRegressionLoss(PreTrainedBertModel):
    def __init__(self, bert_encoder, config):
        super(BertRegressionLoss, self).__init__(config)
        self.bert = bert_encoder
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)
        self.classifier.apply(self.init_bert_weights)

    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                labels=None):
        _, pooled_output = self.bert(
            input_ids,
            token_type_ids,
            attention_mask,
            output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        if labels is not None:
            loss_fct = MSELoss()
            loss = loss_fct(logits.view(-1, 1), labels.view(-1, 1))
            return loss
        else:
            return logits


class BertMultiTask:
    def __init__(self, args):
        self.config = args.config

        if not args.use_pretrain:
            if args.progressive_layer_drop:
                print("BertConfigPreLnLayerDrop")
                from nvidia.modelingpreln_layerdrop import BertForPreTrainingPreLN, BertConfig
            else:
                from nvidia.modelingpreln import BertForPreTrainingPreLN, BertConfig

            bert_config = BertConfig(**self.config["bert_model_config"])
            bert_config.vocab_size = len(args.tokenizer.vocab)

            # Padding for divisibility by 8
            if bert_config.vocab_size % 8 != 0:
                bert_config.vocab_size += 8 - (bert_config.vocab_size % 8)
            print("VOCAB SIZE:", bert_config.vocab_size)

            self.network = BertForPreTrainingPreLN(bert_config, args)
        # Use pretrained bert weights
        else:
            self.bert_encoder = BertModel.from_pretrained(
                self.config['bert_model_file'],
                cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
                'distributed_{}'.format(args.local_rank))
            bert_config = self.bert_encoder.config

        self.device = None

    def set_device(self, device):
        self.device = device

    def save(self, filename: str):
        network = self.network.module
        return torch.save(network.state_dict(), filename)

    def load(self, model_state_dict: str):
        return self.network.module.load_state_dict(
            torch.load(model_state_dict,
                       map_location=lambda storage, loc: storage))

    def move_batch(self, batch: TorchTuple, non_blocking=False):
        return batch.to(self.device, non_blocking)

    def eval(self):
        self.network.eval()

    def train(self):
        self.network.train()

    def save_bert(self, filename: str):
        return torch.save(self.bert_encoder.state_dict(), filename)

    def to(self, device):
        assert isinstance(device, torch.device)
        self.network.to(device)

    def half(self):
        self.network.half()
```
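The vocabulary padding above keeps the embedding matrix 8-aligned, which fp16 tensor-core kernels prefer; as a worked example (a sketch, not part of the commit), BERT's stock 30522-token vocabulary pads to 30528:

```python
def pad_vocab_to_multiple_of_8(vocab_size: int) -> int:
    # Same rule as BertMultiTask.__init__ above.
    if vocab_size % 8 != 0:
        vocab_size += 8 - (vocab_size % 8)
    return vocab_size

assert pad_vocab_to_multiple_of_8(30522) == 30528
```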
Deepspeed/BingBertGlue/turing/sources.py (new file, mode 100644):

```python
from tqdm import tqdm
from typing import Tuple
from random import shuffle
import pickle
import random
import numpy as np
from pathlib import Path

from pytorch_pretrained_bert.tokenization import BertTokenizer


def truncate_input_sequence(tokens_a, tokens_b, max_num_tokens):
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_num_tokens:
            break

        trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        assert len(trunc_tokens) >= 1

        # We want to sometimes truncate from the front and sometimes from the
        # back to add more randomness and avoid biases.
        if random.random() < 0.5:
            del trunc_tokens[0]
        else:
            trunc_tokens.pop()


class TokenInstance:
    """ This TokenInstance is an object holding the basic units of data that should
    be extracted from the raw text file and can be consumed by any BERT-like model.
    """
    def __init__(self, tokens_a, tokens_b, is_next, lang="en"):
        self.tokens_a = tokens_a
        self.tokens_b = tokens_b
        self.is_next = is_next  # 0 if continuation, 1 if random
        self.lang = lang

    def get_values(self):
        return (self.tokens_a, self.tokens_b, self.is_next)

    def get_lang(self):
        return self.lang


class QueryPassageDataset:
    def __init__(self, path, readin=20000000):
        all_pairs = []
        with open(path, encoding="utf-8") as fd:
            for i, line in enumerate(tqdm(fd)):
                line = line.replace('\n', '')
                qpl_tuple: Tuple[str, str, str] = line.split('\t')
                all_pairs.append(qpl_tuple)
                if i > readin:
                    break
        shuffle(all_pairs)
        self.all_pairs = all_pairs
        self.len = len(self.all_pairs)

    def __len__(self):
        return self.len


class QueryPassageFineTuningDataset:
    def __init__(self, path, readin=20000000):
        all_pairs = []
        with open(path, encoding="utf-8") as fd:
            for i, line in enumerate(tqdm(fd)):
                line = line.replace('\n', '')
                entities = line.split('\t')
                qpl_tuple: Tuple[str, str, str] = (entities[0], entities[2],
                                                   entities[4])
                all_pairs.append(qpl_tuple)
                if i > readin:
                    break
        shuffle(all_pairs)
        self.all_pairs = all_pairs
        self.len = len(self.all_pairs)

    def __len__(self):
        return self.len


class QueryInstanceDataset:
    def __init__(self, path, readin=20000000):
        all_pairs = []
        with open(path, encoding="utf-8") as fd:
            for i, line in enumerate(tqdm(fd)):
                line = line.replace('\n', '')
                qpl_tuple: Tuple[str, str, str] = line.split('\t')
                all_pairs.append(qpl_tuple)
                if i > readin:
                    break
        shuffle(all_pairs)
        self.all_pairs = all_pairs
        self.len = len(self.all_pairs)

    def __len__(self):
        return self.len


class PretrainingDataCreator:
    def __init__(self,
                 path,
                 tokenizer: BertTokenizer,
                 max_seq_length,
                 readin: int = 2000000,
                 dupe_factor: int = 5,
                 small_seq_prob: float = 0.1):
        self.dupe_factor = dupe_factor
        self.max_seq_length = max_seq_length
        self.small_seq_prob = small_seq_prob

        documents = []
        instances = []
        with open(path, encoding='utf-8') as fd:
            for i, line in enumerate(tqdm(fd)):
                line = line.replace('\n', '')
                # Expected format (Q,T,U,S,D)
                # query, title, url, snippet, document = line.split('\t')
                # ! remove this following line later
                document = line
                if len(document.split("<sep>")) <= 3:
                    continue
                lines = document.split("<sep>")
                document = []
                for seq in lines:
                    document.append(tokenizer.tokenize(seq))
                # document = list(map(tokenizer.tokenize, lines))
                documents.append(document)

        documents = [x for x in documents if x]
        self.documents = documents

        for _ in range(self.dupe_factor):
            for index in range(len(self.documents)):
                instances.extend(self.create_training_instance(index))

        shuffle(instances)
        self.instances = instances
        self.len = len(self.instances)
        self.documents = None
        documents = None

    def __len__(self):
        return self.len

    def __getstate__(self):
        state = self.__dict__.copy()
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)

    def save(self, filename):
        with open(filename, 'wb') as outfile:
            pickle.dump(self, outfile)

    @staticmethod
    def load(filename):
        with open(filename, 'rb') as f:
            return pickle.load(f)

    def create_training_instance(self, index):
        document = self.documents[index]
        # l = 0
        # for s in document:
        #     l += len(s)
        # print(l)
        # print(document)

        # Need to add [CLS] + 2*[SEP] tokens
        max_num_tokens = self.max_seq_length - 3

        # We want to maximize the input sequence but also want inputs similar
        # to our generic task inputs, which will be comparatively smaller
        # than the data on which we intend to pre-train.
        target_seq_length = max_num_tokens
        if random.random() < self.small_seq_prob:
            target_seq_length = random.randint(5, max_num_tokens)

        # Need to make the sequence split for the NSP task interesting
        # rather than choosing some arbitrary point. If not, the NSP
        # task might become way too easy.
        instances = []
        current_chunk = []
        current_length = 0
        i = 0
        while i < len(document):
            segment = document[i]
            current_chunk.append(segment)
            current_length += len(segment)
            if i == len(document) - 1 or current_length >= target_seq_length:
                if current_chunk:
                    # `a_end` is how many segments from `current_chunk` go into the `A`
                    # (first) sentence.
                    a_end = 1
                    if len(current_chunk) >= 2:
                        a_end = random.randint(1, len(current_chunk) - 1)

                    tokens_a = []
                    for j in range(a_end):
                        tokens_a.extend(current_chunk[j])

                    tokens_b = []

                    # Random Next
                    is_random_next = False
                    if len(current_chunk) == 1 or random.random() < 0.5:
                        is_random_next = True
                        target_b_length = target_seq_length - len(tokens_a)

                        # Pick a random document
                        for _ in range(10):
                            random_doc_index = random.randint(
                                0, len(self.documents) - 1)
                            if random_doc_index != index:
                                break

                        random_doc = self.documents[random_doc_index]
                        random_start = random.randint(0, len(random_doc) - 1)
                        for j in range(random_start, len(random_doc)):
                            tokens_b.extend(random_doc[j])
                            if len(tokens_b) >= target_b_length:
                                break

                        # We didn't actually use these segments so we "put them back" so
                        # they don't go to waste.
                        num_unused_segments = len(current_chunk) - a_end
                        i -= num_unused_segments
                    # Actual Next
                    else:
                        is_random_next = False
                        for j in range(a_end, len(current_chunk)):
                            tokens_b.extend(current_chunk[j])

                    truncate_input_sequence(tokens_a, tokens_b, max_num_tokens)

                    assert len(tokens_a) >= 1
                    assert len(tokens_b) >= 1

                    instances.append(
                        TokenInstance(tokens_a, tokens_b, int(is_random_next)))
                    # print(instances[-1])

                current_chunk = []
                current_length = 0
            i += 1
        # print(len(instances))
        return instances


class CleanBodyDataCreator(PretrainingDataCreator):
    def __init__(self,
                 path,
                 tokenizer: BertTokenizer,
                 max_seq_length: int = 512,
                 readin: int = 2000000,
                 dupe_factor: int = 5,
                 small_seq_prob: float = 0.1):
        self.dupe_factor = dupe_factor
        self.max_seq_length = max_seq_length
        self.small_seq_prob = small_seq_prob

        documents = []
        instances = []
        with open(path, encoding='utf-8') as fd:
            for i, line in enumerate(tqdm(fd)):
                line = line.replace('\n', '')
                url, cleanbody, rand_int = line.rstrip("\n").split("\t")
                cleanbody = cleanbody.replace("#TAB#", " ").replace(
                    "#NULL#", "").replace("#HASH#", "#")
                cleanbody_parts = cleanbody.split("#R##N#")
                for document in cleanbody_parts:
                    lines = document.split("#N#")
                    document = []
                    document_len = 0
                    for seq in lines:
                        tok_seq = tokenizer.tokenize(seq)
                        if len(tok_seq) != 0:
                            document.append(tok_seq)
                            document_len += len(tok_seq)
                    if document_len >= 200:
                        documents.append(document)

        documents = [x for x in documents if x]
        self.documents = documents

        for _ in range(self.dupe_factor):
            for index in range(len(self.documents)):
                instances.extend(self.create_training_instance(index))

        shuffle(instances)
        self.instances = instances
        self.len = len(self.instances)
        self.documents = None
        documents = None


class WikiNBookCorpusPretrainingDataCreator(PretrainingDataCreator):
    def __init__(self,
                 path,
                 tokenizer: BertTokenizer,
                 max_seq_length: int = 512,
                 readin: int = 2000000,
                 dupe_factor: int = 6,
                 small_seq_prob: float = 0.1):
        self.dupe_factor = dupe_factor
        self.max_seq_length = max_seq_length
        self.small_seq_prob = small_seq_prob

        documents = []
        instances = []
        with open(path, encoding='utf-8') as fd:
            document = []
            for i, line in enumerate(tqdm(fd)):
                line = line.replace('\n', '')
                # document = line
                # if len(document.split("<sep>")) <= 3:
                #     continue
                if len(line) == 0:  # This is end of document
                    documents.append(document)
                    document = []
                if len(line.split(' ')) > 2:
                    document.append(tokenizer.tokenize(line))
            if len(document) > 0:
                documents.append(document)

        documents = [x for x in documents if x]
        print(documents[0])
        print(len(documents))
        self.documents = documents

        for _ in range(self.dupe_factor):
            for index in range(len(self.documents)):
                instances.extend(self.create_training_instance(index))

        shuffle(instances)
        self.instances = instances
        self.len = len(self.instances)
        self.documents = None
        documents = None


class WikiPretrainingDataCreator(PretrainingDataCreator):
    def __init__(self,
                 path,
                 tokenizer: BertTokenizer,
                 max_seq_length: int = 512,
                 readin: int = 2000000,
                 dupe_factor: int = 6,
                 small_seq_prob: float = 0.1):
        self.dupe_factor = dupe_factor
        self.max_seq_length = max_seq_length
        self.small_seq_prob = small_seq_prob

        documents = []
        instances = []
        with open(path, encoding='utf-8') as fd:
            document = []
            for i, line in enumerate(tqdm(fd)):
                line = line.replace('\n', '')
                if len(line) > 0 and line[:2] == "[[":  # This is end of document
                    documents.append(document)
                    document = []
                if len(line.split(' ')) > 2:
                    document.append(tokenizer.tokenize(line))
            if len(document) > 0:
                documents.append(document)

        documents = [x for x in documents if x]
        # print(len(documents))
        # print(len(documents[0]))
        # print(documents[0][0:10])
        self.documents = documents

        for _ in range(self.dupe_factor):
            for index in range(len(self.documents)):
                instances.extend(self.create_training_instance(index))

        shuffle(instances)
        self.instances = instances
        self.len = len(self.instances)
        self.documents = None
        documents = None


class NumpyByteInstances:
    TOKEN_SEP_VAL = int.from_bytes(b'\x1f', byteorder='big')

    def __init__(self, data_creator):
        self.data_creator = data_creator
        # Note: the original assigned the else-branch from self.data_creator,
        # which has no such method; nosep_getitem_fixed is defined on this class.
        self.getitem_fixed = self.sep_getitem_fixed \
            if self.data_creator.use_separators else self.nosep_getitem_fixed
        # if self.data_creator.multilingual:
        #     self.__getitem__ = self.getitem_multilingual
        # else:
        #     self.__getitem__ = self.getitem_monolingual

    def getitem_multilingual(self, i):
        tokens_a, tokens_b, is_next = self.getitem_fixed(i)
        return TokenInstance(tokens_a, tokens_b, is_next,
                             lang=self.data_creator.lang[i])

    def getitem_monolingual(self, i):
        return TokenInstance(*self.getitem_fixed(i))

    def __getitem__(self, i):
        if self.data_creator.multilingual:
            return self.getitem_multilingual(i)
        else:
            return self.getitem_monolingual(i)

    def nosep_getitem_fixed(self, i):
        if i > self.data_creator.len:
            raise IndexError
        if i < 0:
            i += self.data_creator.len

        instance_start, instance_end = self.data_creator.instance_offsets[i:i + 2]
        tok_offsets_start, tok_offsets_end = \
            self.data_creator.instance_token_offsets[i:i + 2]
        token_offsets = \
            self.data_creator.token_offsets[tok_offsets_start:tok_offsets_end]
        tokens_split = self.data_creator.tokens_split[i]

        token_arrs = np.split(
            self.data_creator.data[instance_start:instance_end], token_offsets)
        tokens = [t.tostring().decode('utf8') for t in token_arrs]
        return (tokens[:tokens_split], tokens[tokens_split:],
                self.data_creator.is_next[i])

    def sep_getitem_fixed(self, i):
        if i > self.data_creator.len:
            raise IndexError
        if i < 0:
            i += self.data_creator.len

        instance_start, instance_end = self.data_creator.instance_offsets[i:i + 2]
        instance_data = self.data_creator.data[instance_start:instance_end]
        tokens_split = self.data_creator.tokens_split[i]

        # split on the token separator
        token_arrs = np.split(
            instance_data,
            np.where(instance_data == NumpyByteInstances.TOKEN_SEP_VAL)[0])
        # ignore first byte, which will be separator, for tokens after the first
        tokens = [(t[1:] if i > 0 else t).tostring().decode('utf8')
                  for i, t in enumerate(token_arrs)]
        return (tokens[:tokens_split], tokens[tokens_split:],
                self.data_creator.is_next[i])

    def __len__(self):
        return self.data_creator.len


class NumpyPretrainingDataCreator:
    def __init__(self, path, mmap=False):
        path = Path(path)
        self.path = path
        mmap_mode = 'r' if mmap else None
        self.data = np.load(str(path / 'data.npy'), mmap_mode=mmap_mode)
        self.is_next = np.load(str(path / 'is_next.npy'), mmap_mode=mmap_mode)
        self.tokens_split = np.load(str(path / 'tokens_split.npy'),
                                    mmap_mode=mmap_mode)
        self.instance_offsets = np.load(str(path / 'instance_offsets.npy'),
                                        mmap_mode=mmap_mode)
        if (path / 'instance_token_offsets.npy').is_file():
            self.use_separators = False
            self.instance_token_offsets = np.load(
                str(path / 'instance_token_offsets.npy'), mmap_mode=mmap_mode)
            self.token_offsets = np.load(str(path / 'token_offsets.npy'),
                                         mmap_mode=mmap_mode)
        else:
            self.use_separators = True
            self.instance_token_offsets = None
            self.token_offsets = None

        if (path / 'lang.npy').is_file():
            self.multilingual = True
            self.lang = np.load(str(path / 'lang.npy'), mmap_mode=mmap_mode)
        else:
            self.multilingual = False
            self.lang = None

        self.instances = NumpyByteInstances(self)
        self.len = len(self.is_next)

    def __len__(self):
        return self.len

    @classmethod
    def load(cls, path):
        return cls(path)
```
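A usage sketch (not part of this commit) of `truncate_input_sequence`, which trims the longer side in place until the pair fits:

```python
import random
from turing.sources import truncate_input_sequence

random.seed(0)
tokens_a = ["tok"] * 300
tokens_b = ["tok"] * 300
truncate_input_sequence(tokens_a, tokens_b, max_num_tokens=509)  # 512 minus [CLS] + 2*[SEP]
assert len(tokens_a) + len(tokens_b) == 509
```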
Deepspeed/BingBertGlue/turing/text.py (new file, mode 100644):

```python
import torch

PAD = 0


def mask(x):
    return x != PAD


def torch_long(x):
    return torch.LongTensor(x)
```
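A usage sketch (not part of this commit): `mask` derives an attention mask from padded ids produced by `torch_long`:

```python
import torch
from turing.text import PAD, mask, torch_long

input_ids = torch_long([[101, 2023, 102, PAD, PAD]])
attention_mask = mask(input_ids)  # tensor([[True, True, True, False, False]])
print(attention_mask.long())
```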
Deepspeed/BingBertGlue/turing/utils.py (new file, mode 100644):

```python
import sys as _sys
from typing import List
from collections import _iskeyword  # type: ignore
from tensorboardX import SummaryWriter
import os

SUMMARY_WRITER_DIR_NAME = 'runs'


def get_sample_writer(name, base=".."):
    """Returns a tensorboard summary writer
    """
    return SummaryWriter(
        log_dir=os.path.join(base, SUMMARY_WRITER_DIR_NAME, name))


class TorchTuple(tuple):
    def to(self, device, non_blocking=False):
        raise NotImplementedError("")


_class_template = """\
from builtins import property as _property, tuple as _tuple
from operator import itemgetter as _itemgetter
from collections import OrderedDict
from turing.utils import TorchTuple
import torch

class {typename}(TorchTuple):
    '{typename}({arg_list})'

    __slots__ = ()

    _fields = {field_names!r}

    def __new__(_cls, {arg_list}):
        'Create new instance of {typename}({arg_list})'
        return _tuple.__new__(_cls, ({arg_list}))

    @classmethod
    def _make(cls, iterable, new=tuple.__new__, len=len):
        'Make a new {typename} object from a sequence or iterable'
        result = new(cls, iterable)
        if len(result) != {num_fields:d}:
            raise TypeError('Expected {num_fields:d} arguments, got %d' % len(result))
        return result

    def _replace(_self, **kwds):
        'Return a new {typename} object replacing specified fields with new values'
        result = _self._make(map(kwds.pop, {field_names!r}, _self))
        if kwds:
            raise ValueError('Got unexpected field names: %r' % list(kwds))
        return result

    def __repr__(self):
        'Return a nicely formatted representation string'
        return self.__class__.__name__ + '({repr_fmt})' % self

    @property
    def __dict__(self):
        'A new OrderedDict mapping field names to their values'
        return OrderedDict(zip(self._fields, self))

    def _asdict(self):
        '''Return a new OrderedDict which maps field names to their values.
        This method is obsolete. Use vars(nt) or nt.__dict__ instead.
        '''
        return self.__dict__

    def __getnewargs__(self):
        'Return self as a plain tuple. Used by copy and pickle.'
        return tuple(self)

    def __getstate__(self):
        'Exclude the OrderedDict from pickling'
        return None

    def to(self, device, non_blocking=False):
        _dict = self.__dict__.copy()
        new_dict = dict()
        for key, value in _dict.items():
            if isinstance(value, torch.Tensor):
                if device.type != 'cpu' and non_blocking and torch.cuda.is_available():
                    new_dict[key] = value.cuda(device, non_blocking=non_blocking)
                else:
                    new_dict[key] = value.to(device)
            else:
                new_dict[key] = value
        return {typename}(**new_dict)

{field_defs}
"""

_repr_template = '{name}=%r'

_field_template = '''\
    {name} = _property(_itemgetter({index:d}), doc='Alias for field number {index:d}')
'''


def namedtorchbatch(typename: str,
                    field_names: List[str],
                    verbose: bool = False,
                    rename: bool = False):
    """Returns a new subclass of tuple with named fields leveraging use of torch tensors.
    """
    # Validate the field names. At the user's option, either generate an error
    # message or automatically replace the field name with a valid name.
    if isinstance(field_names, str):
        field_names = field_names.replace(',', ' ').split()
    field_names = list(map(str, field_names))
    if rename:
        seen: set = set()
        for index, name in enumerate(field_names):
            if (not name.isidentifier() or _iskeyword(name)
                    or name.startswith('_') or name in seen):
                field_names[index] = '_%d' % index
            seen.add(name)
    for name in [typename] + field_names:
        if not name.isidentifier():
            raise ValueError('Type names and field names must be valid '
                             'identifiers: %r' % name)
        if _iskeyword(name):
            raise ValueError('Type names and field names cannot be a '
                             'keyword: %r' % name)
    seen = set()
    for name in field_names:
        if name.startswith('_') and not rename:
            raise ValueError('Field names cannot start with an underscore: '
                             '%r' % name)
        if name in seen:
            raise ValueError('Encountered duplicate field name: %r' % name)
        seen.add(name)

    # Fill-in the class template
    class_definition = _class_template.format(
        typename=typename,
        field_names=tuple(field_names),
        num_fields=len(field_names),
        arg_list=repr(tuple(field_names)).replace("'", "")[1:-1],
        repr_fmt=', '.join(
            _repr_template.format(name=name) for name in field_names),
        field_defs='\n'.join(
            _field_template.format(index=index, name=name)
            for index, name in enumerate(field_names)))

    # Execute the template string in a temporary namespace and support
    # tracing utilities by setting a value for frame.f_globals['__name__']
    namespace = dict(__name__='namedtuple_%s' % typename)
    exec(class_definition, namespace)
    result = namespace[typename]
    result._source = class_definition  # type: ignore
    if verbose:
        print(result._source)  # type: ignore

    # For pickling to work, the __module__ variable needs to be set to the frame
    # where the named tuple is created. Bypass this step in environments where
    # sys._getframe is not defined (Jython for example) or sys._getframe is not
    # defined for arguments greater than 0 (IronPython).
    try:
        result.__module__ = _sys._getframe(1).f_globals.get(
            '__name__', '__main__')
    except (AttributeError, ValueError):
        pass

    return result
```
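A usage sketch (not part of this commit) of `namedtorchbatch`; the generated class's `to()` moves tensor fields to the target device and passes other fields through unchanged:

```python
import torch
from turing.utils import namedtorchbatch

Batch = namedtorchbatch("Batch", ["input_ids", "labels", "guid"])
batch = Batch(input_ids=torch.zeros(2, 8, dtype=torch.long),
              labels=torch.tensor([0, 1]),
              guid="batch-0")
moved = batch.to(torch.device("cpu"))  # non-tensor fields pass through
print(moved.guid, moved.input_ids.shape)
```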
Deepspeed/BingBertSquad/1-bit_adam/mpi_ethernet/deepspeed_onebitadam_bsz96_config.json (new file, mode 100644):

```json
{
  "train_batch_size": 96,
  "train_micro_batch_size_per_gpu": 3,
  "steps_per_print": 100,
  "optimizer": {
    "type": "OnebitAdam",
    "params": {
      "lr": 3e-5,
      "freeze_step": 400,
      "weight_decay": 0.0,
      "bias_correction": false,
      "cuda_aware": false,
      "comm_backend_name": "mpi"
    }
  },
  "gradient_clipping": 1.0,
  "fp16": {
    "enabled": true
  }
}
```
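DeepSpeed requires that `train_batch_size` equal `train_micro_batch_size_per_gpu` x gradient accumulation steps x world size; as a quick sanity check (a sketch, not part of the commit), the 4-node, 8-GPU topology from the launch scripts satisfies 96 = 3 x 1 x 32:

```python
import json

with open("deepspeed_onebitadam_bsz96_config.json") as f:
    cfg = json.load(f)

world_size = 4 * 8  # NUM_NODES * NGPU_PER_NODE from the launch scripts
grad_accum = cfg.get("gradient_accumulation_steps", 1)  # defaults to 1 when omitted
assert cfg["train_batch_size"] == \
    cfg["train_micro_batch_size_per_gpu"] * grad_accum * world_size
```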
Deepspeed/BingBertSquad/1-bit_adam/mpi_ethernet/run_squad_deepspeed_onebitadam.sh (new file, mode 100644):

```bash
# If you are able to install pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs),
# we highly recommend using the NCCL-based 1-bit Adam,
# which has better performance and ease of use
# (see scripts in DeepSpeedExamples/BingBertSquad/1-bit_adam/nccl
# and read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/)

NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR=$1
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1
sudo rm -rf ${OUTPUT_DIR}

NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
    GRAD_ACCUM_STEPS=1
else
    GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json

# NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 are used to disable infiniband. Remove it if needed.
NCCL_TREE_THRESHOLD=0 NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 \
deepspeed --launcher=openmpi ../../nvidia_run_squad_deepspeed.py \
    --bert_model bert-large-uncased \
    --do_train \
    --do_lower_case \
    --predict_batch_size 3 \
    --do_predict \
    --train_file $SQUAD_DIR/train-v1.1.json \
    --predict_file $SQUAD_DIR/dev-v1.1.json \
    --train_batch_size $PER_GPU_BATCH_SIZE \
    --learning_rate ${LR} \
    --num_train_epochs 2.0 \
    --max_seq_length 384 \
    --doc_stride 128 \
    --output_dir $OUTPUT_DIR \
    --job_name ${JOB_NAME} \
    --gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
    --fp16 \
    --deepspeed \
    --deepspeed_mpi \
    --deepspeed_transformer_kernel \
    --deepspeed_config ${config_json} \
    --dropout ${DROPOUT} \
    --model_file $MODEL_FILE \
    --seed ${SEED} \
    --ckpt_type HF \
    --origin_bert_config_file ${ORIGIN_CONFIG_FILE}
```
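Worked through for the topology above: NGPU = 4 * 8 = 32, so PER_GPU_BATCH_SIZE = 96 / 32 = 3; since 3 is not less than MAX_GPU_BATCH_SIZE = 3, GRAD_ACCUM_STEPS = 3 / 3 = 1. On a single 8-GPU node the same script would give PER_GPU_BATCH_SIZE = 12 and GRAD_ACCUM_STEPS = 4.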
Deepspeed/BingBertSquad/1-bit_adam/mpi_ethernet/run_squad_mpi_onebitadam.sh (new file, mode 100644) — identical setup to the previous script, but launched directly with mpirun:

```bash
# If you are able to install pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs),
# we highly recommend using the NCCL-based 1-bit Adam,
# which has better performance and ease of use
# (see scripts in DeepSpeedExamples/BingBertSquad/1-bit_adam/nccl
# and read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/)

NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR=$1
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1
sudo rm -rf ${OUTPUT_DIR}

NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
    GRAD_ACCUM_STEPS=1
else
    GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json

# NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 are used to disable infiniband. Remove it if needed.
mpirun -n $NGPU -npernode $NGPU_PER_NODE -hostfile /job/hostfile \
    -x UCX_TLS=tcp --mca btl ^openib --mca btl_tcp_if_include eth0 \
    -x NCCL_TREE_THRESHOLD=0 -x NCCL_IB_DISABLE=1 -x NCCL_SOCKET_IFNAME=eth0 \
    python ../../nvidia_run_squad_deepspeed.py \
    --bert_model bert-large-uncased \
    --do_train \
    --do_lower_case \
    --predict_batch_size 3 \
    --do_predict \
    --train_file $SQUAD_DIR/train-v1.1.json \
    --predict_file $SQUAD_DIR/dev-v1.1.json \
    --train_batch_size $PER_GPU_BATCH_SIZE \
    --learning_rate ${LR} \
    --num_train_epochs 2.0 \
    --max_seq_length 384 \
    --doc_stride 128 \
    --output_dir $OUTPUT_DIR \
    --job_name ${JOB_NAME} \
    --gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
    --fp16 \
    --deepspeed \
    --deepspeed_mpi \
    --deepspeed_transformer_kernel \
    --deepspeed_config ${config_json} \
    --dropout ${DROPOUT} \
    --model_file $MODEL_FILE \
    --seed ${SEED} \
    --ckpt_type HF \
    --origin_bert_config_file ${ORIGIN_CONFIG_FILE}
```
Deepspeed/BingBertSquad/1-bit_adam/mpi_infiniband/deepspeed_onebitadam_bsz96_config.json (new file, mode 100644) — same as the ethernet config except cuda_aware is enabled:

```json
{
  "train_batch_size": 96,
  "train_micro_batch_size_per_gpu": 3,
  "steps_per_print": 100,
  "optimizer": {
    "type": "OnebitAdam",
    "params": {
      "lr": 3e-5,
      "freeze_step": 400,
      "weight_decay": 0.0,
      "bias_correction": false,
      "cuda_aware": true,
      "comm_backend_name": "mpi"
    }
  },
  "gradient_clipping": 1.0,
  "fp16": {
    "enabled": true
  }
}
```
Deepspeed/BingBertSquad/1-bit_adam/mpi_infiniband/run_squad_deepspeed_onebitadam.sh (new file, mode 100644):

```bash
# If you are able to install pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs),
# we highly recommend using the NCCL-based 1-bit Adam,
# which has better performance and ease of use
# (see scripts in DeepSpeedExamples/BingBertSquad/1-bit_adam/nccl
# and read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/)

NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR=$1
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1
sudo rm -rf ${OUTPUT_DIR}

NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
    GRAD_ACCUM_STEPS=1
else
    GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json

NCCL_TREE_THRESHOLD=0 \
deepspeed --launcher=mvapich ../../nvidia_run_squad_deepspeed.py \
    --bert_model bert-large-uncased \
    --do_train \
    --do_lower_case \
    --predict_batch_size 3 \
    --do_predict \
    --train_file $SQUAD_DIR/train-v1.1.json \
    --predict_file $SQUAD_DIR/dev-v1.1.json \
    --train_batch_size $PER_GPU_BATCH_SIZE \
    --learning_rate ${LR} \
    --num_train_epochs 2.0 \
    --max_seq_length 384 \
    --doc_stride 128 \
    --output_dir $OUTPUT_DIR \
    --job_name ${JOB_NAME} \
    --gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
    --fp16 \
    --deepspeed \
    --deepspeed_mpi \
    --deepspeed_transformer_kernel \
    --deepspeed_config ${config_json} \
    --dropout ${DROPOUT} \
    --model_file $MODEL_FILE \
    --seed ${SEED} \
    --ckpt_type HF \
    --origin_bert_config_file ${ORIGIN_CONFIG_FILE}
```
Deepspeed/BingBertSquad/1-bit_adam/mpi_infiniband/run_squad_mpi_onebitadam.sh (new file, mode 100644) — launched directly with MVAPICH2 mpirun:

```bash
# If you are able to install pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs),
# we highly recommend using the NCCL-based 1-bit Adam,
# which has better performance and ease of use
# (see scripts in DeepSpeedExamples/BingBertSquad/1-bit_adam/nccl
# and read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/)

NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR=$1
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1
sudo rm -rf ${OUTPUT_DIR}

NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
    GRAD_ACCUM_STEPS=1
else
    GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json

mpirun -n $NGPU -ppn $NGPU_PER_NODE -f /tmp/deepspeed_mvapich_hostfile \
    -env MV2_SUPPORT_DL=1 -env MV2_USE_GDR=0 -env MV2_USE_CUDA=1 \
    -env MV2_USE_GDRCOPY=0 -env MV2_SMP_USE_CMA=0 -env MV2_DEBUG_SHOW_BACKTRACE=1 \
    python ../../nvidia_run_squad_deepspeed.py \
    --bert_model bert-large-uncased \
    --do_train \
    --do_lower_case \
    --predict_batch_size 3 \
    --do_predict \
    --train_file $SQUAD_DIR/train-v1.1.json \
    --predict_file $SQUAD_DIR/dev-v1.1.json \
    --train_batch_size $PER_GPU_BATCH_SIZE \
    --learning_rate ${LR} \
    --num_train_epochs 2.0 \
    --max_seq_length 384 \
    --doc_stride 128 \
    --output_dir $OUTPUT_DIR \
    --job_name ${JOB_NAME} \
    --gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
    --fp16 \
    --deepspeed \
    --deepspeed_mpi \
    --deepspeed_transformer_kernel \
    --deepspeed_config ${config_json} \
    --dropout ${DROPOUT} \
    --model_file $MODEL_FILE \
    --seed ${SEED} \
    --ckpt_type HF \
    --origin_bert_config_file ${ORIGIN_CONFIG_FILE}
```
Deepspeed/BingBertSquad/1-bit_adam/nccl/deepspeed_onebitadam_bsz96_config.json (new file, mode 100644) — same as the ethernet config but with the NCCL communication backend:

```json
{
  "train_batch_size": 96,
  "train_micro_batch_size_per_gpu": 3,
  "steps_per_print": 100,
  "optimizer": {
    "type": "OnebitAdam",
    "params": {
      "lr": 3e-5,
      "freeze_step": 400,
      "weight_decay": 0.0,
      "bias_correction": false,
      "cuda_aware": false,
      "comm_backend_name": "nccl"
    }
  },
  "gradient_clipping": 1.0,
  "fp16": {
    "enabled": true
  }
}
```
Deepspeed/BingBertSquad/1-bit_adam/nccl/run_squad_deepspeed_onebitadam.sh (new file, mode 100644):

```bash
# This script requires pytorch >= 1.8
# (and nccl >= 2.8.3 if you have 64 or more GPUs).
# Read the tutorial for more details:
# https://www.deepspeed.ai/tutorials/onebit-adam/

NUM_NODES=4
NGPU_PER_NODE=8
MODEL_FILE="../../ckpt/bert-large-uncased-whole-word-masking-pytorch_model.bin"
ORIGIN_CONFIG_FILE="../../ckpt/bert-large-uncased-whole-word-masking-config.json"
SQUAD_DIR="../../data"
OUTPUT_DIR=$1
LR=3e-5
SEED=$RANDOM
MASTER_PORT=12345
DROPOUT=0.1
sudo rm -rf ${OUTPUT_DIR}

NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=96
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
    GRAD_ACCUM_STEPS=1
else
    GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
JOB_NAME="onebit_deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_onebitadam_bsz96_config.json

# NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 are used to disable infiniband. Remove it if needed.
NCCL_TREE_THRESHOLD=0 NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 \
deepspeed ../../nvidia_run_squad_deepspeed.py \
    --bert_model bert-large-uncased \
    --do_train \
    --do_lower_case \
    --predict_batch_size 3 \
    --do_predict \
    --train_file $SQUAD_DIR/train-v1.1.json \
    --predict_file $SQUAD_DIR/dev-v1.1.json \
    --train_batch_size $PER_GPU_BATCH_SIZE \
    --learning_rate ${LR} \
    --num_train_epochs 2.0 \
    --max_seq_length 384 \
    --doc_stride 128 \
    --output_dir $OUTPUT_DIR \
    --job_name ${JOB_NAME} \
    --gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
    --fp16 \
    --deepspeed \
    --deepspeed_transformer_kernel \
    --deepspeed_config ${config_json} \
    --dropout ${DROPOUT} \
    --model_file $MODEL_FILE \
    --seed ${SEED} \
    --ckpt_type HF \
    --origin_bert_config_file ${ORIGIN_CONFIG_FILE}
```
Deepspeed/BingBertSquad/NOTICE.txt (new file, mode 100644):
NOTICES AND INFORMATION
Do Not Translate or Localize
This software incorporates material from third parties. Microsoft makes certain
open source code available at https://3rdpartysource.microsoft.com, or you may
send a check or money order for US $5.00, including the product name, the open
source component name, and version number, to:
Source Code Compliance Team
Microsoft Corporation
One Microsoft Way
Redmond, WA 98052
USA
Notwithstanding any other terms, you may reverse engineer this software to the
extent required to debug changes to any libraries licensed under the GNU Lesser
General Public License.
Component. BingBertSquad
Open Source License/Copyright Notice.
Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Deepspeed/BingBertSquad/ckpt/bert-large-uncased-whole-word-masking-config.json (new file, mode 100644):

```json
{
  "architectures": ["BertForMaskedLM"],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}
```
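A sketch (not part of this commit) previewing the converter below: vocab_size 30522 is not 8-aligned, so the DeepSpeed model pads the word-embedding matrix with voc_size_diff = 6 zero rows, exactly as the converter's word-embedding branch does:

```python
import numpy as np

embeddings = np.random.randn(30522, 1024).astype(np.float32)  # illustrative weights
voc_size_diff = (8 - 30522 % 8) % 8  # = 6
pad = np.zeros((voc_size_diff, embeddings.shape[1]), dtype=embeddings.dtype)
padded = np.concatenate((embeddings, pad), axis=0)
assert padded.shape == (30528, 1024)
```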
Deepspeed/BingBertSquad/convert_bert_ckpt_to_deepspeed.py
0 → 100644
View file @
316d3f90
# coding=utf-8
# This script references to below file from HuggingFace:
# https://github.com/huggingface/transformers/blob/d541938/src/transformers/modeling_bert.py
#
# It converts Tensorflow and Huggingface checkpoint files to DeepSpeed.
import
os
import
argparse
import
logging
import
torch
import
re
import
numpy
as
np
logging
.
basicConfig
(
level
=
logging
.
INFO
)
logger
=
logging
.
getLogger
(
__name__
)
def
set_data
(
param
,
array
):
try
:
assert
param
.
shape
==
array
.
shape
except
AssertionError
as
e
:
e
.
args
+=
(
param
.
shape
,
array
.
shape
)
raise
param
.
data
=
torch
.
from_numpy
(
array
)
def
load_tf_weights_in_bert_kernel
(
model
,
ckpt_path
,
voc_size_diff
):
""" Load tf checkpoints in DeepSpeed model.
"""
try
:
import
re
import
numpy
as
np
import
tensorflow
as
tf
except
ImportError
:
logger
.
error
(
"Loading a TensorFlow model in DeepSpeed, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path
=
os
.
path
.
abspath
(
ckpt_path
)
logger
.
info
(
"Converting TensorFlow checkpoint from {}"
.
format
(
tf_path
))
# Load weights from TF model
init_vars
=
tf
.
train
.
list_variables
(
tf_path
)
names
=
[]
arrays
=
[]
for
name
,
shape
in
init_vars
:
logger
.
info
(
"Loading TF weight {} with shape {}"
.
format
(
name
,
shape
))
array
=
tf
.
train
.
load_variable
(
tf_path
,
name
)
names
.
append
(
name
)
arrays
.
append
(
array
)
qkv
=
{}
for
name_str
,
array
in
zip
(
names
,
arrays
):
name
=
name_str
.
split
(
"/"
)
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
# which are not required for using pretrained model
if
any
(
n
in
[
"adam_v"
,
"adam_m"
,
"AdamWeightDecayOptimizer"
,
"AdamWeightDecayOptimizer_1"
,
"global_step"
]
for
n
in
name
):
logger
.
info
(
"Skipping {}"
.
format
(
"/"
.
join
(
name
)))
continue
pointer
=
model
key
=
None
skipping
=
False
for
m_name
in
name
:
if
re
.
fullmatch
(
r
"[A-Za-z]+_\d+"
,
m_name
):
scope_names
=
re
.
split
(
r
"_(\d+)"
,
m_name
)
else
:
scope_names
=
[
m_name
]
if
scope_names
[
0
]
==
"kernel"
or
scope_names
[
0
]
==
"gamma"
:
pointer
=
getattr
(
pointer
,
"weight"
)
elif
scope_names
[
0
]
==
"output_bias"
or
scope_names
[
0
]
==
"beta"
:
pointer
=
getattr
(
pointer
,
"bias"
)
elif
scope_names
[
0
]
==
"output_weights"
:
pointer
=
getattr
(
pointer
,
"weight"
)
elif
scope_names
[
0
]
==
"squad"
:
pointer
=
getattr
(
pointer
,
"classifier"
)
# Special in deepspeed.
elif
name_str
.
find
(
"bert/pooler/dense"
)
>=
0
and
scope_names
[
0
]
==
"dense"
:
pointer
=
getattr
(
pointer
,
"dense_act"
)
elif
name_str
.
find
(
"bert/embeddings/LayerNorm/gamma"
)
>=
0
and
scope_names
[
0
]
==
"gamma"
:
pointer
=
getattr
(
pointer
,
"weight"
)
elif
name_str
.
find
(
"bert/embeddings/LayerNorm/beta"
)
>=
0
and
scope_names
[
0
]
==
"beta"
:
pointer
=
getattr
(
pointer
,
"bias"
)
else
:
try
:
pointer
=
getattr
(
pointer
,
scope_names
[
0
])
except
AttributeError
:
logger
.
info
(
"Skipping {}"
.
format
(
"/"
.
join
(
name
)))
skipping
=
True
break
if
len
(
scope_names
)
>=
2
:
num
=
int
(
scope_names
[
1
])
pointer
=
pointer
[
num
]
# For transofrmer kernel layers.
if
scope_names
[
0
]
==
'layer'
:
if
name_str
.
find
(
"attention/self/query/kernel"
)
>
0
:
key
=
"qw"
elif
name_str
.
find
(
"attention/self/query/bias"
)
>
0
:
key
=
"qb"
elif
name_str
.
find
(
"attention/self/key/kernel"
)
>
0
:
key
=
"kw"
elif
name_str
.
find
(
"attention/self/key/bias"
)
>
0
:
key
=
"kb"
elif
name_str
.
find
(
"attention/self/value/kernel"
)
>
0
:
key
=
"vw"
elif
name_str
.
find
(
"attention/self/value/bias"
)
>
0
:
key
=
"vb"
elif
name_str
.
find
(
"attention/output/dense/kernel"
)
>
0
:
pointer
=
getattr
(
pointer
,
"attn_ow"
)
elif
name_str
.
find
(
"attention/output/dense/bias"
)
>
0
:
pointer
=
getattr
(
pointer
,
"attn_ob"
)
elif
name_str
.
find
(
"attention/output/LayerNorm/gamma"
)
>
0
:
pointer
=
getattr
(
pointer
,
"attn_nw"
)
elif
name_str
.
find
(
"attention/output/LayerNorm/beta"
)
>
0
:
pointer
=
getattr
(
pointer
,
"attn_nb"
)
elif
name_str
.
find
(
"intermediate/dense/kernel"
)
>
0
:
pointer
=
getattr
(
pointer
,
"inter_w"
)
elif
name_str
.
find
(
"intermediate/dense/bias"
)
>
0
:
pointer
=
getattr
(
pointer
,
"inter_b"
)
elif
name_str
.
find
(
"output/dense/kernel"
)
>
0
and
name_str
.
find
(
"attention"
)
<
0
:
pointer
=
getattr
(
pointer
,
"output_w"
)
elif
name_str
.
find
(
"output/dense/bias"
)
>
0
and
name_str
.
find
(
"attention"
)
<
0
:
pointer
=
getattr
(
pointer
,
"output_b"
)
elif
name_str
.
find
(
"output/LayerNorm/gamma"
)
>
0
and
name_str
.
find
(
"attention"
)
<
0
:
pointer
=
getattr
(
pointer
,
"norm_w"
)
elif
name_str
.
find
(
"output/LayerNorm/beta"
)
>
0
and
name_str
.
find
(
"attention"
)
<
0
:
pointer
=
getattr
(
pointer
,
"norm_b"
)
else
:
raise
ValueError
(
f
"unexpect scope name
{
name_str
}
in transformer layer."
)
break
if
skipping
:
continue
if
m_name
[
-
11
:]
==
"_embeddings"
:
pointer
=
getattr
(
pointer
,
"weight"
)
elif
"kernel"
in
name
:
array
=
np
.
transpose
(
array
)
if
key
is
not
None
:
qkv
[
key
]
=
array
if
all
(
k
in
qkv
for
k
in
(
"qw"
,
"kw"
,
"vw"
)):
array
=
np
.
concatenate
((
qkv
[
"qw"
],
qkv
[
"kw"
],
qkv
[
"vw"
]),
axis
=
0
)
pointer
=
getattr
(
pointer
,
"attn_qkvw"
)
qkv
.
pop
(
"qw"
)
qkv
.
pop
(
"kw"
)
qkv
.
pop
(
"vw"
)
elif
all
(
k
in
qkv
for
k
in
(
"qb"
,
"kb"
,
"vb"
)):
array
=
np
.
concatenate
((
qkv
[
"qb"
],
qkv
[
"kb"
],
qkv
[
"vb"
]),
axis
=
0
)
pointer
=
getattr
(
pointer
,
"attn_qkvb"
)
qkv
.
pop
(
"qb"
)
qkv
.
pop
(
"kb"
)
qkv
.
pop
(
"vb"
)
elif
key
is
not
None
:
# For Q/K/V weight/bias in TF, do nothing if not all ready to merge.
continue
# DeepSpeed BERT model has voc_size 8 aligned.
if
voc_size_diff
>
0
and
name_str
.
find
(
"embeddings/word_embeddings"
)
>=
0
:
z
=
np
.
zeros
((
voc_size_diff
,
array
.
shape
[
1
]),
dtype
=
array
.
dtype
)
array
=
np
.
concatenate
((
array
,
z
),
axis
=
0
)
set_data
(
pointer
,
array
)
logger
.
info
(
"Initialize DeepSpeed weight {}"
.
format
(
name
))
return
model
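# Note on the Q/K/V handling above: the loop buffers the separate query/key/value
# tensors in `qkv` and only writes the fused parameter once all three are present.
# A minimal sketch of that merge, with illustrative (assumed) BERT-large shapes:
#
#     import numpy as np
#     qw = np.zeros((1024, 1024)); kw = np.zeros((1024, 1024)); vw = np.zeros((1024, 1024))
#     fused = np.concatenate((qw, kw, vw), axis=0)   # shape (3072, 1024): rows are Q|K|V
#
# The DeepSpeed transformer kernel stores this single row-stacked attn_qkvw (and
# the matching attn_qkvb) instead of three separate projection parameters.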
def load_hf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff):
    """ Load huggingface checkpoints and convert to a deepspeed model.
    """
    hf_path = os.path.abspath(ckpt_path)
    logger.info("Converting Huggingface checkpoint from {}".format(hf_path))
    # Load weights from Huggingface model
    ckpt = torch.load(hf_path, map_location=torch.device("cpu"))

    qkv = {}
    for name_str in ckpt.keys():
        array = ckpt[name_str].numpy()
        logger.info("Loading Huggingface weight {} with shape {}".format(
            name_str, array.shape))
        name = name_str.split(".")
        pointer = model
        key = None
        is_layer = False
        skipping = False
        for m_name in name:
            # Special in deepspeed.
            if name_str.find("bert.pooler.dense") >= 0 and m_name == "dense":
                pointer = getattr(pointer, "dense_act")
            elif is_layer:
                pass
            else:
                try:
                    pointer = getattr(pointer, m_name)
                except AttributeError:
                    logger.info("Skipping {}".format(".".join(name)))
                    skipping = True
                    break
            if m_name == "layer":
                is_layer = True
                continue
            if m_name.isnumeric() and is_layer:
                num = int(m_name)
                pointer = pointer[num]
                is_layer = False
                # For transformer kernel layers.
                if name_str.find("attention.self.query.weight") > 0:
                    key = "qw"
                elif name_str.find("attention.self.query.bias") > 0:
                    key = "qb"
                elif name_str.find("attention.self.key.weight") > 0:
                    key = "kw"
                elif name_str.find("attention.self.key.bias") > 0:
                    key = "kb"
                elif name_str.find("attention.self.value.weight") > 0:
                    key = "vw"
                elif name_str.find("attention.self.value.bias") > 0:
                    key = "vb"
                elif name_str.find("attention.output.dense.weight") > 0:
                    pointer = getattr(pointer, "attn_ow")
                elif name_str.find("attention.output.dense.bias") > 0:
                    pointer = getattr(pointer, "attn_ob")
                elif name_str.find("attention.output.LayerNorm.weight") > 0:
                    pointer = getattr(pointer, "attn_nw")
                elif name_str.find("attention.output.LayerNorm.bias") > 0:
                    pointer = getattr(pointer, "attn_nb")
                elif name_str.find("intermediate.dense.weight") > 0:
                    pointer = getattr(pointer, "inter_w")
                elif name_str.find("intermediate.dense.bias") > 0:
                    pointer = getattr(pointer, "inter_b")
                elif name_str.find("output.dense.weight") > 0 and name_str.find("attention") < 0:
                    pointer = getattr(pointer, "output_w")
                elif name_str.find("output.dense.bias") > 0 and name_str.find("attention") < 0:
                    pointer = getattr(pointer, "output_b")
                elif name_str.find("output.LayerNorm.weight") > 0 and name_str.find("attention") < 0:
                    pointer = getattr(pointer, "norm_w")
                elif name_str.find("output.LayerNorm.bias") > 0 and name_str.find("attention") < 0:
                    pointer = getattr(pointer, "norm_b")
                else:
                    raise ValueError(f"unexpected scope name {name_str} in transformer layer.")
                break
        if skipping:
            continue

        if key is not None:
            qkv[key] = array
        if all(k in qkv for k in ("qw", "kw", "vw")):
            array = np.concatenate((qkv["qw"], qkv["kw"], qkv["vw"]), axis=0)
            pointer = getattr(pointer, "attn_qkvw")
            qkv.pop("qw")
            qkv.pop("kw")
            qkv.pop("vw")
        elif all(k in qkv for k in ("qb", "kb", "vb")):
            array = np.concatenate((qkv["qb"], qkv["kb"], qkv["vb"]), axis=0)
            pointer = getattr(pointer, "attn_qkvb")
            qkv.pop("qb")
            qkv.pop("kb")
            qkv.pop("vb")
        elif key is not None:
            # For Q/K/V weight/bias in HF, do nothing if not all ready to merge.
            continue

        # DeepSpeed BERT model has voc_size 8 aligned.
        if voc_size_diff > 0 and name_str.find("embeddings.word_embeddings") >= 0:
            z = np.zeros((voc_size_diff, array.shape[1]), dtype=array.dtype)
            array = np.concatenate((array, z), axis=0)

        set_data(pointer, array)
        logger.info("Initialize DeepSpeed weight {}".format(name))

    return model
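# Note on `voc_size_diff`: the DeepSpeed BERT model keeps the vocabulary size a
# multiple of 8 (see the "voc_size 8 aligned" padding above), so a caller would
# typically compute the difference along these lines (the numbers are an
# illustrative assumption for bert-large-uncased, not read from any checkpoint):
#
#     orig_voc_size = 30522
#     aligned_voc_size = (orig_voc_size + 7) // 8 * 8   # -> 30528
#     voc_size_diff = aligned_voc_size - orig_voc_size  # -> 6 zero rows appended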
def load_hf_weights_in_bert_torch(model, ckpt_path, voc_size_diff):
    """ Load huggingface checkpoints and convert to a deepspeed model.
    """
    hf_path = os.path.abspath(ckpt_path)
    logger.info("Converting Huggingface checkpoint from {}".format(hf_path))
    # Load weights from Huggingface model
    ckpt = torch.load(hf_path, map_location=torch.device("cpu"))

    qkv = {}  # unused on this non-kernel path; kept from the kernel variant
    for name_str in ckpt.keys():
        array = ckpt[name_str].numpy()
        logger.info("Loading Huggingface weight {} with shape {}".format(
            name_str, array.shape))
        name = name_str.split(".")
        pointer = model
        key = None  # unused on this non-kernel path
        is_layer = False  # unused on this non-kernel path
        skipping = False
        for m_name in name:
            # Special in deepspeed.
            if name_str.find("intermediate.dense") >= 0 and m_name == "dense":
                pointer = getattr(pointer, "dense_act")
            elif name_str.find("pooler.dense") >= 0 and m_name == "dense":
                pointer = getattr(pointer, "dense_act")
            else:
                try:
                    pointer = getattr(pointer, m_name)
                except AttributeError:
                    logger.info("Skipping {}".format(".".join(name)))
                    skipping = True
                    break
        if skipping:
            continue

        # DeepSpeed BERT model has voc_size 8 aligned.
        if voc_size_diff > 0 and name_str.find("embeddings.word_embeddings") >= 0:
            z = np.zeros((voc_size_diff, array.shape[1]), dtype=array.dtype)
            array = np.concatenate((array, z), axis=0)

        set_data(pointer, array)
        logger.info("Initialize DeepSpeed weight {}".format(name))

    return model
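# Note: this non-kernel path resolves every dotted name component, including the
# numeric layer indices (e.g. "encoder.layer.3...."), with plain getattr(). That
# works because torch.nn.ModuleList exposes its children through
# Module.__getattr__ as well, as this small check illustrates:
#
#     import torch.nn as nn
#     layers = nn.ModuleList([nn.Linear(4, 4) for _ in range(6)])
#     assert getattr(layers, "3") is layers[3]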
def convert_ckpt_to_deepspeed(model, ckpt_type, ckpt_path, vocab_diff, kernel_enabled):
    # Load weights from checkpoint
    if ckpt_type == "HF":
        if kernel_enabled:
            load_hf_weights_in_bert_kernel(model, ckpt_path, vocab_diff)
        else:
            load_hf_weights_in_bert_torch(model, ckpt_path, vocab_diff)
    elif ckpt_type == "TF":
        if kernel_enabled:
            load_tf_weights_in_bert_kernel(model, ckpt_path, vocab_diff)
        else:
            raise ValueError(
                "--deepspeed_transformer_kernel is required for loading TF checkpoint.")
    else:
        raise ValueError(f"Invalid ckpt_type: {ckpt_type}.")
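# For reference, a minimal sketch of how this entry point might be driven; the
# model constructor and paths below are illustrative assumptions, not part of
# this file's API:
#
#     model = build_bert_model()   # hypothetical helper returning the DeepSpeed BERT model
#     convert_ckpt_to_deepspeed(model, ckpt_type="TF",
#                               ckpt_path="/path/to/bert_model.ckpt",
#                               vocab_diff=6, kernel_enabled=True)
#
# Note that TF checkpoints can only be loaded on the kernel path, as enforced above.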
Deepspeed/BingBertSquad/deepspeed_bsz24_config.json
0 → 100644
View file @
316d3f90
{
  "train_batch_size": 24,
  "train_micro_batch_size_per_gpu": 3,
  "steps_per_print": 10,
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 3e-5,
      "weight_decay": 0.0,
      "bias_correction": false
    }
  },
  "gradient_clipping": 1.0,
  "fp16": {
    "enabled": true
  }
}
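In this config, DeepSpeed requires train_batch_size = train_micro_batch_size_per_gpu × gradient accumulation steps × world size; with a micro batch of 3 and the default accumulation of 1, a batch of 24 implies an 8-GPU run (the GPU count is inferred from the ratio, not stated in the file):

    # Sanity check of the batch-size relation; world_size = 8 is an assumption.
    train_batch_size, micro_batch, grad_accum, world_size = 24, 3, 1, 8
    assert train_batch_size == micro_batch * grad_accum * world_size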
Deepspeed/BingBertSquad/evaluate-v1.1.py
0 → 100644
View file @
316d3f90
import argparse
import json

import evaluate as eval

if __name__ == '__main__':
    expected_version = '1.1'
    parser = argparse.ArgumentParser(
        description='Evaluation for SQuAD ' + expected_version)
    parser.add_argument('dataset_file', help='Dataset file')
    parser.add_argument('prediction_file', help='Prediction File')
    args = parser.parse_args()
    print(json.dumps(
        eval.evaluate(expected_version, args.dataset_file,
                      args.prediction_file)))
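# Note: `evaluate` here is presumably the sibling SQuAD evaluation module in this
# directory, imported under the (builtin-shadowing) alias `eval`. A typical
# invocation (the file names below are illustrative assumptions):
#
#     python evaluate-v1.1.py dev-v1.1.json predictions.json
#
# which prints a JSON dict of metrics such as {"exact_match": ..., "f1": ...}.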