Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
dcuai
dlexamples
Commits
ddd93c0e
Commit
ddd93c0e
authored
Sep 29, 2022
by
hepj
Browse files
增加VIT模型代码
parent
bedf3c0c
Changes
30
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
1080 additions
and
0 deletions
+1080
-0
PyTorch/NLP/Vision_Transformer/submitit_finetune.py
PyTorch/NLP/Vision_Transformer/submitit_finetune.py
+131
-0
PyTorch/NLP/Vision_Transformer/submitit_linprobe.py
PyTorch/NLP/Vision_Transformer/submitit_linprobe.py
+131
-0
PyTorch/NLP/Vision_Transformer/submitit_pretrain.py
PyTorch/NLP/Vision_Transformer/submitit_pretrain.py
+131
-0
PyTorch/NLP/Vision_Transformer/util/crop.py
PyTorch/NLP/Vision_Transformer/util/crop.py
+42
-0
PyTorch/NLP/Vision_Transformer/util/datasets.py
PyTorch/NLP/Vision_Transformer/util/datasets.py
+65
-0
PyTorch/NLP/Vision_Transformer/util/lars.py
PyTorch/NLP/Vision_Transformer/util/lars.py
+47
-0
PyTorch/NLP/Vision_Transformer/util/lr_decay.py
PyTorch/NLP/Vision_Transformer/util/lr_decay.py
+76
-0
PyTorch/NLP/Vision_Transformer/util/lr_sched.py
PyTorch/NLP/Vision_Transformer/util/lr_sched.py
+21
-0
PyTorch/NLP/Vision_Transformer/util/misc.py
PyTorch/NLP/Vision_Transformer/util/misc.py
+340
-0
PyTorch/NLP/Vision_Transformer/util/pos_embed.py
PyTorch/NLP/Vision_Transformer/util/pos_embed.py
+96
-0
No files found.
PyTorch/NLP/Vision_Transformer/submitit_finetune.py
0 → 100644
View file @
ddd93c0e
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# A script to run multinode training with submitit.
# --------------------------------------------------------
import
argparse
import
os
import
uuid
from
pathlib
import
Path
import
main_finetune
as
classification
import
submitit
def
parse_args
():
classification_parser
=
classification
.
get_args_parser
()
parser
=
argparse
.
ArgumentParser
(
"Submitit for MAE finetune"
,
parents
=
[
classification_parser
])
parser
.
add_argument
(
"--ngpus"
,
default
=
8
,
type
=
int
,
help
=
"Number of gpus to request on each node"
)
parser
.
add_argument
(
"--nodes"
,
default
=
2
,
type
=
int
,
help
=
"Number of nodes to request"
)
parser
.
add_argument
(
"--timeout"
,
default
=
4320
,
type
=
int
,
help
=
"Duration of the job"
)
parser
.
add_argument
(
"--job_dir"
,
default
=
""
,
type
=
str
,
help
=
"Job dir. Leave empty for automatic."
)
parser
.
add_argument
(
"--partition"
,
default
=
"learnfair"
,
type
=
str
,
help
=
"Partition where to submit"
)
parser
.
add_argument
(
"--use_volta32"
,
action
=
'store_true'
,
help
=
"Request 32G V100 GPUs"
)
parser
.
add_argument
(
'--comment'
,
default
=
""
,
type
=
str
,
help
=
"Comment to pass to scheduler"
)
return
parser
.
parse_args
()
def
get_shared_folder
()
->
Path
:
user
=
os
.
getenv
(
"USER"
)
if
Path
(
"/checkpoint/"
).
is_dir
():
p
=
Path
(
f
"/checkpoint/
{
user
}
/experiments"
)
p
.
mkdir
(
exist_ok
=
True
)
return
p
raise
RuntimeError
(
"No shared folder available"
)
def
get_init_file
():
# Init file must not exist, but it's parent dir must exist.
os
.
makedirs
(
str
(
get_shared_folder
()),
exist_ok
=
True
)
init_file
=
get_shared_folder
()
/
f
"
{
uuid
.
uuid4
().
hex
}
_init"
if
init_file
.
exists
():
os
.
remove
(
str
(
init_file
))
return
init_file
class
Trainer
(
object
):
def
__init__
(
self
,
args
):
self
.
args
=
args
def
__call__
(
self
):
import
main_finetune
as
classification
self
.
_setup_gpu_args
()
classification
.
main
(
self
.
args
)
def
checkpoint
(
self
):
import
os
import
submitit
self
.
args
.
dist_url
=
get_init_file
().
as_uri
()
checkpoint_file
=
os
.
path
.
join
(
self
.
args
.
output_dir
,
"checkpoint.pth"
)
if
os
.
path
.
exists
(
checkpoint_file
):
self
.
args
.
resume
=
checkpoint_file
print
(
"Requeuing "
,
self
.
args
)
empty_trainer
=
type
(
self
)(
self
.
args
)
return
submitit
.
helpers
.
DelayedSubmission
(
empty_trainer
)
def
_setup_gpu_args
(
self
):
import
submitit
from
pathlib
import
Path
job_env
=
submitit
.
JobEnvironment
()
self
.
args
.
output_dir
=
Path
(
str
(
self
.
args
.
output_dir
).
replace
(
"%j"
,
str
(
job_env
.
job_id
)))
self
.
args
.
log_dir
=
self
.
args
.
output_dir
self
.
args
.
gpu
=
job_env
.
local_rank
self
.
args
.
rank
=
job_env
.
global_rank
self
.
args
.
world_size
=
job_env
.
num_tasks
print
(
f
"Process group:
{
job_env
.
num_tasks
}
tasks, rank:
{
job_env
.
global_rank
}
"
)
def
main
():
args
=
parse_args
()
if
args
.
job_dir
==
""
:
args
.
job_dir
=
get_shared_folder
()
/
"%j"
# Note that the folder will depend on the job_id, to easily track experiments
executor
=
submitit
.
AutoExecutor
(
folder
=
args
.
job_dir
,
slurm_max_num_timeout
=
30
)
num_gpus_per_node
=
args
.
ngpus
nodes
=
args
.
nodes
timeout_min
=
args
.
timeout
partition
=
args
.
partition
kwargs
=
{}
if
args
.
use_volta32
:
kwargs
[
'slurm_constraint'
]
=
'volta32gb'
if
args
.
comment
:
kwargs
[
'slurm_comment'
]
=
args
.
comment
executor
.
update_parameters
(
mem_gb
=
40
*
num_gpus_per_node
,
gpus_per_node
=
num_gpus_per_node
,
tasks_per_node
=
num_gpus_per_node
,
# one task per GPU
cpus_per_task
=
10
,
nodes
=
nodes
,
timeout_min
=
timeout_min
,
# Below are cluster dependent parameters
slurm_partition
=
partition
,
slurm_signal_delay_s
=
120
,
**
kwargs
)
executor
.
update_parameters
(
name
=
"mae"
)
args
.
dist_url
=
get_init_file
().
as_uri
()
args
.
output_dir
=
args
.
job_dir
trainer
=
Trainer
(
args
)
job
=
executor
.
submit
(
trainer
)
# print("Submitted job_id:", job.job_id)
print
(
job
.
job_id
)
if
__name__
==
"__main__"
:
main
()
PyTorch/NLP/Vision_Transformer/submitit_linprobe.py
0 → 100644
View file @
ddd93c0e
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# A script to run multinode training with submitit.
# --------------------------------------------------------
import
argparse
import
os
import
uuid
from
pathlib
import
Path
import
main_linprobe
as
classification
import
submitit
def
parse_args
():
classification_parser
=
classification
.
get_args_parser
()
parser
=
argparse
.
ArgumentParser
(
"Submitit for MAE linear probe"
,
parents
=
[
classification_parser
])
parser
.
add_argument
(
"--ngpus"
,
default
=
8
,
type
=
int
,
help
=
"Number of gpus to request on each node"
)
parser
.
add_argument
(
"--nodes"
,
default
=
2
,
type
=
int
,
help
=
"Number of nodes to request"
)
parser
.
add_argument
(
"--timeout"
,
default
=
4320
,
type
=
int
,
help
=
"Duration of the job"
)
parser
.
add_argument
(
"--job_dir"
,
default
=
""
,
type
=
str
,
help
=
"Job dir. Leave empty for automatic."
)
parser
.
add_argument
(
"--partition"
,
default
=
"learnfair"
,
type
=
str
,
help
=
"Partition where to submit"
)
parser
.
add_argument
(
"--use_volta32"
,
action
=
'store_true'
,
help
=
"Request 32G V100 GPUs"
)
parser
.
add_argument
(
'--comment'
,
default
=
""
,
type
=
str
,
help
=
"Comment to pass to scheduler"
)
return
parser
.
parse_args
()
def
get_shared_folder
()
->
Path
:
user
=
os
.
getenv
(
"USER"
)
if
Path
(
"/checkpoint/"
).
is_dir
():
p
=
Path
(
f
"/checkpoint/
{
user
}
/experiments"
)
p
.
mkdir
(
exist_ok
=
True
)
return
p
raise
RuntimeError
(
"No shared folder available"
)
def
get_init_file
():
# Init file must not exist, but it's parent dir must exist.
os
.
makedirs
(
str
(
get_shared_folder
()),
exist_ok
=
True
)
init_file
=
get_shared_folder
()
/
f
"
{
uuid
.
uuid4
().
hex
}
_init"
if
init_file
.
exists
():
os
.
remove
(
str
(
init_file
))
return
init_file
class
Trainer
(
object
):
def
__init__
(
self
,
args
):
self
.
args
=
args
def
__call__
(
self
):
import
main_linprobe
as
classification
self
.
_setup_gpu_args
()
classification
.
main
(
self
.
args
)
def
checkpoint
(
self
):
import
os
import
submitit
self
.
args
.
dist_url
=
get_init_file
().
as_uri
()
checkpoint_file
=
os
.
path
.
join
(
self
.
args
.
output_dir
,
"checkpoint.pth"
)
if
os
.
path
.
exists
(
checkpoint_file
):
self
.
args
.
resume
=
checkpoint_file
print
(
"Requeuing "
,
self
.
args
)
empty_trainer
=
type
(
self
)(
self
.
args
)
return
submitit
.
helpers
.
DelayedSubmission
(
empty_trainer
)
def
_setup_gpu_args
(
self
):
import
submitit
from
pathlib
import
Path
job_env
=
submitit
.
JobEnvironment
()
self
.
args
.
output_dir
=
Path
(
str
(
self
.
args
.
output_dir
).
replace
(
"%j"
,
str
(
job_env
.
job_id
)))
self
.
args
.
log_dir
=
self
.
args
.
output_dir
self
.
args
.
gpu
=
job_env
.
local_rank
self
.
args
.
rank
=
job_env
.
global_rank
self
.
args
.
world_size
=
job_env
.
num_tasks
print
(
f
"Process group:
{
job_env
.
num_tasks
}
tasks, rank:
{
job_env
.
global_rank
}
"
)
def
main
():
args
=
parse_args
()
if
args
.
job_dir
==
""
:
args
.
job_dir
=
get_shared_folder
()
/
"%j"
# Note that the folder will depend on the job_id, to easily track experiments
executor
=
submitit
.
AutoExecutor
(
folder
=
args
.
job_dir
,
slurm_max_num_timeout
=
30
)
num_gpus_per_node
=
args
.
ngpus
nodes
=
args
.
nodes
timeout_min
=
args
.
timeout
partition
=
args
.
partition
kwargs
=
{}
if
args
.
use_volta32
:
kwargs
[
'slurm_constraint'
]
=
'volta32gb'
if
args
.
comment
:
kwargs
[
'slurm_comment'
]
=
args
.
comment
executor
.
update_parameters
(
mem_gb
=
40
*
num_gpus_per_node
,
gpus_per_node
=
num_gpus_per_node
,
tasks_per_node
=
num_gpus_per_node
,
# one task per GPU
cpus_per_task
=
10
,
nodes
=
nodes
,
timeout_min
=
timeout_min
,
# Below are cluster dependent parameters
slurm_partition
=
partition
,
slurm_signal_delay_s
=
120
,
**
kwargs
)
executor
.
update_parameters
(
name
=
"mae"
)
args
.
dist_url
=
get_init_file
().
as_uri
()
args
.
output_dir
=
args
.
job_dir
trainer
=
Trainer
(
args
)
job
=
executor
.
submit
(
trainer
)
# print("Submitted job_id:", job.job_id)
print
(
job
.
job_id
)
if
__name__
==
"__main__"
:
main
()
PyTorch/NLP/Vision_Transformer/submitit_pretrain.py
0 → 100644
View file @
ddd93c0e
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# A script to run multinode training with submitit.
# --------------------------------------------------------
import
argparse
import
os
import
uuid
from
pathlib
import
Path
import
main_pretrain
as
trainer
import
submitit
def
parse_args
():
trainer_parser
=
trainer
.
get_args_parser
()
parser
=
argparse
.
ArgumentParser
(
"Submitit for MAE pretrain"
,
parents
=
[
trainer_parser
])
parser
.
add_argument
(
"--ngpus"
,
default
=
8
,
type
=
int
,
help
=
"Number of gpus to request on each node"
)
parser
.
add_argument
(
"--nodes"
,
default
=
2
,
type
=
int
,
help
=
"Number of nodes to request"
)
parser
.
add_argument
(
"--timeout"
,
default
=
4320
,
type
=
int
,
help
=
"Duration of the job"
)
parser
.
add_argument
(
"--job_dir"
,
default
=
""
,
type
=
str
,
help
=
"Job dir. Leave empty for automatic."
)
parser
.
add_argument
(
"--partition"
,
default
=
"learnfair"
,
type
=
str
,
help
=
"Partition where to submit"
)
parser
.
add_argument
(
"--use_volta32"
,
action
=
'store_true'
,
help
=
"Request 32G V100 GPUs"
)
parser
.
add_argument
(
'--comment'
,
default
=
""
,
type
=
str
,
help
=
"Comment to pass to scheduler"
)
return
parser
.
parse_args
()
def
get_shared_folder
()
->
Path
:
user
=
os
.
getenv
(
"USER"
)
if
Path
(
"/checkpoint/"
).
is_dir
():
p
=
Path
(
f
"/checkpoint/
{
user
}
/experiments"
)
p
.
mkdir
(
exist_ok
=
True
)
return
p
raise
RuntimeError
(
"No shared folder available"
)
def
get_init_file
():
# Init file must not exist, but it's parent dir must exist.
os
.
makedirs
(
str
(
get_shared_folder
()),
exist_ok
=
True
)
init_file
=
get_shared_folder
()
/
f
"
{
uuid
.
uuid4
().
hex
}
_init"
if
init_file
.
exists
():
os
.
remove
(
str
(
init_file
))
return
init_file
class
Trainer
(
object
):
def
__init__
(
self
,
args
):
self
.
args
=
args
def
__call__
(
self
):
import
main_pretrain
as
trainer
self
.
_setup_gpu_args
()
trainer
.
main
(
self
.
args
)
def
checkpoint
(
self
):
import
os
import
submitit
self
.
args
.
dist_url
=
get_init_file
().
as_uri
()
checkpoint_file
=
os
.
path
.
join
(
self
.
args
.
output_dir
,
"checkpoint.pth"
)
if
os
.
path
.
exists
(
checkpoint_file
):
self
.
args
.
resume
=
checkpoint_file
print
(
"Requeuing "
,
self
.
args
)
empty_trainer
=
type
(
self
)(
self
.
args
)
return
submitit
.
helpers
.
DelayedSubmission
(
empty_trainer
)
def
_setup_gpu_args
(
self
):
import
submitit
from
pathlib
import
Path
job_env
=
submitit
.
JobEnvironment
()
self
.
args
.
output_dir
=
Path
(
str
(
self
.
args
.
output_dir
).
replace
(
"%j"
,
str
(
job_env
.
job_id
)))
self
.
args
.
log_dir
=
self
.
args
.
output_dir
self
.
args
.
gpu
=
job_env
.
local_rank
self
.
args
.
rank
=
job_env
.
global_rank
self
.
args
.
world_size
=
job_env
.
num_tasks
print
(
f
"Process group:
{
job_env
.
num_tasks
}
tasks, rank:
{
job_env
.
global_rank
}
"
)
def
main
():
args
=
parse_args
()
if
args
.
job_dir
==
""
:
args
.
job_dir
=
get_shared_folder
()
/
"%j"
# Note that the folder will depend on the job_id, to easily track experiments
executor
=
submitit
.
AutoExecutor
(
folder
=
args
.
job_dir
,
slurm_max_num_timeout
=
30
)
num_gpus_per_node
=
args
.
ngpus
nodes
=
args
.
nodes
timeout_min
=
args
.
timeout
partition
=
args
.
partition
kwargs
=
{}
if
args
.
use_volta32
:
kwargs
[
'slurm_constraint'
]
=
'volta32gb'
if
args
.
comment
:
kwargs
[
'slurm_comment'
]
=
args
.
comment
executor
.
update_parameters
(
mem_gb
=
40
*
num_gpus_per_node
,
gpus_per_node
=
num_gpus_per_node
,
tasks_per_node
=
num_gpus_per_node
,
# one task per GPU
cpus_per_task
=
10
,
nodes
=
nodes
,
timeout_min
=
timeout_min
,
# max is 60 * 72
# Below are cluster dependent parameters
slurm_partition
=
partition
,
slurm_signal_delay_s
=
120
,
**
kwargs
)
executor
.
update_parameters
(
name
=
"mae"
)
args
.
dist_url
=
get_init_file
().
as_uri
()
args
.
output_dir
=
args
.
job_dir
trainer
=
Trainer
(
args
)
job
=
executor
.
submit
(
trainer
)
# print("Submitted job_id:", job.job_id)
print
(
job
.
job_id
)
if
__name__
==
"__main__"
:
main
()
PyTorch/NLP/Vision_Transformer/util/crop.py
0 → 100644
View file @
ddd93c0e
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import
math
import
torch
from
torchvision
import
transforms
from
torchvision.transforms
import
functional
as
F
class
RandomResizedCrop
(
transforms
.
RandomResizedCrop
):
"""
RandomResizedCrop for matching TF/TPU implementation: no for-loop is used.
This may lead to results different with torchvision's version.
Following BYOL's TF code:
https://github.com/deepmind/deepmind-research/blob/master/byol/utils/dataset.py#L206
"""
@
staticmethod
def
get_params
(
img
,
scale
,
ratio
):
width
,
height
=
F
.
_get_image_size
(
img
)
area
=
height
*
width
target_area
=
area
*
torch
.
empty
(
1
).
uniform_
(
scale
[
0
],
scale
[
1
]).
item
()
log_ratio
=
torch
.
log
(
torch
.
tensor
(
ratio
))
aspect_ratio
=
torch
.
exp
(
torch
.
empty
(
1
).
uniform_
(
log_ratio
[
0
],
log_ratio
[
1
])
).
item
()
w
=
int
(
round
(
math
.
sqrt
(
target_area
*
aspect_ratio
)))
h
=
int
(
round
(
math
.
sqrt
(
target_area
/
aspect_ratio
)))
w
=
min
(
w
,
width
)
h
=
min
(
h
,
height
)
i
=
torch
.
randint
(
0
,
height
-
h
+
1
,
size
=
(
1
,)).
item
()
j
=
torch
.
randint
(
0
,
width
-
w
+
1
,
size
=
(
1
,)).
item
()
return
i
,
j
,
h
,
w
\ No newline at end of file
PyTorch/NLP/Vision_Transformer/util/datasets.py
0 → 100644
View file @
ddd93c0e
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# DeiT: https://github.com/facebookresearch/deit
# --------------------------------------------------------
import
os
import
PIL
from
torchvision
import
datasets
,
transforms
from
timm.data
import
create_transform
from
timm.data.constants
import
IMAGENET_DEFAULT_MEAN
,
IMAGENET_DEFAULT_STD
def
build_dataset
(
is_train
,
args
):
transform
=
build_transform
(
is_train
,
args
)
root
=
os
.
path
.
join
(
args
.
data_path
,
'train'
if
is_train
else
'val'
)
dataset
=
datasets
.
ImageFolder
(
root
,
transform
=
transform
)
print
(
dataset
)
return
dataset
def
build_transform
(
is_train
,
args
):
mean
=
IMAGENET_DEFAULT_MEAN
std
=
IMAGENET_DEFAULT_STD
# train transform
if
is_train
:
# this should always dispatch to transforms_imagenet_train
transform
=
create_transform
(
input_size
=
args
.
input_size
,
is_training
=
True
,
color_jitter
=
args
.
color_jitter
,
auto_augment
=
args
.
aa
,
interpolation
=
'bicubic'
,
re_prob
=
args
.
reprob
,
re_mode
=
args
.
remode
,
re_count
=
args
.
recount
,
mean
=
mean
,
std
=
std
,
)
return
transform
# eval transform
t
=
[]
if
args
.
input_size
<=
224
:
crop_pct
=
224
/
256
else
:
crop_pct
=
1.0
size
=
int
(
args
.
input_size
/
crop_pct
)
t
.
append
(
transforms
.
Resize
(
size
,
interpolation
=
PIL
.
Image
.
BICUBIC
),
# to maintain same ratio w.r.t. 224 images
)
t
.
append
(
transforms
.
CenterCrop
(
args
.
input_size
))
t
.
append
(
transforms
.
ToTensor
())
t
.
append
(
transforms
.
Normalize
(
mean
,
std
))
return
transforms
.
Compose
(
t
)
PyTorch/NLP/Vision_Transformer/util/lars.py
0 → 100644
View file @
ddd93c0e
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# LARS optimizer, implementation from MoCo v3:
# https://github.com/facebookresearch/moco-v3
# --------------------------------------------------------
import
torch
class
LARS
(
torch
.
optim
.
Optimizer
):
"""
LARS optimizer, no rate scaling or weight decay for parameters <= 1D.
"""
def
__init__
(
self
,
params
,
lr
=
0
,
weight_decay
=
0
,
momentum
=
0.9
,
trust_coefficient
=
0.001
):
defaults
=
dict
(
lr
=
lr
,
weight_decay
=
weight_decay
,
momentum
=
momentum
,
trust_coefficient
=
trust_coefficient
)
super
().
__init__
(
params
,
defaults
)
@
torch
.
no_grad
()
def
step
(
self
):
for
g
in
self
.
param_groups
:
for
p
in
g
[
'params'
]:
dp
=
p
.
grad
if
dp
is
None
:
continue
if
p
.
ndim
>
1
:
# if not normalization gamma/beta or bias
dp
=
dp
.
add
(
p
,
alpha
=
g
[
'weight_decay'
])
param_norm
=
torch
.
norm
(
p
)
update_norm
=
torch
.
norm
(
dp
)
one
=
torch
.
ones_like
(
param_norm
)
q
=
torch
.
where
(
param_norm
>
0.
,
torch
.
where
(
update_norm
>
0
,
(
g
[
'trust_coefficient'
]
*
param_norm
/
update_norm
),
one
),
one
)
dp
=
dp
.
mul
(
q
)
param_state
=
self
.
state
[
p
]
if
'mu'
not
in
param_state
:
param_state
[
'mu'
]
=
torch
.
zeros_like
(
p
)
mu
=
param_state
[
'mu'
]
mu
.
mul_
(
g
[
'momentum'
]).
add_
(
dp
)
p
.
add_
(
mu
,
alpha
=-
g
[
'lr'
])
\ No newline at end of file
PyTorch/NLP/Vision_Transformer/util/lr_decay.py
0 → 100644
View file @
ddd93c0e
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# ELECTRA https://github.com/google-research/electra
# BEiT: https://github.com/microsoft/unilm/tree/master/beit
# --------------------------------------------------------
import
json
def
param_groups_lrd
(
model
,
weight_decay
=
0.05
,
no_weight_decay_list
=
[],
layer_decay
=
.
75
):
"""
Parameter groups for layer-wise lr decay
Following BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L58
"""
param_group_names
=
{}
param_groups
=
{}
num_layers
=
len
(
model
.
blocks
)
+
1
layer_scales
=
list
(
layer_decay
**
(
num_layers
-
i
)
for
i
in
range
(
num_layers
+
1
))
for
n
,
p
in
model
.
named_parameters
():
if
not
p
.
requires_grad
:
continue
# no decay: all 1D parameters and model specific ones
if
p
.
ndim
==
1
or
n
in
no_weight_decay_list
:
g_decay
=
"no_decay"
this_decay
=
0.
else
:
g_decay
=
"decay"
this_decay
=
weight_decay
layer_id
=
get_layer_id_for_vit
(
n
,
num_layers
)
group_name
=
"layer_%d_%s"
%
(
layer_id
,
g_decay
)
if
group_name
not
in
param_group_names
:
this_scale
=
layer_scales
[
layer_id
]
param_group_names
[
group_name
]
=
{
"lr_scale"
:
this_scale
,
"weight_decay"
:
this_decay
,
"params"
:
[],
}
param_groups
[
group_name
]
=
{
"lr_scale"
:
this_scale
,
"weight_decay"
:
this_decay
,
"params"
:
[],
}
param_group_names
[
group_name
][
"params"
].
append
(
n
)
param_groups
[
group_name
][
"params"
].
append
(
p
)
# print("parameter groups: \n%s" % json.dumps(param_group_names, indent=2))
return
list
(
param_groups
.
values
())
def
get_layer_id_for_vit
(
name
,
num_layers
):
"""
Assign a parameter with its layer id
Following BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L33
"""
if
name
in
[
'cls_token'
,
'pos_embed'
]:
return
0
elif
name
.
startswith
(
'patch_embed'
):
return
0
elif
name
.
startswith
(
'blocks'
):
return
int
(
name
.
split
(
'.'
)[
1
])
+
1
else
:
return
num_layers
\ No newline at end of file
PyTorch/NLP/Vision_Transformer/util/lr_sched.py
0 → 100644
View file @
ddd93c0e
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import
math
def
adjust_learning_rate
(
optimizer
,
epoch
,
args
):
"""Decay the learning rate with half-cycle cosine after warmup"""
if
epoch
<
args
.
warmup_epochs
:
lr
=
args
.
lr
*
epoch
/
args
.
warmup_epochs
else
:
lr
=
args
.
min_lr
+
(
args
.
lr
-
args
.
min_lr
)
*
0.5
*
\
(
1.
+
math
.
cos
(
math
.
pi
*
(
epoch
-
args
.
warmup_epochs
)
/
(
args
.
epochs
-
args
.
warmup_epochs
)))
for
param_group
in
optimizer
.
param_groups
:
if
"lr_scale"
in
param_group
:
param_group
[
"lr"
]
=
lr
*
param_group
[
"lr_scale"
]
else
:
param_group
[
"lr"
]
=
lr
return
lr
PyTorch/NLP/Vision_Transformer/util/misc.py
0 → 100644
View file @
ddd93c0e
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# DeiT: https://github.com/facebookresearch/deit
# BEiT: https://github.com/microsoft/unilm/tree/master/beit
# --------------------------------------------------------
import
builtins
import
datetime
import
os
import
time
from
collections
import
defaultdict
,
deque
from
pathlib
import
Path
import
torch
import
torch.distributed
as
dist
from
torch._six
import
inf
class
SmoothedValue
(
object
):
"""Track a series of values and provide access to smoothed values over a
window or the global series average.
"""
def
__init__
(
self
,
window_size
=
20
,
fmt
=
None
):
if
fmt
is
None
:
fmt
=
"{median:.4f} ({global_avg:.4f})"
self
.
deque
=
deque
(
maxlen
=
window_size
)
self
.
total
=
0.0
self
.
count
=
0
self
.
fmt
=
fmt
def
update
(
self
,
value
,
n
=
1
):
self
.
deque
.
append
(
value
)
self
.
count
+=
n
self
.
total
+=
value
*
n
def
synchronize_between_processes
(
self
):
"""
Warning: does not synchronize the deque!
"""
if
not
is_dist_avail_and_initialized
():
return
t
=
torch
.
tensor
([
self
.
count
,
self
.
total
],
dtype
=
torch
.
float64
,
device
=
'cuda'
)
dist
.
barrier
()
dist
.
all_reduce
(
t
)
t
=
t
.
tolist
()
self
.
count
=
int
(
t
[
0
])
self
.
total
=
t
[
1
]
@
property
def
median
(
self
):
d
=
torch
.
tensor
(
list
(
self
.
deque
))
return
d
.
median
().
item
()
@
property
def
avg
(
self
):
d
=
torch
.
tensor
(
list
(
self
.
deque
),
dtype
=
torch
.
float32
)
return
d
.
mean
().
item
()
@
property
def
global_avg
(
self
):
return
self
.
total
/
self
.
count
@
property
def
max
(
self
):
return
max
(
self
.
deque
)
@
property
def
value
(
self
):
return
self
.
deque
[
-
1
]
def
__str__
(
self
):
return
self
.
fmt
.
format
(
median
=
self
.
median
,
avg
=
self
.
avg
,
global_avg
=
self
.
global_avg
,
max
=
self
.
max
,
value
=
self
.
value
)
class
MetricLogger
(
object
):
def
__init__
(
self
,
delimiter
=
"
\t
"
):
self
.
meters
=
defaultdict
(
SmoothedValue
)
self
.
delimiter
=
delimiter
def
update
(
self
,
**
kwargs
):
for
k
,
v
in
kwargs
.
items
():
if
v
is
None
:
continue
if
isinstance
(
v
,
torch
.
Tensor
):
v
=
v
.
item
()
assert
isinstance
(
v
,
(
float
,
int
))
self
.
meters
[
k
].
update
(
v
)
def
__getattr__
(
self
,
attr
):
if
attr
in
self
.
meters
:
return
self
.
meters
[
attr
]
if
attr
in
self
.
__dict__
:
return
self
.
__dict__
[
attr
]
raise
AttributeError
(
"'{}' object has no attribute '{}'"
.
format
(
type
(
self
).
__name__
,
attr
))
def
__str__
(
self
):
loss_str
=
[]
for
name
,
meter
in
self
.
meters
.
items
():
loss_str
.
append
(
"{}: {}"
.
format
(
name
,
str
(
meter
))
)
return
self
.
delimiter
.
join
(
loss_str
)
def
synchronize_between_processes
(
self
):
for
meter
in
self
.
meters
.
values
():
meter
.
synchronize_between_processes
()
def
add_meter
(
self
,
name
,
meter
):
self
.
meters
[
name
]
=
meter
def
log_every
(
self
,
iterable
,
print_freq
,
header
=
None
):
i
=
0
if
not
header
:
header
=
''
start_time
=
time
.
time
()
end
=
time
.
time
()
iter_time
=
SmoothedValue
(
fmt
=
'{avg:.4f}'
)
data_time
=
SmoothedValue
(
fmt
=
'{avg:.4f}'
)
space_fmt
=
':'
+
str
(
len
(
str
(
len
(
iterable
))))
+
'd'
log_msg
=
[
header
,
'[{0'
+
space_fmt
+
'}/{1}]'
,
'eta: {eta}'
,
'{meters}'
,
'time: {time}'
,
'data: {data}'
]
if
torch
.
cuda
.
is_available
():
log_msg
.
append
(
'max mem: {memory:.0f}'
)
log_msg
=
self
.
delimiter
.
join
(
log_msg
)
MB
=
1024.0
*
1024.0
for
obj
in
iterable
:
data_time
.
update
(
time
.
time
()
-
end
)
yield
obj
iter_time
.
update
(
time
.
time
()
-
end
)
if
i
%
print_freq
==
0
or
i
==
len
(
iterable
)
-
1
:
eta_seconds
=
iter_time
.
global_avg
*
(
len
(
iterable
)
-
i
)
eta_string
=
str
(
datetime
.
timedelta
(
seconds
=
int
(
eta_seconds
)))
if
torch
.
cuda
.
is_available
():
print
(
log_msg
.
format
(
i
,
len
(
iterable
),
eta
=
eta_string
,
meters
=
str
(
self
),
time
=
str
(
iter_time
),
data
=
str
(
data_time
),
memory
=
torch
.
cuda
.
max_memory_allocated
()
/
MB
))
else
:
print
(
log_msg
.
format
(
i
,
len
(
iterable
),
eta
=
eta_string
,
meters
=
str
(
self
),
time
=
str
(
iter_time
),
data
=
str
(
data_time
)))
i
+=
1
end
=
time
.
time
()
total_time
=
time
.
time
()
-
start_time
total_time_str
=
str
(
datetime
.
timedelta
(
seconds
=
int
(
total_time
)))
print
(
'{} Total time: {} ({:.4f} s / it)'
.
format
(
header
,
total_time_str
,
total_time
/
len
(
iterable
)))
def
setup_for_distributed
(
is_master
):
"""
This function disables printing when not in master process
"""
builtin_print
=
builtins
.
print
def
print
(
*
args
,
**
kwargs
):
force
=
kwargs
.
pop
(
'force'
,
False
)
force
=
force
or
(
get_world_size
()
>
8
)
if
is_master
or
force
:
now
=
datetime
.
datetime
.
now
().
time
()
builtin_print
(
'[{}] '
.
format
(
now
),
end
=
''
)
# print with time stamp
builtin_print
(
*
args
,
**
kwargs
)
builtins
.
print
=
print
def
is_dist_avail_and_initialized
():
if
not
dist
.
is_available
():
return
False
if
not
dist
.
is_initialized
():
return
False
return
True
def
get_world_size
():
if
not
is_dist_avail_and_initialized
():
return
1
return
dist
.
get_world_size
()
def
get_rank
():
if
not
is_dist_avail_and_initialized
():
return
0
return
dist
.
get_rank
()
def
is_main_process
():
return
get_rank
()
==
0
def
save_on_master
(
*
args
,
**
kwargs
):
if
is_main_process
():
torch
.
save
(
*
args
,
**
kwargs
)
def
init_distributed_mode
(
args
):
if
args
.
dist_on_itp
:
args
.
rank
=
int
(
os
.
environ
[
'OMPI_COMM_WORLD_RANK'
])
args
.
world_size
=
int
(
os
.
environ
[
'OMPI_COMM_WORLD_SIZE'
])
args
.
gpu
=
int
(
os
.
environ
[
'OMPI_COMM_WORLD_LOCAL_RANK'
])
args
.
dist_url
=
"tcp://%s:%s"
%
(
os
.
environ
[
'MASTER_ADDR'
],
os
.
environ
[
'MASTER_PORT'
])
os
.
environ
[
'LOCAL_RANK'
]
=
str
(
args
.
gpu
)
os
.
environ
[
'RANK'
]
=
str
(
args
.
rank
)
os
.
environ
[
'WORLD_SIZE'
]
=
str
(
args
.
world_size
)
# ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"]
elif
'RANK'
in
os
.
environ
and
'WORLD_SIZE'
in
os
.
environ
:
args
.
rank
=
int
(
os
.
environ
[
"RANK"
])
args
.
world_size
=
int
(
os
.
environ
[
'WORLD_SIZE'
])
args
.
gpu
=
int
(
os
.
environ
[
'LOCAL_RANK'
])
elif
'SLURM_PROCID'
in
os
.
environ
:
args
.
rank
=
int
(
os
.
environ
[
'SLURM_PROCID'
])
args
.
gpu
=
args
.
rank
%
torch
.
cuda
.
device_count
()
else
:
print
(
'Not using distributed mode'
)
setup_for_distributed
(
is_master
=
True
)
# hack
args
.
distributed
=
False
return
args
.
distributed
=
True
torch
.
cuda
.
set_device
(
args
.
gpu
)
args
.
dist_backend
=
'nccl'
print
(
'| distributed init (rank {}): {}, gpu {}'
.
format
(
args
.
rank
,
args
.
dist_url
,
args
.
gpu
),
flush
=
True
)
torch
.
distributed
.
init_process_group
(
backend
=
args
.
dist_backend
,
init_method
=
args
.
dist_url
,
world_size
=
args
.
world_size
,
rank
=
args
.
rank
)
torch
.
distributed
.
barrier
()
setup_for_distributed
(
args
.
rank
==
0
)
class
NativeScalerWithGradNormCount
:
state_dict_key
=
"amp_scaler"
def
__init__
(
self
):
self
.
_scaler
=
torch
.
cuda
.
amp
.
GradScaler
()
def
__call__
(
self
,
loss
,
optimizer
,
clip_grad
=
None
,
parameters
=
None
,
create_graph
=
False
,
update_grad
=
True
):
self
.
_scaler
.
scale
(
loss
).
backward
(
create_graph
=
create_graph
)
if
update_grad
:
if
clip_grad
is
not
None
:
assert
parameters
is
not
None
self
.
_scaler
.
unscale_
(
optimizer
)
# unscale the gradients of optimizer's assigned params in-place
norm
=
torch
.
nn
.
utils
.
clip_grad_norm_
(
parameters
,
clip_grad
)
else
:
self
.
_scaler
.
unscale_
(
optimizer
)
norm
=
get_grad_norm_
(
parameters
)
self
.
_scaler
.
step
(
optimizer
)
self
.
_scaler
.
update
()
else
:
norm
=
None
return
norm
def
state_dict
(
self
):
return
self
.
_scaler
.
state_dict
()
def
load_state_dict
(
self
,
state_dict
):
self
.
_scaler
.
load_state_dict
(
state_dict
)
def
get_grad_norm_
(
parameters
,
norm_type
:
float
=
2.0
)
->
torch
.
Tensor
:
if
isinstance
(
parameters
,
torch
.
Tensor
):
parameters
=
[
parameters
]
parameters
=
[
p
for
p
in
parameters
if
p
.
grad
is
not
None
]
norm_type
=
float
(
norm_type
)
if
len
(
parameters
)
==
0
:
return
torch
.
tensor
(
0.
)
device
=
parameters
[
0
].
grad
.
device
if
norm_type
==
inf
:
total_norm
=
max
(
p
.
grad
.
detach
().
abs
().
max
().
to
(
device
)
for
p
in
parameters
)
else
:
total_norm
=
torch
.
norm
(
torch
.
stack
([
torch
.
norm
(
p
.
grad
.
detach
(),
norm_type
).
to
(
device
)
for
p
in
parameters
]),
norm_type
)
return
total_norm
def
save_model
(
args
,
epoch
,
model
,
model_without_ddp
,
optimizer
,
loss_scaler
):
output_dir
=
Path
(
args
.
output_dir
)
epoch_name
=
str
(
epoch
)
if
loss_scaler
is
not
None
:
checkpoint_paths
=
[
output_dir
/
(
'checkpoint-%s.pth'
%
epoch_name
)]
for
checkpoint_path
in
checkpoint_paths
:
to_save
=
{
'model'
:
model_without_ddp
.
state_dict
(),
'optimizer'
:
optimizer
.
state_dict
(),
'epoch'
:
epoch
,
'scaler'
:
loss_scaler
.
state_dict
(),
'args'
:
args
,
}
save_on_master
(
to_save
,
checkpoint_path
)
else
:
client_state
=
{
'epoch'
:
epoch
}
model
.
save_checkpoint
(
save_dir
=
args
.
output_dir
,
tag
=
"checkpoint-%s"
%
epoch_name
,
client_state
=
client_state
)
def
load_model
(
args
,
model_without_ddp
,
optimizer
,
loss_scaler
):
if
args
.
resume
:
if
args
.
resume
.
startswith
(
'https'
):
checkpoint
=
torch
.
hub
.
load_state_dict_from_url
(
args
.
resume
,
map_location
=
'cpu'
,
check_hash
=
True
)
else
:
checkpoint
=
torch
.
load
(
args
.
resume
,
map_location
=
'cpu'
)
model_without_ddp
.
load_state_dict
(
checkpoint
[
'model'
])
print
(
"Resume checkpoint %s"
%
args
.
resume
)
if
'optimizer'
in
checkpoint
and
'epoch'
in
checkpoint
and
not
(
hasattr
(
args
,
'eval'
)
and
args
.
eval
):
optimizer
.
load_state_dict
(
checkpoint
[
'optimizer'
])
args
.
start_epoch
=
checkpoint
[
'epoch'
]
+
1
if
'scaler'
in
checkpoint
:
loss_scaler
.
load_state_dict
(
checkpoint
[
'scaler'
])
print
(
"With optim & sched!"
)
def
all_reduce_mean
(
x
):
world_size
=
get_world_size
()
if
world_size
>
1
:
x_reduce
=
torch
.
tensor
(
x
).
cuda
()
dist
.
all_reduce
(
x_reduce
)
x_reduce
/=
world_size
return
x_reduce
.
item
()
else
:
return
x
\ No newline at end of file
PyTorch/NLP/Vision_Transformer/util/pos_embed.py
0 → 100644
View file @
ddd93c0e
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# Position embedding utils
# --------------------------------------------------------
import
numpy
as
np
import
torch
# --------------------------------------------------------
# 2D sine-cosine position embedding
# References:
# Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
# MoCo v3: https://github.com/facebookresearch/moco-v3
# --------------------------------------------------------
def
get_2d_sincos_pos_embed
(
embed_dim
,
grid_size
,
cls_token
=
False
):
"""
grid_size: int of the grid height and width
return:
pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
"""
grid_h
=
np
.
arange
(
grid_size
,
dtype
=
np
.
float32
)
grid_w
=
np
.
arange
(
grid_size
,
dtype
=
np
.
float32
)
grid
=
np
.
meshgrid
(
grid_w
,
grid_h
)
# here w goes first
grid
=
np
.
stack
(
grid
,
axis
=
0
)
grid
=
grid
.
reshape
([
2
,
1
,
grid_size
,
grid_size
])
pos_embed
=
get_2d_sincos_pos_embed_from_grid
(
embed_dim
,
grid
)
if
cls_token
:
pos_embed
=
np
.
concatenate
([
np
.
zeros
([
1
,
embed_dim
]),
pos_embed
],
axis
=
0
)
return
pos_embed
def
get_2d_sincos_pos_embed_from_grid
(
embed_dim
,
grid
):
assert
embed_dim
%
2
==
0
# use half of dimensions to encode grid_h
emb_h
=
get_1d_sincos_pos_embed_from_grid
(
embed_dim
//
2
,
grid
[
0
])
# (H*W, D/2)
emb_w
=
get_1d_sincos_pos_embed_from_grid
(
embed_dim
//
2
,
grid
[
1
])
# (H*W, D/2)
emb
=
np
.
concatenate
([
emb_h
,
emb_w
],
axis
=
1
)
# (H*W, D)
return
emb
def
get_1d_sincos_pos_embed_from_grid
(
embed_dim
,
pos
):
"""
embed_dim: output dimension for each position
pos: a list of positions to be encoded: size (M,)
out: (M, D)
"""
assert
embed_dim
%
2
==
0
omega
=
np
.
arange
(
embed_dim
//
2
,
dtype
=
np
.
float
)
omega
/=
embed_dim
/
2.
omega
=
1.
/
10000
**
omega
# (D/2,)
pos
=
pos
.
reshape
(
-
1
)
# (M,)
out
=
np
.
einsum
(
'm,d->md'
,
pos
,
omega
)
# (M, D/2), outer product
emb_sin
=
np
.
sin
(
out
)
# (M, D/2)
emb_cos
=
np
.
cos
(
out
)
# (M, D/2)
emb
=
np
.
concatenate
([
emb_sin
,
emb_cos
],
axis
=
1
)
# (M, D)
return
emb
# --------------------------------------------------------
# Interpolate position embeddings for high-resolution
# References:
# DeiT: https://github.com/facebookresearch/deit
# --------------------------------------------------------
def
interpolate_pos_embed
(
model
,
checkpoint_model
):
if
'pos_embed'
in
checkpoint_model
:
pos_embed_checkpoint
=
checkpoint_model
[
'pos_embed'
]
embedding_size
=
pos_embed_checkpoint
.
shape
[
-
1
]
num_patches
=
model
.
patch_embed
.
num_patches
num_extra_tokens
=
model
.
pos_embed
.
shape
[
-
2
]
-
num_patches
# height (== width) for the checkpoint position embedding
orig_size
=
int
((
pos_embed_checkpoint
.
shape
[
-
2
]
-
num_extra_tokens
)
**
0.5
)
# height (== width) for the new position embedding
new_size
=
int
(
num_patches
**
0.5
)
# class_token and dist_token are kept unchanged
if
orig_size
!=
new_size
:
print
(
"Position interpolate from %dx%d to %dx%d"
%
(
orig_size
,
orig_size
,
new_size
,
new_size
))
extra_tokens
=
pos_embed_checkpoint
[:,
:
num_extra_tokens
]
# only the position tokens are interpolated
pos_tokens
=
pos_embed_checkpoint
[:,
num_extra_tokens
:]
pos_tokens
=
pos_tokens
.
reshape
(
-
1
,
orig_size
,
orig_size
,
embedding_size
).
permute
(
0
,
3
,
1
,
2
)
pos_tokens
=
torch
.
nn
.
functional
.
interpolate
(
pos_tokens
,
size
=
(
new_size
,
new_size
),
mode
=
'bicubic'
,
align_corners
=
False
)
pos_tokens
=
pos_tokens
.
permute
(
0
,
2
,
3
,
1
).
flatten
(
1
,
2
)
new_pos_embed
=
torch
.
cat
((
extra_tokens
,
pos_tokens
),
dim
=
1
)
checkpoint_model
[
'pos_embed'
]
=
new_pos_embed
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment