dcuai / dlexamples · Commits

Commit c056df78, authored Sep 30, 2022 by sunxx1

Merge branch 'hepj-test' into 'main'
Add ViT model code. See merge request dcutoolkit/deeplearing/dlexamples_new!39

Parents: 8ddf66c6, ddd93c0e
Changes 30
Showing 10 changed files with 1080 additions and 0 deletions (+1080 -0)
PyTorch/NLP/Vision_Transformer/submitit_finetune.py    +131 -0
PyTorch/NLP/Vision_Transformer/submitit_linprobe.py    +131 -0
PyTorch/NLP/Vision_Transformer/submitit_pretrain.py    +131 -0
PyTorch/NLP/Vision_Transformer/util/crop.py            +42 -0
PyTorch/NLP/Vision_Transformer/util/datasets.py        +65 -0
PyTorch/NLP/Vision_Transformer/util/lars.py            +47 -0
PyTorch/NLP/Vision_Transformer/util/lr_decay.py        +76 -0
PyTorch/NLP/Vision_Transformer/util/lr_sched.py        +21 -0
PyTorch/NLP/Vision_Transformer/util/misc.py            +340 -0
PyTorch/NLP/Vision_Transformer/util/pos_embed.py       +96 -0
PyTorch/NLP/Vision_Transformer/submitit_finetune.py  0 → 100644
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# --------------------------------------------------------
# A script to run multinode training with submitit.
# --------------------------------------------------------

import argparse
import os
import uuid
from pathlib import Path

import main_finetune as classification
import submitit


def parse_args():
    classification_parser = classification.get_args_parser()
    parser = argparse.ArgumentParser("Submitit for MAE finetune", parents=[classification_parser])
    parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node")
    parser.add_argument("--nodes", default=2, type=int, help="Number of nodes to request")
    parser.add_argument("--timeout", default=4320, type=int, help="Duration of the job")
    parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.")

    parser.add_argument("--partition", default="learnfair", type=str, help="Partition where to submit")
    parser.add_argument("--use_volta32", action='store_true', help="Request 32G V100 GPUs")
    parser.add_argument('--comment', default="", type=str, help="Comment to pass to scheduler")
    return parser.parse_args()


def get_shared_folder() -> Path:
    user = os.getenv("USER")
    if Path("/checkpoint/").is_dir():
        p = Path(f"/checkpoint/{user}/experiments")
        p.mkdir(exist_ok=True)
        return p
    raise RuntimeError("No shared folder available")


def get_init_file():
    # Init file must not exist, but it's parent dir must exist.
    os.makedirs(str(get_shared_folder()), exist_ok=True)
    init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init"
    if init_file.exists():
        os.remove(str(init_file))
    return init_file


class Trainer(object):
    def __init__(self, args):
        self.args = args

    def __call__(self):
        import main_finetune as classification

        self._setup_gpu_args()
        classification.main(self.args)

    def checkpoint(self):
        import os
        import submitit

        self.args.dist_url = get_init_file().as_uri()
        checkpoint_file = os.path.join(self.args.output_dir, "checkpoint.pth")
        if os.path.exists(checkpoint_file):
            self.args.resume = checkpoint_file
        print("Requeuing ", self.args)
        empty_trainer = type(self)(self.args)
        return submitit.helpers.DelayedSubmission(empty_trainer)

    def _setup_gpu_args(self):
        import submitit
        from pathlib import Path

        job_env = submitit.JobEnvironment()
        self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id)))
        self.args.log_dir = self.args.output_dir
        self.args.gpu = job_env.local_rank
        self.args.rank = job_env.global_rank
        self.args.world_size = job_env.num_tasks
        print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}")


def main():
    args = parse_args()
    if args.job_dir == "":
        args.job_dir = get_shared_folder() / "%j"

    # Note that the folder will depend on the job_id, to easily track experiments
    executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30)

    num_gpus_per_node = args.ngpus
    nodes = args.nodes
    timeout_min = args.timeout

    partition = args.partition
    kwargs = {}
    if args.use_volta32:
        kwargs['slurm_constraint'] = 'volta32gb'
    if args.comment:
        kwargs['slurm_comment'] = args.comment

    executor.update_parameters(
        mem_gb=40 * num_gpus_per_node,
        gpus_per_node=num_gpus_per_node,
        tasks_per_node=num_gpus_per_node,  # one task per GPU
        cpus_per_task=10,
        nodes=nodes,
        timeout_min=timeout_min,
        # Below are cluster dependent parameters
        slurm_partition=partition,
        slurm_signal_delay_s=120,
        **kwargs
    )

    executor.update_parameters(name="mae")

    args.dist_url = get_init_file().as_uri()
    args.output_dir = args.job_dir

    trainer = Trainer(args)
    job = executor.submit(trainer)

    # print("Submitted job_id:", job.job_id)
    print(job.job_id)


if __name__ == "__main__":
    main()
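This launcher, and the linprobe and pretrain launchers that follow, share one structure: parse the training script's own arguments plus a few Slurm ones, then hand a picklable Trainer callable to submitit; Trainer.checkpoint() returns a DelayedSubmission, so a preempted or timed-out job requeues itself and resumes from checkpoint.pth. A minimal sketch of that submission flow, assuming submitit is installed and a Slurm partition is reachable (the folder name, partition, and toy callable are hypothetical, not part of this merge request):

import submitit

def add(a, b):
    # Any picklable callable can be submitted; the Trainer class above is just
    # a callable that also knows how to checkpoint and requeue itself.
    return a + b

executor = submitit.AutoExecutor(folder="demo_logs")
executor.update_parameters(timeout_min=10, slurm_partition="learnfair")  # partition is cluster specific
job = executor.submit(add, 2, 3)
print(job.job_id)
# print(job.result())  # blocks until the Slurm job finishes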
PyTorch/NLP/Vision_Transformer/submitit_linprobe.py  0 → 100644
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# --------------------------------------------------------
# A script to run multinode training with submitit.
# --------------------------------------------------------

import argparse
import os
import uuid
from pathlib import Path

import main_linprobe as classification
import submitit


def parse_args():
    classification_parser = classification.get_args_parser()
    parser = argparse.ArgumentParser("Submitit for MAE linear probe", parents=[classification_parser])
    parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node")
    parser.add_argument("--nodes", default=2, type=int, help="Number of nodes to request")
    parser.add_argument("--timeout", default=4320, type=int, help="Duration of the job")
    parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.")

    parser.add_argument("--partition", default="learnfair", type=str, help="Partition where to submit")
    parser.add_argument("--use_volta32", action='store_true', help="Request 32G V100 GPUs")
    parser.add_argument('--comment', default="", type=str, help="Comment to pass to scheduler")
    return parser.parse_args()


def get_shared_folder() -> Path:
    user = os.getenv("USER")
    if Path("/checkpoint/").is_dir():
        p = Path(f"/checkpoint/{user}/experiments")
        p.mkdir(exist_ok=True)
        return p
    raise RuntimeError("No shared folder available")


def get_init_file():
    # Init file must not exist, but it's parent dir must exist.
    os.makedirs(str(get_shared_folder()), exist_ok=True)
    init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init"
    if init_file.exists():
        os.remove(str(init_file))
    return init_file


class Trainer(object):
    def __init__(self, args):
        self.args = args

    def __call__(self):
        import main_linprobe as classification

        self._setup_gpu_args()
        classification.main(self.args)

    def checkpoint(self):
        import os
        import submitit

        self.args.dist_url = get_init_file().as_uri()
        checkpoint_file = os.path.join(self.args.output_dir, "checkpoint.pth")
        if os.path.exists(checkpoint_file):
            self.args.resume = checkpoint_file
        print("Requeuing ", self.args)
        empty_trainer = type(self)(self.args)
        return submitit.helpers.DelayedSubmission(empty_trainer)

    def _setup_gpu_args(self):
        import submitit
        from pathlib import Path

        job_env = submitit.JobEnvironment()
        self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id)))
        self.args.log_dir = self.args.output_dir
        self.args.gpu = job_env.local_rank
        self.args.rank = job_env.global_rank
        self.args.world_size = job_env.num_tasks
        print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}")


def main():
    args = parse_args()
    if args.job_dir == "":
        args.job_dir = get_shared_folder() / "%j"

    # Note that the folder will depend on the job_id, to easily track experiments
    executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30)

    num_gpus_per_node = args.ngpus
    nodes = args.nodes
    timeout_min = args.timeout

    partition = args.partition
    kwargs = {}
    if args.use_volta32:
        kwargs['slurm_constraint'] = 'volta32gb'
    if args.comment:
        kwargs['slurm_comment'] = args.comment

    executor.update_parameters(
        mem_gb=40 * num_gpus_per_node,
        gpus_per_node=num_gpus_per_node,
        tasks_per_node=num_gpus_per_node,  # one task per GPU
        cpus_per_task=10,
        nodes=nodes,
        timeout_min=timeout_min,
        # Below are cluster dependent parameters
        slurm_partition=partition,
        slurm_signal_delay_s=120,
        **kwargs
    )

    executor.update_parameters(name="mae")

    args.dist_url = get_init_file().as_uri()
    args.output_dir = args.job_dir

    trainer = Trainer(args)
    job = executor.submit(trainer)

    # print("Submitted job_id:", job.job_id)
    print(job.job_id)


if __name__ == "__main__":
    main()
PyTorch/NLP/Vision_Transformer/submitit_pretrain.py  0 → 100644
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# --------------------------------------------------------
# A script to run multinode training with submitit.
# --------------------------------------------------------

import argparse
import os
import uuid
from pathlib import Path

import main_pretrain as trainer
import submitit


def parse_args():
    trainer_parser = trainer.get_args_parser()
    parser = argparse.ArgumentParser("Submitit for MAE pretrain", parents=[trainer_parser])
    parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node")
    parser.add_argument("--nodes", default=2, type=int, help="Number of nodes to request")
    parser.add_argument("--timeout", default=4320, type=int, help="Duration of the job")
    parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.")

    parser.add_argument("--partition", default="learnfair", type=str, help="Partition where to submit")
    parser.add_argument("--use_volta32", action='store_true', help="Request 32G V100 GPUs")
    parser.add_argument('--comment', default="", type=str, help="Comment to pass to scheduler")
    return parser.parse_args()


def get_shared_folder() -> Path:
    user = os.getenv("USER")
    if Path("/checkpoint/").is_dir():
        p = Path(f"/checkpoint/{user}/experiments")
        p.mkdir(exist_ok=True)
        return p
    raise RuntimeError("No shared folder available")


def get_init_file():
    # Init file must not exist, but it's parent dir must exist.
    os.makedirs(str(get_shared_folder()), exist_ok=True)
    init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init"
    if init_file.exists():
        os.remove(str(init_file))
    return init_file


class Trainer(object):
    def __init__(self, args):
        self.args = args

    def __call__(self):
        import main_pretrain as trainer

        self._setup_gpu_args()
        trainer.main(self.args)

    def checkpoint(self):
        import os
        import submitit

        self.args.dist_url = get_init_file().as_uri()
        checkpoint_file = os.path.join(self.args.output_dir, "checkpoint.pth")
        if os.path.exists(checkpoint_file):
            self.args.resume = checkpoint_file
        print("Requeuing ", self.args)
        empty_trainer = type(self)(self.args)
        return submitit.helpers.DelayedSubmission(empty_trainer)

    def _setup_gpu_args(self):
        import submitit
        from pathlib import Path

        job_env = submitit.JobEnvironment()
        self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id)))
        self.args.log_dir = self.args.output_dir
        self.args.gpu = job_env.local_rank
        self.args.rank = job_env.global_rank
        self.args.world_size = job_env.num_tasks
        print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}")


def main():
    args = parse_args()
    if args.job_dir == "":
        args.job_dir = get_shared_folder() / "%j"

    # Note that the folder will depend on the job_id, to easily track experiments
    executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30)

    num_gpus_per_node = args.ngpus
    nodes = args.nodes
    timeout_min = args.timeout

    partition = args.partition
    kwargs = {}
    if args.use_volta32:
        kwargs['slurm_constraint'] = 'volta32gb'
    if args.comment:
        kwargs['slurm_comment'] = args.comment

    executor.update_parameters(
        mem_gb=40 * num_gpus_per_node,
        gpus_per_node=num_gpus_per_node,
        tasks_per_node=num_gpus_per_node,  # one task per GPU
        cpus_per_task=10,
        nodes=nodes,
        timeout_min=timeout_min,  # max is 60 * 72
        # Below are cluster dependent parameters
        slurm_partition=partition,
        slurm_signal_delay_s=120,
        **kwargs
    )

    executor.update_parameters(name="mae")

    args.dist_url = get_init_file().as_uri()
    args.output_dir = args.job_dir

    trainer = Trainer(args)
    job = executor.submit(trainer)

    # print("Submitted job_id:", job.job_id)
    print(job.job_id)


if __name__ == "__main__":
    main()
PyTorch/NLP/Vision_Transformer/util/crop.py  0 → 100644
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import math

import torch

from torchvision import transforms
from torchvision.transforms import functional as F


class RandomResizedCrop(transforms.RandomResizedCrop):
    """
    RandomResizedCrop for matching TF/TPU implementation: no for-loop is used.
    This may lead to results different with torchvision's version.
    Following BYOL's TF code:
    https://github.com/deepmind/deepmind-research/blob/master/byol/utils/dataset.py#L206
    """
    @staticmethod
    def get_params(img, scale, ratio):
        width, height = F._get_image_size(img)
        area = height * width

        target_area = area * torch.empty(1).uniform_(scale[0], scale[1]).item()
        log_ratio = torch.log(torch.tensor(ratio))
        aspect_ratio = torch.exp(
            torch.empty(1).uniform_(log_ratio[0], log_ratio[1])
        ).item()

        w = int(round(math.sqrt(target_area * aspect_ratio)))
        h = int(round(math.sqrt(target_area / aspect_ratio)))

        w = min(w, width)
        h = min(h, height)

        i = torch.randint(0, height - h + 1, size=(1,)).item()
        j = torch.randint(0, width - w + 1, size=(1,)).item()

        return i, j, h, w
\ No newline at end of file
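The class only overrides get_params, so it drops in wherever torchvision's RandomResizedCrop would be used, for example in a pre-training augmentation pipeline. A usage sketch, assuming an older torchvision in which the private F._get_image_size used above still exists; the image size and crop settings here are placeholders:

from PIL import Image
from torchvision import transforms

from util.crop import RandomResizedCrop

transform_train = transforms.Compose([
    RandomResizedCrop(224, scale=(0.2, 1.0), interpolation=Image.BICUBIC),  # TF/TPU-style crop
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
])

img = Image.new("RGB", (500, 375))   # stand-in image
print(transform_train(img).shape)    # torch.Size([3, 224, 224])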
PyTorch/NLP/Vision_Transformer/util/datasets.py  0 → 100644
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# --------------------------------------------------------
# References:
# DeiT: https://github.com/facebookresearch/deit
# --------------------------------------------------------

import os
import PIL

from torchvision import datasets, transforms

from timm.data import create_transform
from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD


def build_dataset(is_train, args):
    transform = build_transform(is_train, args)

    root = os.path.join(args.data_path, 'train' if is_train else 'val')
    dataset = datasets.ImageFolder(root, transform=transform)

    print(dataset)

    return dataset


def build_transform(is_train, args):
    mean = IMAGENET_DEFAULT_MEAN
    std = IMAGENET_DEFAULT_STD
    # train transform
    if is_train:
        # this should always dispatch to transforms_imagenet_train
        transform = create_transform(
            input_size=args.input_size,
            is_training=True,
            color_jitter=args.color_jitter,
            auto_augment=args.aa,
            interpolation='bicubic',
            re_prob=args.reprob,
            re_mode=args.remode,
            re_count=args.recount,
            mean=mean,
            std=std,
        )
        return transform

    # eval transform
    t = []
    if args.input_size <= 224:
        crop_pct = 224 / 256
    else:
        crop_pct = 1.0
    size = int(args.input_size / crop_pct)
    t.append(
        transforms.Resize(size, interpolation=PIL.Image.BICUBIC),  # to maintain same ratio w.r.t. 224 images
    )
    t.append(transforms.CenterCrop(args.input_size))

    t.append(transforms.ToTensor())
    t.append(transforms.Normalize(mean, std))
    return transforms.Compose(t)
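build_dataset expects an ImageNet-style folder layout (data_path/train and data_path/val) and an argparse namespace carrying the timm augmentation fields read above. A sketch with placeholder values (the path and augmentation settings are hypothetical and would normally come from the training script's parser):

import argparse

from util.datasets import build_dataset, build_transform

args = argparse.Namespace(
    data_path='/path/to/imagenet',   # must contain train/ and val/ subfolders
    input_size=224, color_jitter=None, aa='rand-m9-mstd0.5-inc1',
    reprob=0.25, remode='pixel', recount=1,
)

print(build_transform(is_train=False, args=args))         # eval: resize -> center crop -> normalize
dataset_train = build_dataset(is_train=True, args=args)   # requires the folders to actually exist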
PyTorch/NLP/Vision_Transformer/util/lars.py  0 → 100644
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# --------------------------------------------------------
# LARS optimizer, implementation from MoCo v3:
# https://github.com/facebookresearch/moco-v3
# --------------------------------------------------------

import torch


class LARS(torch.optim.Optimizer):
    """
    LARS optimizer, no rate scaling or weight decay for parameters <= 1D.
    """
    def __init__(self, params, lr=0, weight_decay=0, momentum=0.9, trust_coefficient=0.001):
        defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, trust_coefficient=trust_coefficient)
        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self):
        for g in self.param_groups:
            for p in g['params']:
                dp = p.grad

                if dp is None:
                    continue

                if p.ndim > 1:  # if not normalization gamma/beta or bias
                    dp = dp.add(p, alpha=g['weight_decay'])
                    param_norm = torch.norm(p)
                    update_norm = torch.norm(dp)
                    one = torch.ones_like(param_norm)
                    q = torch.where(param_norm > 0.,
                                    torch.where(update_norm > 0,
                                                (g['trust_coefficient'] * param_norm / update_norm), one),
                                    one)
                    dp = dp.mul(q)

                param_state = self.state[p]
                if 'mu' not in param_state:
                    param_state['mu'] = torch.zeros_like(p)
                mu = param_state['mu']
                mu.mul_(g['momentum']).add_(dp)

                p.add_(mu, alpha=-g['lr'])
\ No newline at end of file
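The p.ndim > 1 check above means biases and normalization parameters skip both weight decay and the trust-ratio scaling, so only weight matrices get the full LARS update. A toy optimization step, with a hypothetical stand-in model and learning rate:

import torch

from util.lars import LARS

model = torch.nn.Linear(768, 1000)   # stand-in classifier head
optimizer = LARS(model.parameters(), lr=0.1, weight_decay=0.0)

x = torch.randn(4, 768)
y = torch.randint(0, 1000, (4,))
loss = torch.nn.functional.cross_entropy(model(x), y)
loss.backward()
optimizer.step()
optimizer.zero_grad()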
PyTorch/NLP/Vision_Transformer/util/lr_decay.py  0 → 100644
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# --------------------------------------------------------
# References:
# ELECTRA https://github.com/google-research/electra
# BEiT: https://github.com/microsoft/unilm/tree/master/beit
# --------------------------------------------------------

import json


def param_groups_lrd(model, weight_decay=0.05, no_weight_decay_list=[], layer_decay=.75):
    """
    Parameter groups for layer-wise lr decay
    Following BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L58
    """
    param_group_names = {}
    param_groups = {}

    num_layers = len(model.blocks) + 1

    layer_scales = list(layer_decay ** (num_layers - i) for i in range(num_layers + 1))

    for n, p in model.named_parameters():
        if not p.requires_grad:
            continue

        # no decay: all 1D parameters and model specific ones
        if p.ndim == 1 or n in no_weight_decay_list:
            g_decay = "no_decay"
            this_decay = 0.
        else:
            g_decay = "decay"
            this_decay = weight_decay

        layer_id = get_layer_id_for_vit(n, num_layers)
        group_name = "layer_%d_%s" % (layer_id, g_decay)

        if group_name not in param_group_names:
            this_scale = layer_scales[layer_id]

            param_group_names[group_name] = {
                "lr_scale": this_scale,
                "weight_decay": this_decay,
                "params": [],
            }
            param_groups[group_name] = {
                "lr_scale": this_scale,
                "weight_decay": this_decay,
                "params": [],
            }

        param_group_names[group_name]["params"].append(n)
        param_groups[group_name]["params"].append(p)

    # print("parameter groups: \n%s" % json.dumps(param_group_names, indent=2))

    return list(param_groups.values())


def get_layer_id_for_vit(name, num_layers):
    """
    Assign a parameter with its layer id
    Following BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L33
    """
    if name in ['cls_token', 'pos_embed']:
        return 0
    elif name.startswith('patch_embed'):
        return 0
    elif name.startswith('blocks'):
        return int(name.split('.')[1]) + 1
    else:
        return num_layers
\ No newline at end of file
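param_groups_lrd assumes a ViT whose transformer blocks live in model.blocks and whose token/patch embeddings are named cls_token, pos_embed, and patch_embed, so early layers get geometrically smaller learning rates. A sketch of building the decayed groups for such a model; the timm model name and hyperparameter values are placeholders, and model.no_weight_decay() is assumed to return the set of no-decay parameter names:

import timm
import torch

import util.lr_decay as lrd

model = timm.create_model('vit_base_patch16_224', num_classes=1000)
param_groups = lrd.param_groups_lrd(
    model,
    weight_decay=0.05,
    no_weight_decay_list=model.no_weight_decay(),  # typically {'pos_embed', 'cls_token', ...}
    layer_decay=0.75,
)
# The 'lr_scale' stored in each group is applied later by util.lr_sched.adjust_learning_rate.
optimizer = torch.optim.AdamW(param_groups, lr=1e-3)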
PyTorch/NLP/Vision_Transformer/util/lr_sched.py  0 → 100644
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import math


def adjust_learning_rate(optimizer, epoch, args):
    """Decay the learning rate with half-cycle cosine after warmup"""
    if epoch < args.warmup_epochs:
        lr = args.lr * epoch / args.warmup_epochs
    else:
        lr = args.min_lr + (args.lr - args.min_lr) * 0.5 * \
            (1. + math.cos(math.pi * (epoch - args.warmup_epochs) / (args.epochs - args.warmup_epochs)))
    for param_group in optimizer.param_groups:
        if "lr_scale" in param_group:
            param_group["lr"] = lr * param_group["lr_scale"]
        else:
            param_group["lr"] = lr
    return lr
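The schedule is a linear warmup to args.lr followed by a half-cycle cosine decay down to args.min_lr, and the per-group "lr_scale" lookup is what lets it compose with the layer-wise decay groups above. A quick numeric check with placeholder hyperparameters:

import argparse
import torch

from util.lr_sched import adjust_learning_rate

args = argparse.Namespace(lr=1e-3, min_lr=1e-6, warmup_epochs=5, epochs=100)
optimizer = torch.optim.SGD([torch.zeros(1, requires_grad=True)], lr=args.lr)

for epoch in (0, 5, 50, 99):
    lr = adjust_learning_rate(optimizer, epoch, args)
    print(epoch, f"{lr:.2e}")
# 0.00e+00 at epoch 0, 1.00e-03 at the end of warmup, then decaying toward min_lr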
PyTorch/NLP/Vision_Transformer/util/misc.py  0 → 100644
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# --------------------------------------------------------
# References:
# DeiT: https://github.com/facebookresearch/deit
# BEiT: https://github.com/microsoft/unilm/tree/master/beit
# --------------------------------------------------------

import builtins
import datetime
import os
import time
from collections import defaultdict, deque
from pathlib import Path

import torch
import torch.distributed as dist
from torch._six import inf


class SmoothedValue(object):
    """Track a series of values and provide access to smoothed values over a
    window or the global series average.
    """

    def __init__(self, window_size=20, fmt=None):
        if fmt is None:
            fmt = "{median:.4f} ({global_avg:.4f})"
        self.deque = deque(maxlen=window_size)
        self.total = 0.0
        self.count = 0
        self.fmt = fmt

    def update(self, value, n=1):
        self.deque.append(value)
        self.count += n
        self.total += value * n

    def synchronize_between_processes(self):
        """
        Warning: does not synchronize the deque!
        """
        if not is_dist_avail_and_initialized():
            return
        t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
        dist.barrier()
        dist.all_reduce(t)
        t = t.tolist()
        self.count = int(t[0])
        self.total = t[1]

    @property
    def median(self):
        d = torch.tensor(list(self.deque))
        return d.median().item()

    @property
    def avg(self):
        d = torch.tensor(list(self.deque), dtype=torch.float32)
        return d.mean().item()

    @property
    def global_avg(self):
        return self.total / self.count

    @property
    def max(self):
        return max(self.deque)

    @property
    def value(self):
        return self.deque[-1]

    def __str__(self):
        return self.fmt.format(
            median=self.median,
            avg=self.avg,
            global_avg=self.global_avg,
            max=self.max,
            value=self.value)


class MetricLogger(object):
    def __init__(self, delimiter="\t"):
        self.meters = defaultdict(SmoothedValue)
        self.delimiter = delimiter

    def update(self, **kwargs):
        for k, v in kwargs.items():
            if v is None:
                continue
            if isinstance(v, torch.Tensor):
                v = v.item()
            assert isinstance(v, (float, int))
            self.meters[k].update(v)

    def __getattr__(self, attr):
        if attr in self.meters:
            return self.meters[attr]
        if attr in self.__dict__:
            return self.__dict__[attr]
        raise AttributeError("'{}' object has no attribute '{}'".format(
            type(self).__name__, attr))

    def __str__(self):
        loss_str = []
        for name, meter in self.meters.items():
            loss_str.append(
                "{}: {}".format(name, str(meter))
            )
        return self.delimiter.join(loss_str)

    def synchronize_between_processes(self):
        for meter in self.meters.values():
            meter.synchronize_between_processes()

    def add_meter(self, name, meter):
        self.meters[name] = meter

    def log_every(self, iterable, print_freq, header=None):
        i = 0
        if not header:
            header = ''
        start_time = time.time()
        end = time.time()
        iter_time = SmoothedValue(fmt='{avg:.4f}')
        data_time = SmoothedValue(fmt='{avg:.4f}')
        space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
        log_msg = [
            header,
            '[{0' + space_fmt + '}/{1}]',
            'eta: {eta}',
            '{meters}',
            'time: {time}',
            'data: {data}'
        ]
        if torch.cuda.is_available():
            log_msg.append('max mem: {memory:.0f}')
        log_msg = self.delimiter.join(log_msg)
        MB = 1024.0 * 1024.0
        for obj in iterable:
            data_time.update(time.time() - end)
            yield obj
            iter_time.update(time.time() - end)
            if i % print_freq == 0 or i == len(iterable) - 1:
                eta_seconds = iter_time.global_avg * (len(iterable) - i)
                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                if torch.cuda.is_available():
                    print(log_msg.format(
                        i, len(iterable), eta=eta_string,
                        meters=str(self),
                        time=str(iter_time), data=str(data_time),
                        memory=torch.cuda.max_memory_allocated() / MB))
                else:
                    print(log_msg.format(
                        i, len(iterable), eta=eta_string,
                        meters=str(self),
                        time=str(iter_time), data=str(data_time)))
            i += 1
            end = time.time()
        total_time = time.time() - start_time
        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
        print('{} Total time: {} ({:.4f} s / it)'.format(
            header, total_time_str, total_time / len(iterable)))


def setup_for_distributed(is_master):
    """
    This function disables printing when not in master process
    """
    builtin_print = builtins.print

    def print(*args, **kwargs):
        force = kwargs.pop('force', False)
        force = force or (get_world_size() > 8)
        if is_master or force:
            now = datetime.datetime.now().time()
            builtin_print('[{}] '.format(now), end='')  # print with time stamp
            builtin_print(*args, **kwargs)

    builtins.print = print


def is_dist_avail_and_initialized():
    if not dist.is_available():
        return False
    if not dist.is_initialized():
        return False
    return True


def get_world_size():
    if not is_dist_avail_and_initialized():
        return 1
    return dist.get_world_size()


def get_rank():
    if not is_dist_avail_and_initialized():
        return 0
    return dist.get_rank()


def is_main_process():
    return get_rank() == 0


def save_on_master(*args, **kwargs):
    if is_main_process():
        torch.save(*args, **kwargs)


def init_distributed_mode(args):
    if args.dist_on_itp:
        args.rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
        args.world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
        args.gpu = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
        args.dist_url = "tcp://%s:%s" % (os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])
        os.environ['LOCAL_RANK'] = str(args.gpu)
        os.environ['RANK'] = str(args.rank)
        os.environ['WORLD_SIZE'] = str(args.world_size)
        # ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"]
    elif 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ['WORLD_SIZE'])
        args.gpu = int(os.environ['LOCAL_RANK'])
    elif 'SLURM_PROCID' in os.environ:
        args.rank = int(os.environ['SLURM_PROCID'])
        args.gpu = args.rank % torch.cuda.device_count()
    else:
        print('Not using distributed mode')
        setup_for_distributed(is_master=True)  # hack
        args.distributed = False
        return

    args.distributed = True

    torch.cuda.set_device(args.gpu)
    args.dist_backend = 'nccl'
    print('| distributed init (rank {}): {}, gpu {}'.format(
        args.rank, args.dist_url, args.gpu), flush=True)
    torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                         world_size=args.world_size, rank=args.rank)
    torch.distributed.barrier()
    setup_for_distributed(args.rank == 0)


class NativeScalerWithGradNormCount:
    state_dict_key = "amp_scaler"

    def __init__(self):
        self._scaler = torch.cuda.amp.GradScaler()

    def __call__(self, loss, optimizer, clip_grad=None, parameters=None, create_graph=False, update_grad=True):
        self._scaler.scale(loss).backward(create_graph=create_graph)
        if update_grad:
            if clip_grad is not None:
                assert parameters is not None
                self._scaler.unscale_(optimizer)  # unscale the gradients of optimizer's assigned params in-place
                norm = torch.nn.utils.clip_grad_norm_(parameters, clip_grad)
            else:
                self._scaler.unscale_(optimizer)
                norm = get_grad_norm_(parameters)
            self._scaler.step(optimizer)
            self._scaler.update()
        else:
            norm = None
        return norm

    def state_dict(self):
        return self._scaler.state_dict()

    def load_state_dict(self, state_dict):
        self._scaler.load_state_dict(state_dict)


def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor:
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = [p for p in parameters if p.grad is not None]
    norm_type = float(norm_type)
    if len(parameters) == 0:
        return torch.tensor(0.)
    device = parameters[0].grad.device
    if norm_type == inf:
        total_norm = max(p.grad.detach().abs().max().to(device) for p in parameters)
    else:
        total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type)
    return total_norm


def save_model(args, epoch, model, model_without_ddp, optimizer, loss_scaler):
    output_dir = Path(args.output_dir)
    epoch_name = str(epoch)
    if loss_scaler is not None:
        checkpoint_paths = [output_dir / ('checkpoint-%s.pth' % epoch_name)]
        for checkpoint_path in checkpoint_paths:
            to_save = {
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
                'scaler': loss_scaler.state_dict(),
                'args': args,
            }

            save_on_master(to_save, checkpoint_path)
    else:
        client_state = {'epoch': epoch}
        model.save_checkpoint(save_dir=args.output_dir, tag="checkpoint-%s" % epoch_name, client_state=client_state)


def load_model(args, model_without_ddp, optimizer, loss_scaler):
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        print("Resume checkpoint %s" % args.resume)
        if 'optimizer' in checkpoint and 'epoch' in checkpoint and not (hasattr(args, 'eval') and args.eval):
            optimizer.load_state_dict(checkpoint['optimizer'])
            args.start_epoch = checkpoint['epoch'] + 1
            if 'scaler' in checkpoint:
                loss_scaler.load_state_dict(checkpoint['scaler'])
            print("With optim & sched!")


def all_reduce_mean(x):
    world_size = get_world_size()
    if world_size > 1:
        x_reduce = torch.tensor(x).cuda()
        dist.all_reduce(x_reduce)
        x_reduce /= world_size
        return x_reduce.item()
    else:
        return x
\ No newline at end of file
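Most of this module is distributed-training plumbing; the pieces touched on every step are MetricLogger/SmoothedValue for smoothed console logging and NativeScalerWithGradNormCount for AMP scaling. A single-process logging sketch, assuming a PyTorch version that still provides torch._six (which the module imports); the loop body is a stand-in:

import torch

from util.misc import MetricLogger

metric_logger = MetricLogger(delimiter="  ")
for step in metric_logger.log_every(range(100), print_freq=20, header='Epoch: [0]'):
    loss = torch.rand(1).item()   # stand-in for a real training loss
    metric_logger.update(loss=loss)
print("Averaged stats:", metric_logger)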
PyTorch/NLP/Vision_Transformer/util/pos_embed.py  0 → 100644
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# --------------------------------------------------------
# Position embedding utils
# --------------------------------------------------------

import numpy as np

import torch


# --------------------------------------------------------
# 2D sine-cosine position embedding
# References:
# Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
# MoCo v3: https://github.com/facebookresearch/moco-v3
# --------------------------------------------------------
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
    """
    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    """
    grid_h = np.arange(grid_size, dtype=np.float32)
    grid_w = np.arange(grid_size, dtype=np.float32)
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_size, grid_size])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token:
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
    return pos_embed


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    assert embed_dim % 2 == 0

    # use half of dimensions to encode grid_h
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
    return emb


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float)
    omega /= embed_dim / 2.
    omega = 1. / 10000**omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb


# --------------------------------------------------------
# Interpolate position embeddings for high-resolution
# References:
# DeiT: https://github.com/facebookresearch/deit
# --------------------------------------------------------
def interpolate_pos_embed(model, checkpoint_model):
    if 'pos_embed' in checkpoint_model:
        pos_embed_checkpoint = checkpoint_model['pos_embed']
        embedding_size = pos_embed_checkpoint.shape[-1]
        num_patches = model.patch_embed.num_patches
        num_extra_tokens = model.pos_embed.shape[-2] - num_patches
        # height (== width) for the checkpoint position embedding
        orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
        # height (== width) for the new position embedding
        new_size = int(num_patches ** 0.5)
        # class_token and dist_token are kept unchanged
        if orig_size != new_size:
            print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
            extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
            # only the position tokens are interpolated
            pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
            pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
            pos_tokens = torch.nn.functional.interpolate(
                pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
            pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
            new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
            checkpoint_model['pos_embed'] = new_pos_embed
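get_2d_sincos_pos_embed builds the fixed sine-cosine table that MAE-style models typically copy into a non-learned pos_embed buffer. A sketch for a 224x224 input with 16x16 patches (a 14x14 grid) and a 768-dim embedding; since the module uses np.float, it assumes a NumPy version where that alias still exists, and the final copy into a model is only indicated as a comment with a hypothetical model object:

import torch

from util.pos_embed import get_2d_sincos_pos_embed

pos_embed = get_2d_sincos_pos_embed(embed_dim=768, grid_size=14, cls_token=True)
print(pos_embed.shape)  # (197, 768): 1 cls token + 14*14 patches
pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0)  # (1, 197, 768)
# model.pos_embed.data.copy_(pos_embed)  # hypothetical model with a matching pos_embed buffer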