OpenDAS / ColossalAI · Commits

Commit 404ecbdc, authored Oct 28, 2021 by zbian
Parent commit: 2ebaefc5

    Migrated project

Showing 9 changed files with 863 additions and 0 deletions (+863 −0)
tests/test_utils/test_activation_checkpointing.py            +60  −0
tests/test_zero_data_parallel/config.py                      +91  −0
tests/test_zero_data_parallel/test_zero.py                   +153 −0
tests/test_zero_data_parallel/test_zero.sh                   +4   −0
tests/test_zero_tensor_parallel/configs/vit_2d_zero1.py      +159 −0
tests/test_zero_tensor_parallel/configs/vit_2d_zero2.py      +149 −0
tests/test_zero_tensor_parallel/configs/vit_2d_zero3.py      +149 −0
tests/test_zero_tensor_parallel/test.sh                      +4   −0
tests/test_zero_tensor_parallel/test_vit_2d/test_vit_2d.py   +94  −0
tests/test_utils/test_activation_checkpointing.py (new file, mode 100644)

#!/usr/bin/env python
# -*- encoding: utf-8 -*-

import pytest
import torch
import torch.nn.functional as F
from torch.utils.checkpoint import checkpoint

from colossalai.context.parallel_mode import ParallelMode
from colossalai.context.random import add_seed, seed, set_mode
# NOTE: this import shadows torch.utils.checkpoint's `checkpoint` above;
# the test therefore exercises colossalai's checkpoint implementation.
from colossalai.utils import checkpoint


def forward(x, weight):
    out = torch.matmul(x, weight)
    with seed(ParallelMode.DATA):
        out_ = F.dropout(out, p=0.4, training=True)
    return out_


@pytest.mark.gpu
def test_activation_checkpointing():
    add_seed(ParallelMode.GLOBAL, 1024)
    set_mode(ParallelMode.GLOBAL)
    global_cuda_rng_state = torch.cuda.get_rng_state()
    add_seed(ParallelMode.DATA, 1026)
    set_mode(ParallelMode.DATA)
    data_parallel_cuda_rng_state = torch.cuda.get_rng_state()
    set_mode(ParallelMode.GLOBAL)

    # normal
    data = torch.rand(2, 2, requires_grad=True).cuda()
    data.retain_grad()
    weight = torch.rand(2, 4, requires_grad=True).cuda()

    data_ = data.clone().detach()
    data_.requires_grad = True
    data_.retain_grad()
    weight_ = weight.clone().detach()
    weight_.requires_grad = True

    out = forward(data, weight)
    loss = out.sum()
    loss.backward()

    # checkpoint
    set_mode(ParallelMode.GLOBAL)
    torch.cuda.set_rng_state(global_cuda_rng_state)
    set_mode(ParallelMode.DATA)
    torch.cuda.set_rng_state(data_parallel_cuda_rng_state)
    set_mode(ParallelMode.GLOBAL)
    out = checkpoint(forward, data_, weight_)
    loss = out.sum()
    loss.backward()

    assert torch.all(data.grad == data_.grad), 'Gradient of the input does not match'


if __name__ == '__main__':
    test_activation_checkpointing()
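The test above checks that the checkpointed recomputation replays the same dropout mask: it snapshots the CUDA RNG state of each parallel mode, restores both snapshots before the checkpointed pass, and expects identical input gradients. The following is a minimal sketch of the per-mode RNG bookkeeping that add_seed/set_mode/seed imply; the names mirror the colossalai.context.random API, but the bodies are illustrative, not the library's actual implementation.

from contextlib import contextmanager

import torch

_seed_states = {}      # parallel mode -> saved CUDA RNG state
_current_mode = None


def add_seed(mode, seed_value):
    # Initialise and stash a CUDA RNG state for this mode without
    # disturbing the currently active state.
    original = torch.cuda.get_rng_state()
    torch.cuda.manual_seed(seed_value)
    _seed_states[mode] = torch.cuda.get_rng_state()
    torch.cuda.set_rng_state(original)


def set_mode(mode):
    # Save the active mode's state, then make `mode`'s state current.
    global _current_mode
    if _current_mode is not None:
        _seed_states[_current_mode] = torch.cuda.get_rng_state()
    _current_mode = mode
    torch.cuda.set_rng_state(_seed_states[mode])


@contextmanager
def seed(mode):
    # Temporarily compute under `mode`'s RNG state (e.g. for dropout),
    # then switch back to the previous mode.
    previous = _current_mode
    set_mode(mode)
    try:
        yield
    finally:
        set_mode(previous)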
tests/test_zero_data_parallel/config.py (new file, mode 100644)

#!/usr/bin/env python
# -*- encoding: utf-8 -*-

import os
from pathlib import Path

BATCH_SIZE = 128
IMG_SIZE = 224
NUM_CLS = 1000

# resnet 50 (bottleneck blocks with layers [3, 4, 6, 3])
model = dict(
    type='VanillaResNet',
    block_type='ResNetBottleneck',
    layers=[3, 4, 6, 3],
    num_cls=NUM_CLS
)

train_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        transform_pipeline=[
            dict(type='RandomResizedCrop', size=IMG_SIZE),
            dict(type='RandomHorizontalFlip'),
            dict(type='ToTensor'),
            dict(type='Normalize', mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
        ]
    ),
    dataloader=dict(
        batch_size=64,
        pin_memory=True,
        num_workers=4,
        sampler=dict(
            type='DataParallelSampler',
            shuffle=True,
        )
    )
)

test_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        train=False,
        transform_pipeline=[
            dict(type='Resize', size=(IMG_SIZE, IMG_SIZE)),
            dict(type='ToTensor'),
            dict(type='Normalize', mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=4,
    )
)

dist_initializer = [
    dict(type='DataParallelInitializer'),
]

parallelization = dict(
    pipeline=1,
    tensor=1,
    sequence=-1
)

optimizer = dict(
    type='Adam',
    lr=0.01
)

loss = dict(
    type='CrossEntropyLoss'
)

trainer = dict(
    max_epochs=5,
    max_iters=1000
)

amp = dict(
    fp16=None,
)

level = 2

parallel = dict(
    pipeline=dict(size=1),
    tensor=dict(size=1, mode=None)
)
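Configs like this one are plain Python modules whose dicts name a registered class in their 'type' key, with the remaining keys becoming constructor arguments. A hypothetical minimal builder in that style, to show the convention only; ColossalAI's real build_model/build_dataset/build_loss helpers are more elaborate:

# Hypothetical registry-style builder; illustrates the 'type'-key convention.
_REGISTRY = {}


def register(cls):
    _REGISTRY[cls.__name__] = cls
    return cls


def build_from_cfg(cfg: dict):
    cfg = dict(cfg)                  # copy so the config module is not mutated
    cls = _REGISTRY[cfg.pop('type')]
    return cls(**cfg)                # e.g. dict(type='Adam', lr=0.01) -> Adam(lr=0.01)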
tests/test_zero_data_parallel/test_zero.py (new file, mode 100644)

#!/usr/bin/env python
# -*- encoding: utf-8 -*-

import os.path as osp

import pytest
import torch
from torch.utils.data import DataLoader

import colossalai
from colossalai.builder import build_dataset, build_loss, build_data_sampler, build_model
from colossalai.core import global_context
from colossalai.engine.gradient_handler import DataParallelGradientHandler
from colossalai.nn.optimizer import ZeroRedundancyOptimizer_Level_1, ZeroRedundancyOptimizer_Level_3, \
    ZeroRedundancyOptimizer_Level_2
from colossalai.utils import print_rank_0

DIR_PATH = osp.dirname(osp.abspath(__file__))
CONFIG_PATH = osp.join(DIR_PATH, 'config.py')


def run_dist():
    colossalai.init_dist(CONFIG_PATH)

    # build resnet model
    model = build_model(global_context.config.model)
    model.build_from_cfg()
    model = model.cuda()

    level = global_context.config.level
    if level > 1:
        model = model.half()

    # test init cuda memory
    _ = torch.rand(1).cuda()
    torch.cuda.synchronize()
    max_alloc = torch.cuda.max_memory_allocated()
    max_reserved = torch.cuda.max_memory_reserved()
    print(f'before run: max_allocation = {max_alloc}, max_reserved = {max_reserved}')

    # build dataloader
    train_dataset = build_dataset(global_context.config.train_data.dataset)
    sampler_cfg = global_context.config.train_data.dataloader.pop('sampler', None)
    if sampler_cfg is None:
        train_dataloader = DataLoader(dataset=train_dataset,
                                      **global_context.config.train_data.dataloader)
    else:
        sampler = build_data_sampler(sampler_cfg, train_dataset)
        train_dataloader = DataLoader(dataset=train_dataset, sampler=sampler,
                                      **global_context.config.train_data.dataloader)

    test_dataset = build_dataset(global_context.config.test_data.dataset)
    test_dataloader = DataLoader(dataset=test_dataset,
                                 **global_context.config.test_data.dataloader)

    # build optimizer and loss
    # optimizer = build_optimizer(global_context.config.optimizer, model)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    if level == 1:
        zero_optim = ZeroRedundancyOptimizer_Level_1(init_optimizer=optimizer, verbose=False)
    elif level == 2:
        zero_optim = ZeroRedundancyOptimizer_Level_2(init_optimizer=optimizer,
                                                     cpu_offload=True, verbose=False)
    elif level == 3:
        zero_optim = ZeroRedundancyOptimizer_Level_3(
            init_optimizer=optimizer,
            module=model,
            verbose=False,
            offload_optimizer_config=dict(
                device='cpu',
                pin_memory=True,
                buffer_count=5,
                fast_init=False
            ),
            offload_param_config=dict(
                device='cpu',
                pin_memory=True,
                buffer_count=5,
                buffer_size=1e8,
                max_in_cpu=1e9
            )
        )

    loss_fn = build_loss(global_context.config.loss)
    gradient_handler = DataParallelGradientHandler(model, zero_optim)

    # train
    for epoch in range(100):
        model.train()

        # train
        avg_train_loss = 0
        train_iter = 0
        for idx, (data, label) in enumerate(train_dataloader):
            # model = model.half()
            data = data[0].cuda()
            label = label[0].cuda()

            if level > 1:
                data = data.half()

            output = model(data)
            loss = loss_fn(output[0], label)

            if level > 1:
                zero_optim.backward(loss)
                zero_optim.overlapping_partition_gradients_reduce_epilogue()
            else:
                loss.backward()
                gradient_handler.handle_gradient()

            zero_optim.step()
            zero_optim.zero_grad()

            avg_train_loss += loss.detach().cpu().numpy()
            train_iter += 1

        print_rank_0(f'epoch: {epoch}, train loss: {avg_train_loss / train_iter}')

        if epoch % 2 == 0:
            model.eval()
            avg_eval_loss = 0
            correct = 0
            total = 0
            eval_iters = 0

            for idx, (data, label) in enumerate(test_dataloader):
                with torch.no_grad():
                    data = data[0].cuda()
                    label = label[0].cuda()
                    if level > 1:
                        data = data.half()
                    output = model(data)
                    loss = loss_fn(output[0], label)
                    avg_eval_loss += loss.detach().cpu().numpy()
                    preds = torch.argmax(output[0], dim=1)
                    total += data.size(0)
                    correct += sum(preds == label)
                    eval_iters += 1

            print_rank_0(f'epoch: {epoch}, eval loss: {avg_eval_loss / eval_iters}, '
                         f'acc: {correct / total}')


@pytest.mark.skip("This test should be invoked manually using the script provided")
@pytest.mark.dist
def test_zero():
    run_dist()


if __name__ == '__main__':
    test_zero()
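For level 1, the script falls back to a plain loss.backward() followed by gradient_handler.handle_gradient(). Conceptually, a data-parallel gradient handler averages each parameter's gradient across ranks after the local backward pass; a simplified sketch of that step (ColossalAI's DataParallelGradientHandler adds bucketing and process-group management on top):

import torch.distributed as dist


def handle_gradient(model):
    # Illustrative only: average gradients across all data-parallel ranks.
    world_size = dist.get_world_size()
    for param in model.parameters():
        if param.grad is not None:
            dist.all_reduce(param.grad.data)   # default op is SUM
            param.grad.data /= world_size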
tests/test_zero_data_parallel/test_zero.sh (new file, mode 100644)

#!/bin/bash

test_file="test_zero.py"

python $test_file --local_rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500
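The script assumes it runs under SLURM, one task per process, with HOST pointing at the rendezvous node. A hypothetical launch from inside an allocation (exact node counts and flags depend on your cluster):

# Hypothetical: 4 processes on one node; SLURM sets SLURM_PROCID/SLURM_NPROCS per task.
HOST=$(hostname) srun --ntasks=4 bash test_zero.sh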
tests/test_zero_tensor_parallel/configs/vit_2d_zero1.py (new file, mode 100644)

import os
from pathlib import Path

import torch

BATCH_SIZE = 512
IMG_SIZE = 32
PATCH_SIZE = 4
DIM = 512
NUM_ATTENTION_HEADS = 8
SUMMA_DIM = 2
NUM_CLASSES = 10
DEPTH = 6

train_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        transform_pipeline=[
            dict(type='RandomCrop', size=IMG_SIZE, padding=4),
            dict(type='RandomHorizontalFlip'),
            dict(type='ToTensor'),
            dict(type='Normalize', mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=4,
        shuffle=True
    )
)

test_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        train=False,
        transform_pipeline=[
            dict(type='Resize', size=IMG_SIZE),
            dict(type='ToTensor'),
            dict(type='Normalize', mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=4,
        shuffle=True
    )
)

# NOTE: the second `optimizer` assignment below overrides this one at import
# time, so the ZeroRedundancyOptimizer entry here is effectively dead.
optimizer = dict(
    type='ZeroRedundancyOptimizer',
    optimizer_class=torch.optim.Adam,
    lr=0.001,
    weight_decay=0
)

optimizer = dict(
    type='Adam',
    lr=0.001,
    weight_decay=0
)

loss = dict(
    type='CrossEntropyLoss2D',
)

model = dict(
    type='VisionTransformerFromConfig',
    tensor_splitting_cfg=dict(
        type='ViTInputSplitter2D',
    ),
    embedding_cfg=dict(
        type='ViTPatchEmbedding2D',
        img_size=IMG_SIZE,
        patch_size=PATCH_SIZE,
        embed_dim=DIM,
    ),
    token_fusion_cfg=dict(
        type='ViTTokenFuser2D',
        img_size=IMG_SIZE,
        patch_size=PATCH_SIZE,
        embed_dim=DIM,
        drop_rate=0.1
    ),
    norm_cfg=dict(
        type='LayerNorm2D',
        normalized_shape=DIM,
        eps=1e-6,
    ),
    block_cfg=dict(
        type='ViTBlock',
        attention_cfg=dict(
            type='ViTSelfAttention2D',
            hidden_size=DIM,
            num_attention_heads=NUM_ATTENTION_HEADS,
            attention_dropout_prob=0.,
            hidden_dropout_prob=0.1,
        ),
        droppath_cfg=dict(
            type='VanillaViTDropPath',
        ),
        mlp_cfg=dict(
            type='ViTMLP2D',
            in_features=DIM,
            dropout_prob=0.1,
            mlp_ratio=1
        ),
        norm_cfg=dict(
            type='LayerNorm2D',
            normalized_shape=DIM,
            eps=1e-6,
        ),
    ),
    head_cfg=dict(
        type='ViTHead2D',
        hidden_size=DIM,
        num_classes=NUM_CLASSES,
    ),
    embed_dim=DIM,
    depth=DEPTH,
    drop_path_rate=0.,
)

parallel = dict(
    pipeline=dict(size=1),
    tensor=dict(size=4, mode='2d'),
)

from colossalai.engine import AMP_TYPE

fp16 = dict(
    mode=AMP_TYPE.PARALLEL,
    initial_scale=2 ** 4
)

#
# fp16 = dict(
#     mode=None,
# )

# both level 2 and 3 work
# zero = dict(
#     type='ZeroRedundancyOptimizer_Level_1',
# )

lr_scheduler = dict(
    type='LinearWarmupLR',
    warmup_epochs=5
)

num_epochs = 60
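Here parallel.tensor = dict(size=4, mode='2d') arranges the 4 tensor-parallel ranks as a 2×2 SUMMA grid, consistent with SUMMA_DIM = 2 above. An illustrative mapping from rank to grid coordinates (not ColossalAI's internals):

import math


def grid_coords(rank: int, tensor_parallel_size: int):
    # Illustrative: '2d' mode needs a square rank count, summa_dim ** 2 == size.
    summa_dim = int(math.sqrt(tensor_parallel_size))
    assert summa_dim * summa_dim == tensor_parallel_size
    return rank // summa_dim, rank % summa_dim   # (row, col) in the process grid


# ranks 0..3 -> (0, 0), (0, 1), (1, 0), (1, 1)
print([grid_coords(r, 4) for r in range(4)])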
tests/test_zero_tensor_parallel/configs/vit_2d_zero2.py (new file, mode 100644)

import os
from pathlib import Path

BATCH_SIZE = 512
IMG_SIZE = 32
PATCH_SIZE = 4
DIM = 512
NUM_ATTENTION_HEADS = 8
SUMMA_DIM = 2
NUM_CLASSES = 10
DEPTH = 6

train_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        transform_pipeline=[
            dict(type='RandomCrop', size=IMG_SIZE, padding=4),
            dict(type='RandomHorizontalFlip'),
            dict(type='ToTensor'),
            dict(type='Normalize', mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=4,
        shuffle=True
    )
)

test_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        train=False,
        transform_pipeline=[
            dict(type='Resize', size=IMG_SIZE),
            dict(type='ToTensor'),
            dict(type='Normalize', mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=4,
        shuffle=True
    )
)

optimizer = dict(
    type='Adam',
    lr=0.001,
    weight_decay=0
)

loss = dict(
    type='CrossEntropyLoss2D',
)

model = dict(
    type='VisionTransformerFromConfig',
    tensor_splitting_cfg=dict(
        type='ViTInputSplitter2D',
    ),
    embedding_cfg=dict(
        type='ViTPatchEmbedding2D',
        img_size=IMG_SIZE,
        patch_size=PATCH_SIZE,
        embed_dim=DIM,
    ),
    token_fusion_cfg=dict(
        type='ViTTokenFuser2D',
        img_size=IMG_SIZE,
        patch_size=PATCH_SIZE,
        embed_dim=DIM,
        drop_rate=0.1
    ),
    norm_cfg=dict(
        type='LayerNorm2D',
        normalized_shape=DIM,
        eps=1e-6,
    ),
    block_cfg=dict(
        type='ViTBlock',
        attention_cfg=dict(
            type='ViTSelfAttention2D',
            hidden_size=DIM,
            num_attention_heads=NUM_ATTENTION_HEADS,
            attention_dropout_prob=0.,
            hidden_dropout_prob=0.1,
        ),
        droppath_cfg=dict(
            type='VanillaViTDropPath',
        ),
        mlp_cfg=dict(
            type='ViTMLP2D',
            in_features=DIM,
            dropout_prob=0.1,
            mlp_ratio=1
        ),
        norm_cfg=dict(
            type='LayerNorm2D',
            normalized_shape=DIM,
            eps=1e-6,
        ),
    ),
    head_cfg=dict(
        type='ViTHead2D',
        hidden_size=DIM,
        num_classes=NUM_CLASSES,
    ),
    embed_dim=DIM,
    depth=DEPTH,
    drop_path_rate=0.,
)

parallel = dict(
    pipeline=dict(size=1),
    tensor=dict(size=4, mode='2d'),
)

# from colossalai.engine import AMP_TYPE
#
# fp16 = dict(
#     mode=AMP_TYPE.PARALLEL,
#     initial_scale=2 ** 4
# )

fp16 = dict(
    mode=None,
)

# both level 2 and 3 work
zero = dict(
    type='ZeroRedundancyOptimizer_Level_2'
)

lr_scheduler = dict(
    type='LinearWarmupLR',
    warmup_epochs=5
)

num_epochs = 60
tests/test_zero_tensor_parallel/configs/vit_2d_zero3.py (new file, mode 100644)

import os
from pathlib import Path

BATCH_SIZE = 512
IMG_SIZE = 32
PATCH_SIZE = 4
DIM = 512
NUM_ATTENTION_HEADS = 8
SUMMA_DIM = 2
NUM_CLASSES = 10
DEPTH = 6

train_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        transform_pipeline=[
            dict(type='RandomCrop', size=IMG_SIZE, padding=4),
            dict(type='RandomHorizontalFlip'),
            dict(type='ToTensor'),
            dict(type='Normalize', mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=4,
        shuffle=True
    )
)

test_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        train=False,
        transform_pipeline=[
            dict(type='Resize', size=IMG_SIZE),
            dict(type='ToTensor'),
            dict(type='Normalize', mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=4,
        shuffle=True
    )
)

optimizer = dict(
    type='Adam',
    lr=0.001,
    weight_decay=0
)

loss = dict(
    type='CrossEntropyLoss2D',
)

model = dict(
    type='VisionTransformerFromConfig',
    tensor_splitting_cfg=dict(
        type='ViTInputSplitter2D',
    ),
    embedding_cfg=dict(
        type='ViTPatchEmbedding2D',
        img_size=IMG_SIZE,
        patch_size=PATCH_SIZE,
        embed_dim=DIM,
    ),
    token_fusion_cfg=dict(
        type='ViTTokenFuser2D',
        img_size=IMG_SIZE,
        patch_size=PATCH_SIZE,
        embed_dim=DIM,
        drop_rate=0.1
    ),
    norm_cfg=dict(
        type='LayerNorm2D',
        normalized_shape=DIM,
        eps=1e-6,
    ),
    block_cfg=dict(
        type='ViTBlock',
        attention_cfg=dict(
            type='ViTSelfAttention2D',
            hidden_size=DIM,
            num_attention_heads=NUM_ATTENTION_HEADS,
            attention_dropout_prob=0.,
            hidden_dropout_prob=0.1,
        ),
        droppath_cfg=dict(
            type='VanillaViTDropPath',
        ),
        mlp_cfg=dict(
            type='ViTMLP2D',
            in_features=DIM,
            dropout_prob=0.1,
            mlp_ratio=1
        ),
        norm_cfg=dict(
            type='LayerNorm2D',
            normalized_shape=DIM,
            eps=1e-6,
        ),
    ),
    head_cfg=dict(
        type='ViTHead2D',
        hidden_size=DIM,
        num_classes=NUM_CLASSES,
    ),
    embed_dim=DIM,
    depth=DEPTH,
    drop_path_rate=0.,
)

parallel = dict(
    pipeline=dict(size=1),
    tensor=dict(size=4, mode='2d'),
)

# from colossalai.engine import AMP_TYPE
# fp16 = dict(
#     mode=AMP_TYPE.PARALLEL,
#     initial_scale=2 ** 4
# )

fp16 = dict(
    mode=None,
)

# both level 2 and 3 work
zero = dict(
    type='ZeroRedundancyOptimizer_Level_3'
)

lr_scheduler = dict(
    type='LinearWarmupLR',
    warmup_epochs=5
)

num_epochs = 60
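The three vit_2d_zero*.py configs share the model, data, and parallel settings; they differ only in how mixed precision and ZeRO are switched on. A quick side-by-side, with values copied from the files above:

# Effective per-config settings, for comparison only.
ZERO_CONFIGS = {
    'vit_2d_zero1.py': dict(fp16='AMP_TYPE.PARALLEL (initial_scale=2**4)', zero=None),
    'vit_2d_zero2.py': dict(fp16=None, zero='ZeroRedundancyOptimizer_Level_2'),
    'vit_2d_zero3.py': dict(fp16=None, zero='ZeroRedundancyOptimizer_Level_3'),
}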
tests/test_zero_tensor_parallel/test.sh (new file, mode 100644)

#!/usr/bin/env sh

test_file=$1

python $test_file --local_rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500
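test_vit_2d.py selects its config through the LEVEL environment variable, so a hypothetical level-2 launch with 4 processes (matching tensor-parallel size 4) could look like this; the exact srun flags depend on your cluster:

# Hypothetical: run from tests/test_zero_tensor_parallel inside a SLURM allocation.
LEVEL=2 HOST=$(hostname) srun --ntasks=4 sh test.sh test_vit_2d/test_vit_2d.py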
tests/test_zero_tensor_parallel/test_vit_2d/test_vit_2d.py (new file, mode 100644)

#!/usr/bin/env python
# -*- encoding: utf-8 -*-

import os
from pathlib import Path

import pytest
import torch.autograd

import colossalai
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.engine import Engine
from colossalai.logging import get_global_dist_logger
from colossalai.nn.layer._parallel_utilities import _gather

level = os.environ['LEVEL']
CONFIG_PATH = Path(__file__).parent.parent.joinpath(f'configs/vit_2d_zero{level}.py')


def eval(engine):
    engine.eval()
    accumulated_loss = 0
    correct_sum = 0
    total_sum = 0

    for i in range(engine.schedule.num_steps):
        output, label, loss = engine.step()
        accumulated_loss += loss.detach().cpu().numpy()

        output = _gather(
            output[0],
            ParallelMode.PARALLEL_2D_ROW,
            1
        )
        output = _gather(
            output,
            ParallelMode.PARALLEL_2D_COL,
            0,
        )
        output = torch.argmax(output, dim=-1)
        correct = torch.sum(label[0] == output)
        correct_sum += correct
        total_sum += label[0].size(0)
    avg_loss = accumulated_loss / engine.schedule.num_steps
    return correct_sum, total_sum, avg_loss


def train(engine):
    engine.train()
    accumulated_loss = 0

    for i in range(engine.schedule.num_steps):
        output, label, loss = engine.step()
        accumulated_loss += loss.detach().cpu().numpy()
    avg_loss = accumulated_loss / engine.schedule.num_steps
    return avg_loss


@pytest.mark.dist
@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
def test_2d_parallel_vision_transformer():
    # init dist
    model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize(
        CONFIG_PATH)
    logger = get_global_dist_logger()

    engine = Engine(model=model,
                    train_dataloader=train_dataloader,
                    test_dataloader=test_dataloader,
                    criterion=criterion,
                    optimizer=optimizer,
                    lr_scheduler=lr_scheduler,
                    schedule=schedule)

    # for param in model.parameters():
    #     if isinstance(param, torch.HalfTensor):
    #         print(param.shape)

    logger.info('start training')
    for epoch in range(gpc.config.num_epochs):
        train_loss = train(engine)
        logger.info(f'epoch {epoch} - train loss: {train_loss}')

        if epoch % 2 == 0:
            correct_sum, total_sum, eval_loss = eval(engine)
            logger.info(f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, '
                        f'correct: {correct_sum}, acc: {correct_sum / total_sum}')


if __name__ == '__main__':
    test_2d_parallel_vision_transformer()
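The eval function reassembles the 2D-partitioned classifier output by gathering first along the row group, then along the column group. A forward-only sketch of what such a gather does (ColossalAI's _gather is autograd-aware; this shows the data movement only):

import torch
import torch.distributed as dist


def gather_along_dim(shard: torch.Tensor, dim: int, group=None) -> torch.Tensor:
    # Illustrative: collect every rank's shard in the group and stitch
    # them together along `dim`.
    world_size = dist.get_world_size(group)
    shards = [torch.empty_like(shard) for _ in range(world_size)]
    dist.all_gather(shards, shard.contiguous(), group=group)
    return torch.cat(shards, dim=dim)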