OpenDAS / ColossalAI · Commits

Commit cd9c28e0 (unverified)
Authored Dec 16, 2021 by Frank Lee; committed via GitHub on Dec 16, 2021
Parent: 45355a62

    added CI for unit testing (#69)
Showing 20 of 68 changed files (page 1 of 4), with 131 additions and 92 deletions in the files shown:

.github/workflows/build.yml                                                    +40   -0
colossalai/trainer/hooks/_log_hook.py                                           +1   -1
tests/test_context/test_2d_init.py                                              +5   -4
tests/test_context/test_2p5d_init.py                                            +3   -1
tests/test_context/test_3d_init.py                                              +4   -1
tests/test_data/test_data_parallel_sampler.py                                   +3   -2
tests/test_data/test_deterministic_dataloader.py                                +3   -3
tests/test_data_pipeline_tensor_parallel/run_cifar10_vit2d_with_pipeline.py     +4   -1
tests/test_engine/test.sh                                                       +0   -4
tests/test_engine/test_engine/test_engine_apex_amp.py                          +12  -14
tests/test_engine/test_engine/test_engine_naive_amp.py                         +12  -13
tests/test_engine/test_engine/test_engine_no_amp.py                            +12  -13
tests/test_engine/test_engine/test_engine_torch_amp.py                         +12  -13
tests/test_layers/test.sh                                                       +0   -4
tests/test_layers/test_1d/checks_1d/__init__.py                                 +0   -0
tests/test_layers/test_1d/checks_1d/check_layer_1d.py                           +1   -2
tests/test_layers/test_1d/checks_1d/common.py                                   +0   -0
tests/test_layers/test_1d/test_1d.py                                           +18  -15
tests/test_layers/test_2d/checks_2d/__init__.py                                 +0   -0
tests/test_layers/test_2d/checks_2d/check_layer_2d.py                           +1   -1
.github/workflows/build.yml (new file; mode 0 → 100644)

name: Build

on:
  pull_request:
    types: [review_requested]
    branches:
      - "*"

jobs:
  build:
    name: Build and test Colossal-AI
    runs-on: [self-hosted, gpu]
    container:
      image: nvcr.io/nvidia/pytorch:21.07-py3
      options: --gpus all --rm --ipc=host -v /data/cifar-10:/data/cifar-10
    timeout-minutes: 1200
    if: github.event.pull_request.draft == false && github.base_ref == 'main' && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
    steps:
      - name: Setup Environment
        run: |
          export https_proxy=http://172.17.0.1:7890 http_proxy=http://172.17.0.1:7890 all_proxy=socks5://172.17.0.1:7890
      - name: Install dependencies
        run: |
          python3 -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
          python3 -m pip install -U pip setuptools wheel --user
          pip install pytest tensorboard deepspeed apex
      - uses: actions/checkout@v2
      - name: Install Colossal-AI
        run: |
          pip install -v --no-cache-dir --global-option="--cuda_ext" .
      - name: Unit Testing
        run: |
          pytest tests
        env:
          DATA: /data/cifar-10
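
The test suite this workflow invokes relies on two custom pytest markers, cpu and dist, which appear throughout the diffs below. As a minimal sketch of how such markers can be registered so that `pytest tests` runs without unknown-marker warnings, a hypothetical conftest.py (not part of this commit) might look like:

    # conftest.py — hypothetical sketch; not part of this diff.
    # Registers the custom markers used by the suite so pytest does not
    # warn about them (or reject them under --strict-markers).

    def pytest_configure(config):
        config.addinivalue_line("markers", "cpu: test can run without GPUs")
        config.addinivalue_line("markers", "dist: test spawns multiple distributed workers")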
colossalai/trainer/hooks/_log_hook.py

@@ -5,7 +5,6 @@ import os
 import os.path as osp
 import torch
-from torch.utils.tensorboard import SummaryWriter
 from typing import List
 from decimal import Decimal

 from colossalai.context import ParallelMode
@@ -100,6 +99,7 @@ class TensorboardHook(BaseHook):
                  priority: int = 10,
                  ) -> None:
         super().__init__(priority=priority)
+        from torch.utils.tensorboard import SummaryWriter

         # create log dir
         if not gpc.is_initialized(ParallelMode.GLOBAL) or gpc.get_global_rank() == 0:
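
These two hunks move the SummaryWriter import from module scope into TensorboardHook.__init__, so importing the hooks module no longer requires tensorboard to be installed. A minimal self-contained sketch of the deferred-import pattern (the real class's base class and full signature are abridged here):

    class TensorboardHook:
        def __init__(self, log_dir: str, priority: int = 10) -> None:
            self.priority = priority
            # Deferred import: tensorboard is needed only when a hook is
            # actually constructed, not when this module is imported.
            from torch.utils.tensorboard import SummaryWriter
            self.writer = SummaryWriter(log_dir)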
tests/test_context/test_2d_init.py

 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
+from functools import partial
+from pathlib import Path

 import pytest
+import torch
 import torch.multiprocessing as mp

 from colossalai import launch
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from functools import partial
-from pathlib import Path

 CONFIG_PATH = Path(__file__).parent.joinpath('configs/parallel_2d_init.py').absolute()
@@ -75,6 +75,7 @@ def init_2d(rank, world_size, backend, port, host):
     check_2d_parallel_rank(rank)
     check_pipeline_parallel_rank(rank)
     gpc.destroy()
+    torch.cuda.empty_cache()


 @pytest.mark.cpu
@@ -86,7 +87,7 @@ def test_2d_init():
     test_fn = partial(init_2d,
                       world_size=world_size,
                       backend='gloo',
-                      port='29500',
+                      port='29900',
                       host='localhost')
     mp.spawn(test_fn, nprocs=world_size)
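
The same three changes recur in the context tests that follow: `import torch` is added for the new `torch.cuda.empty_cache()` teardown, and each test's rendezvous port moves to a distinct 299xx value (29900–29920 across this commit), presumably so tests run in sequence on the same CI machine cannot collide on one address. The launch pattern itself binds fixed arguments with functools.partial and lets torch.multiprocessing inject the rank as the first positional argument. A minimal self-contained sketch of that pattern (hypothetical worker, no GPU required):

    from functools import partial

    import torch.multiprocessing as mp


    def worker(rank, world_size, port):
        # mp.spawn calls worker(i) for i in range(nprocs); the remaining
        # arguments arrive via the partial binding below.
        print(f'worker {rank} of {world_size} would rendezvous on port {port}')


    if __name__ == '__main__':
        world_size = 4
        test_fn = partial(worker, world_size=world_size, port='29900')
        mp.spawn(test_fn, nprocs=world_size)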
tests/test_context/test_2p5d_init.py

@@ -5,6 +5,7 @@ from functools import partial
 from pathlib import Path

 import pytest
+import torch
 import torch.multiprocessing as mp

 from colossalai.context.parallel_mode import ParallelMode
@@ -98,6 +99,7 @@ def init_2halfd(rank, world_size, backend, port, host):
     check_tensor_parallel_rank(rank)
     check_2p5d_parallel_rank(rank)
     gpc.destroy()
+    torch.cuda.empty_cache()


 @pytest.mark.cpu
@@ -109,7 +111,7 @@ def test_2halfd_init():
     test_fn = partial(init_2halfd,
                       world_size=world_size,
                       backend='gloo',
-                      port='29501',
+                      port='29901',
                       host='localhost')
     mp.spawn(test_fn, nprocs=world_size)
tests/test_context/test_3d_init.py

@@ -5,8 +5,10 @@ from functools import partial
 from pathlib import Path

 import pytest
+import torch
 import torch.multiprocessing as mp

 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch
@@ -90,6 +92,7 @@ def init_3d(rank, world_size, backend, port, host):
     check_data_parallel_rank(rank)
     check_pipeline_parallel_rank(rank)
     gpc.destroy()
+    torch.cuda.empty_cache()


 @pytest.mark.cpu
@@ -101,7 +104,7 @@ def test_3d_init():
     test_fn = partial(init_3d,
                       world_size=world_size,
                       backend='gloo',
-                      port='29502',
+                      port='29902',
                       host='localhost')
     mp.spawn(test_fn, nprocs=world_size)
tests/test_data/test_data_parallel_sampler.py

@@ -6,7 +6,7 @@ from functools import partial
 from pathlib import Path

 import pytest
-import torch.cuda
+import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
 from torch.utils.data import DataLoader
@@ -49,7 +49,7 @@ def run_data_sampler(rank, world_size):
                      rank=rank,
                      world_size=world_size,
                      backend='gloo',
-                     port='29503',
+                     port='29903',
                      host='localhost')
     colossalai.launch(**dist_args)
@@ -73,6 +73,7 @@ def run_data_sampler(rank, world_size):
     if gpc.get_local_rank(ParallelMode.DATA) != 0:
         assert not torch.equal(img,
                                img_to_compare), 'Same image was distributed across ranks but expected it to be different'
+    torch.cuda.empty_cache()


 @pytest.mark.cpu
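
Both data tests assert on torch.equal(img, img_to_compare) across ranks; the hunks shown do not include how img_to_compare is produced. A hypothetical sketch of one way such a cross-rank comparison can work (assumes an already-initialized gloo process group; not taken from this diff):

    import torch
    import torch.distributed as dist


    def batch_matches_rank0(img: torch.Tensor) -> bool:
        # Broadcast rank 0's batch to every rank, then compare it with the
        # locally sampled batch: with a data-parallel sampler the batches
        # should differ across ranks, with a plain dataloader they should match.
        img_to_compare = img.clone()
        dist.broadcast(img_to_compare, src=0)
        return torch.equal(img, img_to_compare)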
tests/test_data/test_deterministic_dataloader.py

@@ -6,7 +6,7 @@ from functools import partial
 from pathlib import Path

 import pytest
-import torch.cuda
+import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
 from torchvision import transforms
@@ -52,11 +52,10 @@ def run_data_sampler(rank, world_size):
                      rank=rank,
                      world_size=world_size,
                      backend='gloo',
-                     port='29499',
+                     port='29904',
                      host='localhost')
     colossalai.launch(**dist_args)
-    print('finished initialization')

     dataset_cfg = gpc.config.train_data.dataset
     dataloader_cfg = gpc.config.train_data.dataloader
@@ -88,6 +87,7 @@ def run_data_sampler(rank, world_size):
     # this should be false if data parallel sampler to given to the dataloader
     assert torch.equal(img,
                        img_to_compare), 'Same image was distributed across ranks and expected it to be the same'
+    torch.cuda.empty_cache()


 @pytest.mark.cpu
tests/test_data_pipeline_tensor_parallel/run_cifar10_vit2d_with_pipeline.py

+import pytest
 from pathlib import Path

 from colossalai.amp.amp_type import AMP_TYPE
 from colossalai.context.parallel_mode import ParallelMode
@@ -34,7 +35,9 @@ CONFIG = dict(
 )


-def main():
+@pytest.mark.dist
+@pytest.mark.skip("This test requires more than 8 GPUs, you should invoke this test script using test.sh provided manually")
+def test_hybrid_parallel():
     parser = colossalai.get_default_parser()
     args = parser.parse_args()
     colossalai.launch_from_slurm(config=CONFIG,
tests/test_engine/test.sh (deleted; mode 100644 → 0)

-#!/usr/bin/env sh
-
-test_file=$1
-python $test_file --world_size $SLURM_NPROCS --host $HOST --port 29500 --rank $SLURM_PROCID
\ No newline at end of file
tests/test_engine/test_engine/test_engine_apex_amp.py

@@ -8,6 +8,7 @@ import torch
 import os.path as osp
 from pathlib import Path
 import torch.nn as nn
+import torch.multiprocessing as mp
 from torchvision import transforms
 from torch.optim import Adam
@@ -15,9 +16,9 @@ from colossalai.core import global_context as gpc
 from colossalai.amp import AMP_TYPE
 from colossalai.logging import get_dist_logger
 from colossalai.utils import report_memory_usage, get_dataloader
-from colossalai.initialize import get_default_parser
 from torchvision.models import resnet18
 from torchvision.datasets import CIFAR10
+from functools import partial

 # Config
@@ -37,18 +38,15 @@ CONFIG = dict(
 )


-def run_no_pipeline():
-    parser = get_default_parser()
-    args = parser.parse_args()
+def run_engine(rank, world_size):
     # init dist env
     colossalai.launch(
         config=CONFIG,
-        rank=args.rank,
-        world_size=args.world_size,
-        host=args.host,
-        port=args.port,
-        backend=args.backend
+        rank=rank,
+        world_size=world_size,
+        host='localhost',
+        port=29910,
+        backend='nccl'
     )

     # build model
@@ -69,8 +67,6 @@ def run_no_pipeline():
     train_dataloader = get_dataloader(dataset=train_dataset,
                                       shuffle=True,
                                       batch_size=BATCH_SIZE,
-                                      num_workers=1,
-                                      pin_memory=True,
                                       drop_last=True)

     # build optimizer
@@ -102,12 +98,14 @@ def run_no_pipeline():
     gpc.destroy()
     logger.info('Test engine finished')
     report_memory_usage("After testing")
+    torch.cuda.empty_cache()


-@pytest.mark.skip("This test should be invoked using the test.sh provided")
 @pytest.mark.dist
 def test_engine():
-    run_no_pipeline()
+    world_size = 4
+    run_func = partial(run_engine, world_size=world_size)
+    mp.spawn(run_func, nprocs=world_size)


 if __name__ == '__main__':
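
The same refactor is applied to the three engine tests that follow, differing only in AMP configuration and rendezvous port (29911, 29912, 29913): the argparse/SLURM-driven entry point and its skip marker are removed, and a worker that receives rank and world_size directly is spawned by pytest itself. A condensed sketch of the new shape (config and training loop elided; not a verbatim copy of any one file):

    from functools import partial

    import torch.multiprocessing as mp

    import colossalai

    CONFIG = dict()  # parallel/AMP settings elided


    def run_engine(rank, world_size):
        # Fixed host/port/backend replace the old --host/--port/--backend flags.
        colossalai.launch(config=CONFIG,
                          rank=rank,
                          world_size=world_size,
                          host='localhost',
                          port=29910,
                          backend='nccl')
        # ... build model, dataloader and optimizer, then run a few steps ...


    def test_engine():
        world_size = 4
        run_func = partial(run_engine, world_size=world_size)
        mp.spawn(run_func, nprocs=world_size)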
tests/test_engine/test_engine/test_engine_naive_amp.py

@@ -5,6 +5,7 @@ import torch
 import os.path as osp
 from pathlib import Path
 import torch.nn as nn
+import torch.multiprocessing as mp
 from torchvision import transforms
 from torch.optim import Adam
@@ -15,6 +16,7 @@ from colossalai.utils import report_memory_usage, get_dataloader
 from colossalai.initialize import get_default_parser
 from torchvision.models import resnet18
 from torchvision.datasets import CIFAR10
+from functools import partial

 # Config
@@ -36,18 +38,15 @@ CONFIG = dict(
 )


-def run_no_pipeline():
-    parser = get_default_parser()
-    args = parser.parse_args()
+def run_engine(rank, world_size):
     # init dist env
     colossalai.launch(
         config=CONFIG,
-        rank=args.rank,
-        world_size=args.world_size,
-        host=args.host,
-        port=args.port,
-        backend=args.backend
+        rank=rank,
+        world_size=world_size,
+        host='localhost',
+        port=29911,
+        backend='nccl'
     )

     # build model
@@ -68,8 +67,6 @@ def run_no_pipeline():
     train_dataloader = get_dataloader(dataset=train_dataset,
                                       shuffle=True,
                                       batch_size=BATCH_SIZE,
-                                      num_workers=1,
-                                      pin_memory=True,
                                       drop_last=True)

     # build optimizer
@@ -101,12 +98,14 @@ def run_no_pipeline():
     gpc.destroy()
     logger.info('Test engine finished')
     report_memory_usage("After testing")
+    torch.cuda.empty_cache()


-@pytest.mark.skip("This test should be invoked using the test.sh provided")
 @pytest.mark.dist
 def test_engine():
-    run_no_pipeline()
+    world_size = 4
+    run_func = partial(run_engine, world_size=world_size)
+    mp.spawn(run_func, nprocs=world_size)


 if __name__ == '__main__':
tests/test_engine/test_engine/test_engine_no_amp.py

@@ -5,6 +5,7 @@ import torch
 import os.path as osp
 from pathlib import Path
 import torch.nn as nn
+import torch.multiprocessing as mp
 from torchvision import transforms
 from torch.optim import Adam
@@ -15,6 +16,7 @@ from colossalai.utils import report_memory_usage, get_dataloader
 from colossalai.initialize import get_default_parser
 from torchvision.models import resnet18
 from torchvision.datasets import CIFAR10
+from functools import partial

 # Config
@@ -33,18 +35,15 @@ CONFIG = dict(
 )


-def run_no_pipeline():
-    parser = get_default_parser()
-    args = parser.parse_args()
+def run_engine(rank, world_size):
     # init dist env
     colossalai.launch(
         config=CONFIG,
-        rank=args.rank,
-        world_size=args.world_size,
-        host=args.host,
-        port=args.port,
-        backend=args.backend
+        rank=rank,
+        world_size=world_size,
+        host='localhost',
+        port=29912,
+        backend='nccl'
     )

     # build model
@@ -65,8 +64,6 @@ def run_no_pipeline():
     train_dataloader = get_dataloader(dataset=train_dataset,
                                       shuffle=True,
                                       batch_size=BATCH_SIZE,
-                                      num_workers=1,
-                                      pin_memory=True,
                                       drop_last=True)

     # build optimizer
@@ -98,12 +95,14 @@ def run_no_pipeline():
     gpc.destroy()
     logger.info('Test engine finished')
     report_memory_usage("After testing")
+    torch.cuda.empty_cache()


-@pytest.mark.skip("This test should be invoked using the test.sh provided")
 @pytest.mark.dist
 def test_engine():
-    run_no_pipeline()
+    world_size = 4
+    run_func = partial(run_engine, world_size=world_size)
+    mp.spawn(run_func, nprocs=world_size)


 if __name__ == '__main__':
tests/test_engine/test_engine/test_engine_torch_amp.py

@@ -5,6 +5,7 @@ import torch
 import os.path as osp
 from pathlib import Path
 import torch.nn as nn
+import torch.multiprocessing as mp
 from torchvision import transforms
 from torch.optim import Adam
@@ -15,6 +16,7 @@ from colossalai.utils import report_memory_usage, get_dataloader
 from colossalai.initialize import get_default_parser
 from torchvision.models import resnet18
 from torchvision.datasets import CIFAR10
+from functools import partial

 # Config
@@ -34,18 +36,15 @@ CONFIG = dict(
 )


-def run_no_pipeline():
-    parser = get_default_parser()
-    args = parser.parse_args()
+def run_engine(rank, world_size):
     # init dist env
     colossalai.launch(
         config=CONFIG,
-        rank=args.rank,
-        world_size=args.world_size,
-        host=args.host,
-        port=args.port,
-        backend=args.backend
+        rank=rank,
+        world_size=world_size,
+        host='localhost',
+        port=29913,
+        backend='nccl'
    )

     # build model
@@ -66,8 +65,6 @@ def run_no_pipeline():
     train_dataloader = get_dataloader(dataset=train_dataset,
                                       shuffle=True,
                                       batch_size=BATCH_SIZE,
-                                      num_workers=1,
-                                      pin_memory=True,
                                       drop_last=True)

     # build optimizer
@@ -99,12 +96,14 @@ def run_no_pipeline():
     gpc.destroy()
     logger.info('Test engine finished')
     report_memory_usage("After testing")
+    torch.cuda.empty_cache()


-@pytest.mark.skip("This test should be invoked using the test.sh provided")
 @pytest.mark.dist
 def test_engine():
-    run_no_pipeline()
+    world_size = 4
+    run_func = partial(run_engine, world_size=world_size)
+    mp.spawn(run_func, nprocs=world_size)


 if __name__ == '__main__':
tests/test_layers/test.sh (deleted; mode 100644 → 0)

-#!/usr/bin/env sh
-
-test_file=$1
-python $test_file --rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500
\ No newline at end of file
tests/test_layers/test_1d/checks_1d/__init__.py (new empty file; mode 0 → 100644)
tests/test_layers/test_1d/test_layer.py → tests/test_layers/test_1d/checks_1d/check_layer_1d.py

-from tests.test_layers.test_3d.common import IMG_SIZE
 import torch
 import torch.distributed as dist
 from torch.nn import Parameter
@@ -7,7 +6,7 @@ from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.nn import Linear1D_Col, Linear1D_Row, TransformerMLP1D, TransformerSelfAttention1D, ViTMLP1D, ViTSelfAttention1D, ViTPatchEmbedding1D, ViTHead1D, ViTTokenFuser1D
 from colossalai.utils import get_current_device, print_rank_0
-from common import HIDDEN_SIZE, DEPTH, BATCH_SIZE, SEQ_LENGTH, NUM_CLASSES, check_equal, IMG_SIZE
+from .common import HIDDEN_SIZE, DEPTH, BATCH_SIZE, SEQ_LENGTH, NUM_CLASSES, check_equal, IMG_SIZE


 def check_linear_col():
tests/test_layers/test_1d/common.py → tests/test_layers/test_1d/checks_1d/common.py (file moved)
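
The new empty __init__.py makes checks_1d an importable package, which is what lets the absolute import `from common import ...` become the relative `from .common import ...` above, and lets test_1d.py (next diff) pull the checks in with `from checks_1d.check_layer_1d import *`. A sketch of the resulting layout, as a comment:

    # Layout of tests/test_layers/test_1d/ after this commit:
    #
    #   test_1d.py                  # from checks_1d.check_layer_1d import *
    #   checks_1d/
    #       __init__.py             # empty; marks checks_1d as a package
    #       check_layer_1d.py       # from .common import HIDDEN_SIZE, ...
    #       common.py               # shared sizes and the check_equal helper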
tests/test_layers/test_1d/test_1d.py

@@ -2,10 +2,13 @@
 # -*- encoding: utf-8 -*-

 import pytest
+import torch
+import torch.multiprocessing as mp

 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch, get_default_parser
-from test_layer import *
+from functools import partial
+from checks_1d.check_layer_1d import *

 CONFIG = dict(
     parallel=dict(
@@ -18,8 +21,14 @@ CONFIG = dict(
 )


-def check_layer():
+def check_layer(rank, world_size):
+    # print_rank_0('start check_linear_col')
+    launch(config=CONFIG,
+           rank=rank,
+           world_size=world_size,
+           host='localhost',
+           port=29920,
+           backend='nccl')
     check_linear_col()
     check_linear_row()
     check_attention()
@@ -28,21 +37,15 @@ def check_layer():
     check_embed()
     check_head()
+    gpc.destroy()
+    torch.cuda.empty_cache()


 @pytest.mark.dist
-@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
 def test_1d():
-    parser = get_default_parser()
-    args = parser.parse_args()
-    launch(config=CONFIG,
-           rank=args.rank,
-           world_size=args.world_size,
-           host=args.host,
-           port=args.port,
-           backend=args.backend)
-    check_layer()
-    gpc.destroy()
+    world_size = 2
+    run_func = partial(check_layer, world_size=world_size)
+    mp.spawn(run_func, nprocs=world_size)


 if __name__ == '__main__':
tests/test_layers/test_2d/checks_2d/__init__.py (new empty file; mode 0 → 100644)
tests/test_layers/test_2d/test_layer.py → tests/test_layers/test_2d/checks_2d/check_layer_2d.py

@@ -5,7 +5,7 @@ from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.nn import Linear2D, LayerNorm2D, TransformerSelfAttention2D, TransformerMLP2D, TransformerLayer2D
 from colossalai.utils import get_current_device, print_rank_0
-from common import HIDDEN_SIZE, DEPTH, BATCH_SIZE, SEQ_LENGTH, check_equal
+from .common import HIDDEN_SIZE, DEPTH, BATCH_SIZE, SEQ_LENGTH, check_equal


 def check_linear():
[End of page 1 of 4 — the remaining 48 changed files are not shown here.]