OpenDAS / ColossalAI / Commits

Commit 65ee6dcc (unverified)
Authored Jun 08, 2022 by Frank Lee; committed via GitHub on Jun 08, 2022
[test] ignore 8 gpu test (#1080)

* [test] ignore 8 gpu test
* polish code
* polish workflow
* polish workflow
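The recurring change across the test files below is an unconditional pytest skip marker placed on tests whose world_size is 8, so CI runners with fewer GPUs no longer attempt them. A minimal sketch of the pattern, using the project's custom dist marker; the test name and body are hypothetical stand-ins:

import pytest

# Unconditional skip keeps the 8-GPU test out of every CI run; the body
# below is a placeholder, not a test from this commit.
@pytest.mark.dist
@pytest.mark.skip("This test requires 8 GPUs to execute")
def test_needs_eight_gpus():
    world_size = 8
    ...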
parent 0653c63e

Showing 11 changed files with 15 additions and 8 deletions (+15 -8)
.github/workflows/build.yml                                                        +2 -2
tests/components_to_test/gpt.py                                                    +5 -3
tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py   +1 -0
tests/test_layers/test_3d/test_3d.py                                               +1 -0
tests/test_tensor/test_model.py                                                    +0 -1
tests/test_utils/test_checkpoint/test_checkpoint_1d.py                             +1 -0
tests/test_utils/test_checkpoint/test_checkpoint_2d.py                             +1 -0
tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py                           +1 -0
tests/test_utils/test_checkpoint/test_checkpoint_3d.py                             +1 -0
tests/test_utils/test_memory.py                                                    +1 -1
tests/test_zero/test_tensor_utils.py                                               +1 -1
.github/workflows/build.yml

@@ -15,7 +15,7 @@ jobs:
     runs-on: [self-hosted, gpu]
     container:
       image: hpcaitech/pytorch-cuda:1.10.1-11.3.0
-      options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
+      options: --shm-size=2gb --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
     timeout-minutes: 40
     steps:
       - uses: actions/checkout@v2
@@ -25,10 +25,10 @@ jobs:
       run: |
         [ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
         pip install -r requirements/requirements.txt
-        pip install -r requirements/requirements-test.txt
         pip install -v -e .
         cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
         cp /__w/ColossalAI/ColossalAI/*.so /github/home/cuda_ext_cache/
+        pip install -r requirements/requirements-test.txt
     - name: Unit Testing
       run: |
         PYTHONPATH=$PWD pytest tests
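Two things change here: the container gains --shm-size=2gb, and the test requirements are now installed after the editable install and the extension-cache copy. The shared-memory bump matters because PyTorch DataLoader workers and NCCL exchange tensors through /dev/shm, and Docker's default of 64 MB can trigger bus errors in multi-GPU runs. A quick sanity check one could run inside the container; the helper name is hypothetical:

import os

def shm_size_gb(path="/dev/shm"):
    # Hypothetical helper: report the size of the shared-memory mount.
    # Docker caps /dev/shm at 64 MB unless --shm-size is passed.
    stats = os.statvfs(path)
    return stats.f_frsize * stats.f_blocks / 1024 ** 3

if __name__ == "__main__":
    print(f"/dev/shm: {shm_size_gb():.2f} GB")  # expect ~2.00 with --shm-size=2gb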
tests/components_to_test/gpt.py

@@ -7,9 +7,9 @@ from colossalai.utils.cuda import get_current_device

 class DummyDataLoader(DummyDataGenerator):
-    vocab_size = 50304
+    vocab_size = 128
     batch_size = 4
-    seq_len = 1024
+    seq_len = 64

     def generate(self):
         input_ids = torch.randint(0,
@@ -47,6 +47,8 @@ class GPTLMModel(nn.Module):
         # Only return lm_logits
         return self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=not self.checkpoint)[0]

+def gpt2_micro(checkpoint=True):
+    return GPTLMModel(checkpoint=checkpoint, hidden_size=32, num_layers=2, num_attention_heads=4, max_seq_len=64, vocab_size=128)

 def gpt2_s(checkpoint=True):
     return GPTLMModel(checkpoint=checkpoint)
@@ -76,4 +78,4 @@ def get_training_components():
     testloader = DummyDataLoader()
     criterion = GPTLMLoss()
-    return gpt2_s, trainloader, testloader, torch.optim.Adam, criterion
+    return gpt2_micro, trainloader, testloader, torch.optim.Adam, criterion
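Shrinking DummyDataLoader (vocab 50304 → 128, seq_len 1024 → 64) and returning gpt2_micro instead of gpt2_s is what lets the remaining GPT tests fit on fewer GPUs. A rough, hedged parameter-count comparison; the gpt2_s defaults are assumed to match GPT-2 small, since they are not shown in this diff:

def approx_gpt_params(hidden_size, num_layers, vocab_size, max_seq_len):
    # Rough estimate per transformer block: ~4*h^2 attention weights plus
    # ~8*h^2 MLP weights = 12*h^2; embeddings add (vocab + positions) * h.
    # Biases and layernorms are ignored.
    per_block = 12 * hidden_size ** 2
    embeddings = (vocab_size + max_seq_len) * hidden_size
    return num_layers * per_block + embeddings

print(approx_gpt_params(32, 2, 128, 64))        # gpt2_micro: ~3.1e4 parameters
print(approx_gpt_params(768, 12, 50304, 1024))  # gpt2_s, assuming GPT-2 small defaults: ~1.2e8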
tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py

@@ -83,6 +83,7 @@ def run_trainer(rank, world_size, port):

 @pytest.mark.dist
+@pytest.mark.skip("This test requires 8 GPUs to execute")
 @rerun_if_address_is_in_use()
 def test_hybrid_parallel():
     world_size = 8
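The marker above disables the test unconditionally, even on an 8-GPU host. A conditional variant, sketched below with pytest's skipif marker, would skip only when fewer than 8 GPUs are visible; this is an alternative shown for illustration, not what the commit does:

import pytest
import torch

# Sketch of a conditional skip (not used in this commit): the test still
# runs on machines that actually expose 8 or more GPUs.
@pytest.mark.skipif(torch.cuda.device_count() < 8,
                    reason="This test requires 8 GPUs to execute")
def test_hybrid_parallel():
    ...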
tests/test_layers/test_3d/test_3d.py

@@ -51,6 +51,7 @@ def check_layer_and_operation(rank, world_size, port):

 @pytest.mark.dist
+@pytest.mark.skip("This test requires 8 GPUs to execute")
 @rerun_if_address_is_in_use()
 def test_3d():
     world_size = 8
tests/test_tensor/test_model.py

@@ -328,7 +328,6 @@ def run_model_dist(rank, world_size, port):

 @pytest.mark.dist
 @pytest.mark.parametrize('world_size', [1, 4])
-# @parameterize('world_size', [1, 4])
 @rerun_if_address_is_in_use()
 def test_model(world_size):
     run_func = partial(run_model_dist, world_size=world_size, port=free_port())
tests/test_utils/test_checkpoint/test_checkpoint_1d.py

@@ -67,6 +67,7 @@ def check_checkpoint_1d(rank, world_size, port):

 @pytest.mark.dist
+@pytest.mark.skip("This test should be invoked with 8 GPUs")
 @rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
 def test_checkpoint_1d():
     world_size = 8
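The four checkpoint tests in this commit all share the same launcher shape: spawn world_size processes, hand each one its rank plus a rendezvous port, and let rerun_on_exception retry the run if the port races with another test. A minimal sketch of that shape; the per-rank check is a stand-in for the real check_checkpoint_* functions, and the fixed port replaces the free_port() call from the tests:

from functools import partial

import torch.multiprocessing as mp

def check_checkpoint(rank, world_size, port):
    # Stand-in for the real per-rank checks (check_checkpoint_1d/2d/2p5d/3d).
    print(f"rank {rank}/{world_size} would initialize on port {port}")

def launch_checkpoint_test():
    world_size = 8
    run_func = partial(check_checkpoint, world_size=world_size, port=29500)
    mp.spawn(run_func, nprocs=world_size)  # mp.spawn passes the rank as the first argument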
tests/test_utils/test_checkpoint/test_checkpoint_2d.py

@@ -67,6 +67,7 @@ def check_checkpoint_2d(rank, world_size, port):

 @pytest.mark.dist
+@pytest.mark.skip("This test should be invoked with 8 GPUs")
 @rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
 def test_checkpoint_2d():
     world_size = 8
tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py

@@ -67,6 +67,7 @@ def check_checkpoint_2p5d(rank, world_size, port):

 @pytest.mark.dist
+@pytest.mark.skip("This test should be invoked with 8 GPUs")
 @rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
 def test_checkpoint_2p5d():
     world_size = 8
tests/test_utils/test_checkpoint/test_checkpoint_3d.py

@@ -67,6 +67,7 @@ def check_checkpoint_3d(rank, world_size, port):

 @pytest.mark.dist
+@pytest.mark.skip("This test requires 8 GPUs to execute")
 @rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
 def test_checkpoint_3d():
     world_size = 8
tests/test_utils/test_memory.py

@@ -22,7 +22,7 @@ def run_dist(rank, world_size, port):

 @pytest.mark.dist
-@pytest.mark.parametrize("world_size", [4, 5])
+@pytest.mark.parametrize("world_size", [3, 4])
 def test_memory_utils(world_size):
     run_func = partial(run_dist, world_size=world_size, port=free_port())
     mp.spawn(run_func, nprocs=world_size)
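test_memory_utils and test_zero_tensor_utils below both draw a fresh rendezvous port per run via free_port(); together with the rerun decorators this mitigates "Address already in use" races between tests. A plausible sketch of such a helper, under the assumption that it binds port 0 and lets the OS pick; the real implementation lives in the project's utilities:

import socket

def free_port():
    # Assumed behavior of the free_port() helper used in these tests:
    # bind to port 0 so the OS assigns an unused port, then release it.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        return s.getsockname()[1]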
tests/test_zero/test_tensor_utils.py

@@ -85,7 +85,7 @@ def run_dist(rank, world_size, port):

 @pytest.mark.dist
-@pytest.mark.parametrize("world_size", [4, 5])
+@pytest.mark.parametrize("world_size", [2, 4])
 @rerun_if_address_is_in_use()
 def test_zero_tensor_utils(world_size):
     run_func = partial(run_dist, world_size=world_size, port=free_port())