OpenDAS / ColossalAI · Commits · edf94a35

Unverified commit edf94a35, authored Jan 10, 2024 by Frank Lee; committed by GitHub on Jan 10, 2024.

[workflow] fixed build CI (#5240)

* [workflow] fixed build CI
* polish
* polish
* polish
* polish
* polish

Parent: 41e52c1c
Showing 14 changed files with 101 additions and 156 deletions (+101 −156).
.github/workflows/build_on_pr.yml                                       +16 −120
.github/workflows/build_on_schedule.yml                                  +9 −6
.github/workflows/doc_test_on_schedule.yml                               +1 −1
tests/kit/model_zoo/__init__.py                                         +30 −2
tests/kit/model_zoo/registry.py                                         +10 −7
tests/test_booster/test_plugin/test_gemini_plugin.py                     +2 −2
tests/test_booster/test_plugin/test_low_level_zero_plugin.py             +7 −2
tests/test_booster/test_plugin/test_torch_ddp_plugin.py                  +7 −2
tests/test_booster/test_plugin/test_torch_fsdp_plugin.py                 +7 −2
tests/test_checkpoint_io/test_gemini_checkpoint_io.py                    +7 −7
tests/test_checkpoint_io/test_gemini_torch_compability.py                +1 −1
tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py    +1 −1
tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py       +1 −1
tests/test_lazy/test_models.py                                           +2 −2
.github/workflows/build_on_pr.yml

@@ -22,57 +22,6 @@ on:
   delete:
 
 jobs:
-  prepare_cache:
-    name: Prepare testmon cache
-    if: |
-      github.event_name == 'create' &&
-      github.event.ref_type == 'branch' &&
-      github.event.repository.full_name == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
-    container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
-      options: --rm
-    timeout-minutes: 5
-    defaults:
-      run:
-        shell: bash
-    steps:
-      - name: Copy testmon cache
-        run: |
-          # branch name may contain slash, we need to replace it with space
-          export REF_BRANCH=$(echo ${{ github.event.ref }} | sed "s/\// /")
-          if [ -d /github/home/testmon_cache/${MAIN_BRANCH} ]; then
-            cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}"
-          fi
-        env:
-          MAIN_BRANCH: ${{ github.event.master_branch }}
-
-  prepare_cache_for_pr:
-    name: Prepare testmon cache for PR
-    if: |
-      github.event_name == 'pull_request' &&
-      (github.event.action == 'opened' || github.event.action == 'reopened' || (github.event.action == 'edited' && github.event.changes.base != null)) &&
-      github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
-    container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
-      options: --rm
-    timeout-minutes: 5
-    defaults:
-      run:
-        shell: bash
-    concurrency:
-      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-repare-cache
-      cancel-in-progress: true
-    steps:
-      - name: Copy testmon cache
-        run: |
-          # branch name may contain slash, we need to replace it with space
-          export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
-          if [ -d "/github/home/testmon_cache/${BASE}" ] && [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ]; then
-            mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER} && cp -p -r "/github/home/testmon_cache/${BASE}"/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}
-          fi
-        env:
-          PR_NUMBER: ${{ github.event.number }}
-
   detect:
     name: Detect file change
     if: |

@@ -140,7 +89,7 @@ jobs:
     if: needs.detect.outputs.anyLibraryFileChanged == 'true'
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
+      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
       options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
     timeout-minutes: 60
     defaults:

@@ -174,6 +123,7 @@ jobs:
         run: |
           cd TensorNVMe
           cp -p -r ./build /github/home/tensornvme_cache/
+          cp -p -r ./cmake-build /github/home/tensornvme_cache/
       - name: Checkout Colossal-AI
         uses: actions/checkout@v2

@@ -198,31 +148,27 @@ jobs:
           # -p flag is required to preserve the file timestamp to avoid ninja rebuild
           cp -p -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
-      - name: Restore Testmon Cache
-        run: |
-          if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
-            cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* /__w/ColossalAI/ColossalAI/
-          fi
-        env:
-          PR_NUMBER: ${{ github.event.number }}
       - name: Execute Unit Testing
         run: |
-          CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest -m "not largedist" --testmon --testmon-forceselect --testmon-cov=. --durations=10 tests/
+          CURL_CA_BUNDLE="" PYTHONPATH=$PWD FAST_TEST=1 pytest \
+            -m "not largedist" \
+            --durations=0 \
+            --ignore tests/test_analyzer \
+            --ignore tests/test_auto_parallel \
+            --ignore tests/test_fx \
+            --ignore tests/test_autochunk \
+            --ignore tests/test_gptq \
+            --ignore tests/test_infer_ops \
+            --ignore tests/test_legacy \
+            --ignore tests/test_moe \
+            --ignore tests/test_smoothquant \
+            --ignore tests/test_checkpoint_io \
+            tests/
         env:
+          DATA: /data/scratch/cifar-10
           NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
-          TESTMON_CORE_PKGS: /__w/ColossalAI/ColossalAI/requirements/requirements.txt,/__w/ColossalAI/ColossalAI/requirements/requirements-test.txt
           LLAMA_PATH: /data/scratch/llama-tiny
-      - name: Store Testmon Cache
-        run: |
-          mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER}
-          cp -p -r /__w/ColossalAI/ColossalAI/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}/
-        env:
-          PR_NUMBER: ${{ github.event.number }}
       - name: Collate artifact
         env:
           PR_NUMBER: ${{ github.event.number }}

@@ -260,53 +206,3 @@ jobs:
           name: report
           path: report/
-
-  store_cache:
-    name: Store testmon cache for PR
-    if: |
-      github.event_name == 'pull_request' &&
-      github.event.action == 'closed' &&
-      github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
-    container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
-      options: --rm
-    timeout-minutes: 5
-    defaults:
-      run:
-        shell: bash
-    steps:
-      - name: Store testmon cache if possible
-        if: github.event.pull_request.merged == true
-        run: |
-          # branch name may contain slash, we need to replace it with space
-          export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
-          if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
-            cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/"
-          fi
-        env:
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-      - name: Remove testmon cache
-        run: |
-          rm -rf /github/home/testmon_cache/_pull/${PR_NUMBER}
-        env:
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-
-  remove_cache:
-    name: Remove testmon cache
-    if: |
-      github.event_name == 'delete' &&
-      github.event.ref_type == 'branch' &&
-      github.event.repository.full_name == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
-    container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
-      options: --rm
-    timeout-minutes: 5
-    defaults:
-      run:
-        shell: bash
-    steps:
-      - name: Remove testmon cache
-        run: |
-          # branch name may contain slash, we need to replace it with space
-          export BASE=$(echo ${{ github.event.ref }} | sed "s/\// /")
-          rm -rf "/github/home/testmon_cache/${BASE}"
.github/workflows/build_on_schedule.yml

@@ -10,20 +10,20 @@ jobs:
   build:
     name: Build and Test Colossal-AI
     if: github.repository == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, 8-gpu]
+    runs-on: [self-hosted, gpu]
     container:
       image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
       options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
-    timeout-minutes: 40
+    timeout-minutes: 90
     steps:
       - name: Check GPU Availability # ensure all GPUs have enough memory
         id: check-avai
         run: |
           avai=true
-          for i in $(seq 0 7);
+          for i in $(seq 0 3);
           do
             gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
-            [ "$gpu_used" -gt "10000" ] && avai=false
+            [ "$gpu_used" -gt "2000" ] && avai=false
           done
           echo "GPU is available: $avai"

@@ -60,9 +60,12 @@ jobs:
       - name: Unit Testing
         if: steps.check-avai.outputs.avai == 'true'
         run: |
-          PYTHONPATH=$PWD pytest --durations=0 tests
+          PYTHONPATH=$PWD pytest \
+            -m "not largedist" \
+            --durations=0 \
+            tests/
         env:
+          DATA: /data/scratch/cifar-10
+          NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
           LLAMA_PATH: /data/scratch/llama-tiny
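The availability gate now loops over GPUs 0–3 and bails out if any device reports more than 2000 MiB in use, and its result is later read as `steps.check-avai.outputs.avai`. A hedged Python equivalent for checking the same condition locally (assumes `nvidia-smi` is on PATH; the CI step itself is bash):

import subprocess

def gpus_available(num_gpus: int = 4, max_used_mib: int = 2000) -> bool:
    """Return True if every GPU is using less than max_used_mib MiB of memory."""
    for i in range(num_gpus):
        out = subprocess.check_output(
            ["nvidia-smi", "-i", str(i),
             "--query-gpu=memory.used", "--format=csv,noheader,nounits"],
            text=True,
        )
        if int(out.strip()) > max_used_mib:
            return False
    return True

print("GPU is available:", gpus_available())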
.github/workflows/doc_test_on_schedule.yml

@@ -12,7 +12,7 @@ jobs:
     name: Test the changed Doc
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
+      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
       options: --gpus all --rm
     timeout-minutes: 60
     steps:
tests/kit/model_zoo/__init__.py

-from . import custom, diffusers, timm, torchaudio, torchrec, torchvision, transformers
+import os
+
+from . import custom, diffusers, timm, torchaudio, torchvision, transformers
 from .executor import run_fwd, run_fwd_bwd
 from .registry import model_zoo
 
-__all__ = ["model_zoo", "run_fwd", "run_fwd_bwd"]
+# We pick a subset of models for fast testing in order to reduce the total testing time
+COMMON_MODELS = [
+    'custom_hanging_param_model',
+    'custom_nested_model',
+    'custom_repeated_computed_layers',
+    'custom_simple_net',
+    'diffusers_clip_text_model',
+    'diffusers_auto_encoder_kl',
+    'diffusers_unet2d_model',
+    'timm_densenet',
+    'timm_resnet',
+    'timm_swin_transformer',
+    'torchaudio_wav2vec2_base',
+    'torchaudio_conformer',
+    'transformers_bert_for_masked_lm',
+    'transformers_bloom_for_causal_lm',
+    'transformers_falcon_for_causal_lm',
+    'transformers_chatglm_for_conditional_generation',
+    'transformers_llama_for_casual_lm',
+    'transformers_vit_for_masked_image_modeling',
+    'transformers_mistral_for_casual_lm'
+]
+
+IS_FAST_TEST = os.environ.get('FAST_TEST', '0') == '1'
+
+__all__ = ["model_zoo", "run_fwd", "run_fwd_bwd", 'COMMON_MODELS', 'IS_FAST_TEST']
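`IS_FAST_TEST` is derived from the `FAST_TEST` environment variable that the PR workflow now exports, and the test files changed below all adopt the same selection pattern. Condensed here as a sketch of that shared pattern, not a new API:

from tests.kit.model_zoo import model_zoo, COMMON_MODELS, IS_FAST_TEST

# On PR runs (FAST_TEST=1) only the curated subset is exercised;
# scheduled runs still iterate over the full zoo.
registry = model_zoo.get_sub_registry(COMMON_MODELS) if IS_FAST_TEST else model_zoo
for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
    model = model_fn()  # build and exercise each selected model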
tests/kit/model_zoo/registry.py

 #!/usr/bin/env python
 from dataclasses import dataclass
-from typing import Callable
+from typing import Callable, List, Union
 
 __all__ = ["ModelZooRegistry", "ModelAttribute", "model_zoo"]

@@ -61,7 +61,7 @@ class ModelZooRegistry(dict):
         """
         self[name] = (model_fn, data_gen_fn, output_transform_fn, loss_fn, model_attribute)
 
-    def get_sub_registry(self, keyword: str):
+    def get_sub_registry(self, keyword: Union[str, List[str]]):
         """
         Get a sub registry with models that contain the keyword.

@@ -70,12 +70,15 @@ class ModelZooRegistry(dict):
         """
         new_dict = dict()
 
+        if isinstance(keyword, str):
+            keyword_list = [keyword]
+        else:
+            keyword_list = keyword
+        assert isinstance(keyword_list, (list, tuple))
+
         for k, v in self.items():
-            if keyword == "transformers_gpt":
-                # ensure GPT2 does not retrieve GPTJ models
-                if keyword in k and not "gptj" in k:
+            for kw in keyword_list:
+                if kw in k:
                     new_dict[k] = v
-            else:
-                if keyword in k:
-                    new_dict[k] = v
 
         assert len(new_dict) > 0, f"No model found with keyword {keyword}"
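A usage sketch for the widened signature (model names taken from COMMON_MODELS above):

# Single keyword, as before:
vision_models = model_zoo.get_sub_registry("torchvision")

# New: a list of keywords, which is what passing COMMON_MODELS relies on:
fast_models = model_zoo.get_sub_registry(["timm_resnet", "torchaudio_conformer"])

Note that the GPT-2/GPT-J special case was dropped: a plain "transformers_gpt" keyword now also matches GPT-J entries, so callers that need the old exclusion must pass more specific names.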
tests/test_booster/test_plugin/test_gemini_plugin.py

@@ -13,7 +13,7 @@ from colossalai.lazy.lazy_init import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.tensor.colo_parameter import ColoParameter
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, COMMON_MODELS, IS_FAST_TEST
 
 def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, tp_size) -> Optional[str]:

@@ -66,7 +66,7 @@ def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, t
 # @parameterize('init_method', ['lazy', 'none', 'colo'])
-@parameterize("subset", ["torchvision", "transformers", "diffusers"])
+@parameterize("subset", [COMMON_MODELS] if IS_FAST_TEST else ["torchvision", "transformers", "diffusers"])
 @parameterize("init_method", ["none"])
 @parameterize("zero_size", [2])
 @parameterize("tp_size", [2])
tests/test_booster/test_plugin/test_low_level_zero_plugin.py

@@ -11,7 +11,7 @@ from colossalai.booster.plugin import LowLevelZeroPlugin
 # from colossalai.nn.optimizer import HybridAdam
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS
 
 # These models are not compatible with AMP
 _AMP_ERR_MODELS = ["timm_convit", "deepfm_interactionarch"]

@@ -62,7 +62,12 @@ def check_low_level_zero_plugin(stage: int, early_stop: bool = True):
     ignore_models = _AMP_ERR_MODELS + _LOW_LEVEL_ZERO_ERR_MODELS + _STUCK_MODELS
     skipped_models = []
 
-    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items():
+    if IS_FAST_TEST:
+        registry = model_zoo.get_sub_registry(COMMON_MODELS)
+    else:
+        registry = model_zoo
+
+    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
         # FIXME(ver217): fix these models
         if name in ignore_models:
             skipped_models.append(name)
tests/test_booster/test_plugin/test_torch_ddp_plugin.py

@@ -11,7 +11,7 @@ from colossalai.booster import Booster
 from colossalai.booster.plugin import TorchDDPPlugin
 from colossalai.interface import OptimizerWrapper
 from colossalai.testing import rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS
 
 def run_fn(model_fn, data_gen_fn, output_transform_fn):

@@ -40,7 +40,12 @@ def run_fn(model_fn, data_gen_fn, output_transform_fn):
 def check_torch_ddp_plugin():
-    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items():
+    if IS_FAST_TEST:
+        registry = model_zoo.get_sub_registry(COMMON_MODELS)
+    else:
+        registry = model_zoo
+
+    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
         if name == "dlrm_interactionarch":
             continue
         run_fn(model_fn, data_gen_fn, output_transform_fn)
tests/test_booster/test_plugin/test_torch_fsdp_plugin.py

@@ -12,7 +12,7 @@ if version.parse(torch.__version__) >= version.parse("1.12.0"):
 from colossalai.interface import OptimizerWrapper
 from colossalai.testing import rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS
 
 # test basic fsdp function

@@ -42,7 +42,12 @@ def run_fn(model_fn, data_gen_fn, output_transform_fn):
 def check_torch_fsdp_plugin():
-    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items():
+    if IS_FAST_TEST:
+        registry = model_zoo.get_sub_registry(COMMON_MODELS)
+    else:
+        registry = model_zoo
+
+    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
         if any(
             element in name
             for element in [
tests/test_checkpoint_io/test_gemini_checkpoint_io.py

@@ -7,6 +7,7 @@ from transformers import LlamaForCausalLM
 from utils import shared_tempdir
 
 import colossalai
+from colossalai.testing import skip_if_not_enough_gpus
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin
 from colossalai.lazy import LazyInitContext

@@ -68,7 +69,7 @@ def exam_state_dict_with_origin(placement_config, model_name, use_safetensors: b
 @clear_cache_before_run()
 @parameterize("placement_config", OPTIM_PLACEMENT_CONFIGS)
 @parameterize("shard", [True, False])
-@parameterize("model_name", ["transformers_gpt"])
+@parameterize("model_name", ["transformers_llama_for_casual_lm"])
 @parameterize("size_per_shard", [32])
 @parameterize("tp_size", [1, 2])
 @parameterize("zero_size", [2])

@@ -156,13 +157,12 @@ def run_dist(rank, world_size, port):
 @pytest.mark.dist
-@pytest.mark.parametrize("world_size", [4])
 @rerun_if_address_is_in_use()
-def test_gemini_ckpIO(world_size):
-    spawn(run_dist, world_size)
+def test_gemini_ckpIO():
+    spawn(run_dist, 4)
 
 @pytest.mark.largedist
-@pytest.mark.parametrize("world_size", [8])
+@skip_if_not_enough_gpus(min_gpus=8)
 @rerun_if_address_is_in_use()
-def test_gemini_ckpIO_3d(world_size):
-    spawn(run_dist, world_size)
+def test_gemini_ckpIO_3d():
+    spawn(run_dist, 8)
\ No newline at end of file
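The new @skip_if_not_enough_gpus(min_gpus=8) decorator replaces the fixed world_size parametrization, so the 8-GPU variant skips gracefully on smaller runners instead of failing. A minimal sketch of what such a decorator could look like — colossalai.testing ships the real implementation, which may differ:

import functools

import pytest
import torch

def skip_if_not_enough_gpus(min_gpus: int):
    """Skip the wrapped test when fewer than `min_gpus` CUDA devices are visible."""
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            if torch.cuda.device_count() < min_gpus:
                pytest.skip(f"requires at least {min_gpus} GPUs")
            return fn(*args, **kwargs)
        return wrapper
    return decorator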
tests/test_checkpoint_io/test_gemini_torch_compability.py

@@ -20,7 +20,7 @@ from tests.kit.model_zoo import model_zoo
 @clear_cache_before_run()
 @parameterize("shard", [False, True])
-@parameterize("model_name", ["transformers_gpt"])
+@parameterize("model_name", ["transformers_llama_for_casual_lm"])
 def exam_torch_load_from_gemini(shard: bool, model_name: str):
     (model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values()))
     criterion = lambda x: x.mean()
tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py

@@ -40,7 +40,7 @@ else:
 @clear_cache_before_run()
 @parameterize("shard", [True, False])
-@parameterize("model_name", ["transformers_gpt"])
+@parameterize("model_name", ["transformers_llama_for_casual_lm"])
 @parameterize("size_per_shard", [32])
 @parameterize("test_config", TEST_CONFIGS)
 def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_config: dict):
tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py

@@ -18,7 +18,7 @@ from tests.kit.model_zoo import model_zoo
 @clear_cache_before_run()
-@parameterize("model_name", ["transformers_gpt"])
+@parameterize("model_name", ["transformers_llama_for_casual_lm"])
 @parameterize("plugin_type", ["ddp", "zero", "gemini"])
 def exam_from_pretrained(plugin_type: str, model_name: str, shard=True, size_per_shard=32):
     (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) = next(
tests/test_lazy/test_models.py

 import pytest
 from lazy_init_utils import SUPPORT_LAZY, check_lazy_init
 
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS
 
 @pytest.mark.skipif(not SUPPORT_LAZY, reason="requires torch >= 1.12.0")
-@pytest.mark.parametrize("subset", ["torchvision", "diffusers", "timm", "transformers", "torchaudio", "deepfm", "dlrm"])
+@pytest.mark.parametrize("subset", [COMMON_MODELS] if IS_FAST_TEST else ["torchvision", "diffusers", "timm", "transformers", "torchaudio", "deepfm", "dlrm"])
 @pytest.mark.parametrize("default_device", ["cpu", "cuda"])
 def test_torchvision_models_lazy_init(subset, default_device):
     sub_model_zoo = model_zoo.get_sub_registry(subset)
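To reproduce the fast selection locally, FAST_TEST must be set before pytest imports the model zoo, since the flag is read at import time in tests/kit/model_zoo/__init__.py. A hedged sketch of a local driver:

import os
import subprocess

# FAST_TEST is read at import time, so export it before launching pytest.
os.environ["FAST_TEST"] = "1"
subprocess.run(
    ["pytest", "-m", "not largedist", "tests/test_lazy/test_models.py"],
    check=True,
)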