OpenDAS / ColossalAI · Commits

Commit d5eeeb14 (unverified), authored Jan 11, 2024 by Frank Lee, committed by GitHub on Jan 11, 2024
Parent: edf94a35

[ci] fixed booster test (#5251)

* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed booster test
Showing 5 changed files with 12 additions and 14 deletions.
.github/workflows/build_on_pr.yml (+1 -3)
.github/workflows/build_on_schedule.yml (+5 -4)
tests/kit/model_zoo/transformers/chatglm2.py (+0 -1)
tests/test_booster/test_plugin/test_3d_plugin.py (+2 -2)
tests/test_booster/test_plugin/test_gemini_plugin.py (+4 -4)
.github/workflows/build_on_pr.yml

@@ -90,7 +90,7 @@ jobs:
     runs-on: [self-hosted, gpu]
     container:
       image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
-      options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
+      options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
     timeout-minutes: 60
     defaults:
       run:
@@ -165,7 +165,6 @@ jobs:
           --ignore tests/test_checkpoint_io \
           tests/
         env:
-          NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
           LLAMA_PATH: /data/scratch/llama-tiny
@@ -205,4 +204,3 @@ jobs:
         with:
           name: report
           path: report/
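The two edits above work together: mounting a volume at /dev/shm gives the container far more shared memory than Docker's 64 MiB default, so NCCL's shared-memory transport no longer needs to be switched off with NCCL_SHM_DISABLE=1. As a hypothetical sanity check (not part of this commit), the available shared memory could be inspected from Python like so:

# Hypothetical helper, not part of this commit: report the size of /dev/shm,
# which NCCL's shared-memory transport relies on inside the container.
import os

def shm_size_gib(path: str = "/dev/shm") -> float:
    """Return the size of the filesystem backing `path`, in GiB."""
    st = os.statvfs(path)
    return st.f_frsize * st.f_blocks / 2**30

if __name__ == "__main__":
    size = shm_size_gib()
    # Docker's default /dev/shm is only 64 MiB, which is too small for NCCL;
    # mounting a volume there (as the workflow now does) lifts that limit.
    print(f"/dev/shm: {size:.2f} GiB {'(OK)' if size >= 1.0 else '(too small?)'}")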
.github/workflows/build_on_schedule.yml

@@ -13,15 +13,16 @@ jobs:
     runs-on: [self-hosted, gpu]
     container:
       image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
-      options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
+      options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
     timeout-minutes: 90
     steps:
       - name: Check GPU Availability # ensure all GPUs have enough memory
         id: check-avai
         run: |
           avai=true
-          for i in $(seq 0 3);
-          do
+          ngpu=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+          endIndex=$(($ngpu-1))
+          for i in $(seq 0 $endIndex);
+          do
             gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
             [ "$gpu_used" -gt "2000" ] && avai=false
           done
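The availability check previously assumed exactly four GPUs (seq 0 3); it now derives the device count from nvidia-smi, so the scheduled job also works on runners with a different number of GPUs. A rough Python equivalent of the updated shell logic (hypothetical helper name, and it assumes nvidia-smi is on PATH) would be:

# Sketch of the same idea the workflow implements in shell: count the GPUs
# instead of hardcoding four, then flag any GPU that is already in use.
import subprocess

def busy_gpus(threshold_mib: int = 2000) -> list[int]:
    """Indices of GPUs already using more than `threshold_mib` MiB of memory."""
    out = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"],
        text=True,
    )
    # nvidia-smi prints one line per GPU, so the line count doubles as the GPU count.
    used = [int(line) for line in out.splitlines() if line.strip()]
    return [i for i, mib in enumerate(used) if mib > threshold_mib]

if __name__ == "__main__":
    busy = busy_gpus()
    print("all GPUs available" if not busy else f"GPUs {busy} look busy")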
@@ -74,7 +75,7 @@ jobs:
         if: ${{ failure() }}
         run: |
           url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
-          msg="Scheduled Build and Test failed on 8 GPUs, please visit $url for details"
+          msg="Scheduled Build and Test failed, please visit $url for details"
           echo $msg
           python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u $WEBHOOK_URL
         env:
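On failure, the job posts a message to a Lark webhook through the repository's send_message_to_lark.py script, now without the hardcoded "on 8 GPUs" wording. For illustration only, a minimal script with the same -m/-u interface might look like the following; the payload shape assumes Lark's standard custom-bot webhook API, and the repository's actual script may differ:

# Minimal sketch of posting a text message to a Lark custom-bot webhook.
# The JSON payload format is an assumption based on Lark's documented
# incoming-webhook API, not taken from the repository's script.
import argparse
import json
import urllib.request

def send_message(message: str, webhook_url: str) -> None:
    payload = {"msg_type": "text", "content": {"text": message}}
    req = urllib.request.Request(
        webhook_url,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    urllib.request.urlopen(req)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--message", required=True)
    parser.add_argument("-u", "--url", required=True)
    args = parser.parse_args()
    send_message(args.message, args.url)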
tests/kit/model_zoo/transformers/chatglm2.py

@@ -2,7 +2,6 @@ import torch
 from colossalai.shardformer.modeling.chatglm2_6b.configuration_chatglm import ChatGLMConfig
 from colossalai.shardformer.modeling.chatglm2_6b.modeling_chatglm import ChatGLMForConditionalGeneration, ChatGLMModel
 from ..registry import ModelAttribute, model_zoo

 # ================================
tests/test_booster/test_plugin/test_3d_plugin.py

@@ -10,10 +10,11 @@ from colossalai.booster.plugin import HybridParallelPlugin
 from colossalai.fx import is_compatible_with_meta
 from colossalai.lazy.lazy_init import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
 from tests.kit.model_zoo import model_zoo


+@clear_cache_before_run()
 def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn) -> Optional[str]:
     try:
         if init_method == "lazy":
@@ -69,7 +70,6 @@ def check_3d_plugin(init_method: str = "none", early_stop: bool = True):
         "transformers_llama_for_casual_lm"
     ).items():
         err = run_fn(init_method, model_fn, data_gen_fn, output_transform_fn)
-        torch.cuda.empty_cache()
         if err is None:
             passed_models.append(name)
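Here, and in test_gemini_plugin.py below, the per-iteration torch.cuda.empty_cache() call is dropped in favor of decorating run_fn with clear_cache_before_run() from colossalai.testing, which centralizes the cleanup at the start of each call. As a rough sketch of the idea (this is not ColossalAI's actual implementation, which may do more or less), such a decorator could be written as:

# Hypothetical sketch of a clear_cache_before_run()-style decorator;
# ColossalAI's real implementation may differ.
import functools
import gc
import torch

def clear_cache_before_run():
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            gc.collect()              # drop unreachable Python objects first
            torch.cuda.empty_cache()  # then release cached CUDA allocator blocks
            return fn(*args, **kwargs)
        return wrapper
    return decorator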
tests/test_booster/test_plugin/test_gemini_plugin.py

@@ -12,10 +12,11 @@ from colossalai.fx import is_compatible_with_meta
 from colossalai.lazy.lazy_init import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.tensor.colo_parameter import ColoParameter
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo, COMMON_MODELS, IS_FAST_TEST
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
+from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo


+@clear_cache_before_run()
 def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, tp_size) -> Optional[str]:
     try:
         if init_method == "lazy":
@@ -145,7 +146,6 @@ def check_gemini_plugin(
             tp_size = 1

         err = run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, tp_size)
-        torch.cuda.empty_cache()
         if err is None:
             passed_models.append(name)
         else: