OpenDAS / ColossalAI · Commits

Commit d5eeeb14 (unverified), authored Jan 11, 2024 by Frank Lee, committed by GitHub on Jan 11, 2024
Parent: edf94a35

[ci] fixed booster test (#5251)

* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed booster test
Showing 5 changed files with 12 additions and 14 deletions.
.github/workflows/build_on_pr.yml (+1 -3)
.github/workflows/build_on_schedule.yml (+5 -4)
tests/kit/model_zoo/transformers/chatglm2.py (+0 -1)
tests/test_booster/test_plugin/test_3d_plugin.py (+2 -2)
tests/test_booster/test_plugin/test_gemini_plugin.py (+4 -4)
.github/workflows/build_on_pr.yml

@@ -90,7 +90,7 @@ jobs:
     runs-on: [self-hosted, gpu]
     container:
       image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
-      options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
+      options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
     timeout-minutes: 60
     defaults:
       run:
@@ -165,7 +165,6 @@ jobs:
           --ignore tests/test_checkpoint_io \
           tests/
         env:
-          NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
           LLAMA_PATH: /data/scratch/llama-tiny
@@ -205,4 +204,3 @@ jobs:
         with:
           name: report
           path: report/
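The two edits above work together: mounting a volume at /dev/shm gives the container far more shared memory than Docker's 64 MiB default, so NCCL's shared-memory transport no longer needs to be switched off with NCCL_SHM_DISABLE=1. As a hypothetical sanity check (not part of this commit), the available shared memory could be inspected from Python like so:

# Hypothetical helper, not part of this commit: report the size of /dev/shm,
# which NCCL's shared-memory transport relies on inside the container.
import os

def shm_size_gib(path: str = "/dev/shm") -> float:
    """Return the size of the filesystem backing `path`, in GiB."""
    st = os.statvfs(path)
    return st.f_frsize * st.f_blocks / 2**30

if __name__ == "__main__":
    size = shm_size_gib()
    # Docker's default /dev/shm is only 64 MiB, which is too small for NCCL;
    # mounting a volume there (as the workflow now does) lifts that limit.
    print(f"/dev/shm: {size:.2f} GiB {'(OK)' if size >= 1.0 else '(too small?)'}")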
.github/workflows/build_on_schedule.yml

@@ -13,15 +13,16 @@ jobs:
     runs-on: [self-hosted, gpu]
     container:
       image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
-      options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
+      options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
     timeout-minutes: 90
     steps:
       - name: Check GPU Availability # ensure all GPUs have enough memory
         id: check-avai
         run: |
           avai=true
-          for i in $(seq 0 3);
-          do
+          ngpu=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+          endIndex=$(($ngpu-1))
+          for i in $(seq 0 $endIndex);
+          do
             gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
             [ "$gpu_used" -gt "2000" ] && avai=false
           done
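The availability check previously assumed exactly four GPUs (seq 0 3); it now derives the device count from nvidia-smi, so the scheduled job also works on runners with a different number of GPUs. A rough Python equivalent of the updated shell logic (hypothetical helper name, and it assumes nvidia-smi is on PATH) would be:

# Sketch of the same idea the workflow implements in shell: count the GPUs
# instead of hardcoding four, then flag any GPU that is already in use.
import subprocess

def busy_gpus(threshold_mib: int = 2000) -> list[int]:
    """Indices of GPUs already using more than `threshold_mib` MiB of memory."""
    out = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"],
        text=True,
    )
    # nvidia-smi prints one line per GPU, so the line count doubles as the GPU count.
    used = [int(line) for line in out.splitlines() if line.strip()]
    return [i for i, mib in enumerate(used) if mib > threshold_mib]

if __name__ == "__main__":
    busy = busy_gpus()
    print("all GPUs available" if not busy else f"GPUs {busy} look busy")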
@@ -74,7 +75,7 @@ jobs:
         if: ${{ failure() }}
         run: |
           url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
-          msg="Scheduled Build and Test failed on 8 GPUs, please visit $url for details"
+          msg="Scheduled Build and Test failed, please visit $url for details"
           echo $msg
           python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u $WEBHOOK_URL
         env:
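On failure, the job posts a message to a Lark webhook through the repository's send_message_to_lark.py script, now without the hardcoded "on 8 GPUs" wording. For illustration only, a minimal script with the same -m/-u interface might look like the following; the payload shape assumes Lark's standard custom-bot webhook API, and the repository's actual script may differ:

# Minimal sketch of posting a text message to a Lark custom-bot webhook.
# The JSON payload format is an assumption based on Lark's documented
# incoming-webhook API, not taken from the repository's script.
import argparse
import json
import urllib.request

def send_message(message: str, webhook_url: str) -> None:
    payload = {"msg_type": "text", "content": {"text": message}}
    req = urllib.request.Request(
        webhook_url,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    urllib.request.urlopen(req)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--message", required=True)
    parser.add_argument("-u", "--url", required=True)
    args = parser.parse_args()
    send_message(args.message, args.url)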
tests/kit/model_zoo/transformers/chatglm2.py

@@ -2,7 +2,6 @@ import torch
 from colossalai.shardformer.modeling.chatglm2_6b.configuration_chatglm import ChatGLMConfig
 from colossalai.shardformer.modeling.chatglm2_6b.modeling_chatglm import ChatGLMForConditionalGeneration, ChatGLMModel
 from ..registry import ModelAttribute, model_zoo

 # ================================
tests/test_booster/test_plugin/test_3d_plugin.py

@@ -10,10 +10,11 @@ from colossalai.booster.plugin import HybridParallelPlugin
 from colossalai.fx import is_compatible_with_meta
 from colossalai.lazy.lazy_init import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
 from tests.kit.model_zoo import model_zoo


+@clear_cache_before_run()
 def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn) -> Optional[str]:
     try:
         if init_method == "lazy":
@@ -69,7 +70,6 @@ def check_3d_plugin(init_method: str = "none", early_stop: bool = True):
         "transformers_llama_for_casual_lm"
     ).items():
         err = run_fn(init_method, model_fn, data_gen_fn, output_transform_fn)
-        torch.cuda.empty_cache()
         if err is None:
             passed_models.append(name)
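Here, and in test_gemini_plugin.py below, the per-iteration torch.cuda.empty_cache() call is dropped in favor of decorating run_fn with clear_cache_before_run() from colossalai.testing, which centralizes the cleanup at the start of each call. As a rough sketch of the idea (this is not ColossalAI's actual implementation, which may do more or less), such a decorator could be written as:

# Hypothetical sketch of a clear_cache_before_run()-style decorator;
# ColossalAI's real implementation may differ.
import functools
import gc
import torch

def clear_cache_before_run():
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            gc.collect()              # drop unreachable Python objects first
            torch.cuda.empty_cache()  # then release cached CUDA allocator blocks
            return fn(*args, **kwargs)
        return wrapper
    return decorator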
tests/test_booster/test_plugin/test_gemini_plugin.py

@@ -12,10 +12,11 @@ from colossalai.fx import is_compatible_with_meta
 from colossalai.lazy.lazy_init import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.tensor.colo_parameter import ColoParameter
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo, COMMON_MODELS, IS_FAST_TEST
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
+from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo


+@clear_cache_before_run()
 def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, tp_size) -> Optional[str]:
     try:
         if init_method == "lazy":
@@ -145,7 +146,6 @@ def check_gemini_plugin(
             tp_size = 1

         err = run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, tp_size)
-        torch.cuda.empty_cache()
         if err is None:
             passed_models.append(name)
         else: