Unverified Commit 9454f437 authored by Fanli Lin, committed by GitHub

[tests] make `TestDeepSpeedModelZoo` device-agnostic (#31402)

* fix

* use accelerator device count

* ci fix
parent 7977f206
@@ -2432,6 +2432,10 @@ if is_torch_available():
     BACKEND_MANUAL_SEED = {"cuda": torch.cuda.manual_seed, "cpu": torch.manual_seed, "default": torch.manual_seed}
     BACKEND_EMPTY_CACHE = {"cuda": torch.cuda.empty_cache, "cpu": None, "default": None}
     BACKEND_DEVICE_COUNT = {"cuda": torch.cuda.device_count, "cpu": lambda: 0, "default": lambda: 1}
+else:
+    BACKEND_MANUAL_SEED = {"default": None}
+    BACKEND_EMPTY_CACHE = {"default": None}
+    BACKEND_DEVICE_COUNT = {"default": lambda: 0}
 
 def backend_manual_seed(device: str, seed: int):
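These dispatch tables map a device-type string to a backend-specific hook, with a `"default"` entry for accelerators that have no dedicated handler. A minimal sketch of how a lookup helper such as `backend_device_count` can resolve them (illustrative only; the actual helpers in `transformers.testing_utils` may be implemented differently):

```python
import torch

# Illustrative sketch of the device-agnostic dispatch the tables above enable;
# not the library's exact implementation.
BACKEND_DEVICE_COUNT = {"cuda": torch.cuda.device_count, "cpu": lambda: 0, "default": lambda: 1}

def backend_device_count(device: str) -> int:
    # Pick the hook registered for this device type, falling back to "default"
    # for accelerators without a dedicated entry (e.g. custom backends).
    fn = BACKEND_DEVICE_COUNT.get(device, BACKEND_DEVICE_COUNT["default"])
    return fn()

print(backend_device_count("cuda" if torch.cuda.is_available() else "cpu"))
```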
@@ -23,12 +23,13 @@ from tests.trainer.test_trainer import TrainerIntegrationCommon  # noqa
 from transformers import is_torch_available
 from transformers.testing_utils import (
     TestCasePlus,
+    backend_device_count,
     execute_subprocess_async,
-    get_gpu_count,
     get_tests_dir,
     require_deepspeed,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
+    torch_device,
 )
 from transformers.trainer_utils import set_seed
@@ -143,7 +144,7 @@ def get_launcher(distributed=False):
     # - it won't be able to handle that
     # 2. for now testing with just 2 gpus max (since some quality tests may give different
     #    results with more gpus because we use very little data)
-    num_gpus = min(2, get_gpu_count()) if distributed else 1
+    num_gpus = min(2, backend_device_count(torch_device)) if distributed else 1
     master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
     return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split()
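For context, the launcher list returned above is prepended to a training command and run as a subprocess. A hypothetical usage sketch (the `DEFAULT_MASTER_PORT` value, script name, and flags below are assumptions, not the test suite's actual commands):

```python
import os

# Stand-alone sketch of how the launcher list gets consumed; values are made up.
DEFAULT_MASTER_PORT = "10999"  # assumed default, overridable via DS_TEST_PORT

def get_launcher(distributed=False):
    num_gpus = 2 if distributed else 1  # stand-in for the backend_device_count logic above
    master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
    return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split()

cmd = get_launcher(distributed=True) + ["run_translation.py", "--max_steps", "10"]
print(cmd)  # the test suite runs such a command via execute_subprocess_async
```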
@@ -326,7 +327,7 @@ params = list(itertools.product(stages, task_cmds.keys()))
 @slow
 @require_deepspeed
-@require_torch_gpu
+@require_torch_accelerator
 class TestDeepSpeedModelZoo(TestCasePlus):
     """This class is for testing via an external script - can do multiple gpus"""