Unverified Commit d787935a authored by Stas Bekman, committed by GitHub

[s2s] test_distributed_eval (#8315)


Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
parent 04e442d5
@@ -450,7 +450,8 @@ Inside tests:

 .. code-block:: bash

-    torch.cuda.device_count()
+    from transformers.testing_utils import get_gpu_count
+    n_gpu = get_gpu_count()  # works with torch and tf
...
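As a minimal sketch of why this helper is useful in tests, here is one hedged way a test could gate itself on the GPU count without touching torch or tf directly (the class and method names below are illustrative, not part of this commit):

    from unittest import TestCase, skipUnless

    from transformers.testing_utils import get_gpu_count


    class ExampleGpuTest(TestCase):
        # get_gpu_count() reports GPUs whether torch or tf is the installed backend,
        # so the same skip condition works in both environments
        @skipUnless(get_gpu_count() > 1, "test requires multiple GPUs")
        def test_needs_two_gpus(self):
            self.assertGreater(get_gpu_count(), 1)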
@@ -2,9 +2,9 @@ import os
 import sys
 from unittest.mock import patch

-from transformers import BertTokenizer, EncoderDecoderModel, is_torch_available
+from transformers import BertTokenizer, EncoderDecoderModel
 from transformers.file_utils import is_datasets_available
-from transformers.testing_utils import TestCasePlus, execute_subprocess_async, slow
+from transformers.testing_utils import TestCasePlus, execute_subprocess_async, get_gpu_count, slow
 from transformers.trainer_callback import TrainerState
 from transformers.trainer_utils import set_seed
@@ -13,9 +13,6 @@ from .seq2seq_trainer import Seq2SeqTrainer
 from .test_seq2seq_examples import MBART_TINY

-if is_torch_available():
-    import torch
-
 set_seed(42)
 MARIAN_MODEL = "sshleifer/student_marian_en_ro_6_1"
@@ -196,7 +193,7 @@ class TestFinetuneTrainer(TestCasePlus):
         """.split()
         # --eval_beams 2
-        n_gpu = torch.cuda.device_count()
+        n_gpu = get_gpu_count()
         if n_gpu > 1:
             distributed_args = f"""
                 -m torch.distributed.launch
...
@@ -3,7 +3,14 @@
 import os
 import sys

-from transformers.testing_utils import TestCasePlus, execute_subprocess_async, require_torch_multigpu
+from transformers.testing_utils import (
+    TestCasePlus,
+    execute_subprocess_async,
+    get_gpu_count,
+    require_torch_gpu,
+    require_torch_multigpu,
+    slow,
+)

 from .test_seq2seq_examples import CHEAP_ARGS, make_test_data_dir
 from .utils import load_json
@@ -80,3 +87,30 @@ class TestSummarizationDistillerMultiGPU(TestCasePlus):
         self.assertEqual(len(metrics["test"]), 1)
         desired_n_evals = int(args_d["max_epochs"] * (1 / args_d["val_check_interval"]) / 2 + 1)
         self.assertEqual(len(metrics["val"]), desired_n_evals)
+
+    @slow
+    @require_torch_gpu
+    def test_distributed_eval(self):
+        output_dir = self.get_auto_remove_tmp_dir()
+        args = f"""
+            --model_name Helsinki-NLP/opus-mt-en-ro
+            --save_dir {output_dir}
+            --data_dir test_data/wmt_en_ro
+            --num_beams 2
+            --task translation
+        """.split()
+
+        # we want this test to run even if there is only one GPU, but if there are more we use them all
+        n_gpu = get_gpu_count()
+        distributed_args = f"""
+            -m torch.distributed.launch
+            --nproc_per_node={n_gpu}
+            {self.test_file_dir}/run_distributed_eval.py
+        """.split()
+        cmd = [sys.executable] + distributed_args + args
+        execute_subprocess_async(cmd, env=self.get_env())
+
+        metrics_save_path = os.path.join(output_dir, "test_bleu.json")
+        metrics = load_json(metrics_save_path)
+        # print(metrics)
+        self.assertGreaterEqual(metrics["bleu"], 25)
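For readers who want to reproduce what this test does by hand, here is a hedged sketch of the equivalent standalone launch. The script path and save dir are assumptions (the test resolves them via self.test_file_dir and a temp dir), and it uses plain subprocess.run rather than the test suite's execute_subprocess_async:

    import subprocess
    import sys

    from transformers.testing_utils import get_gpu_count

    # one eval process per available GPU, mirroring the test above
    n_gpu = get_gpu_count()
    cmd = [
        sys.executable,
        "-m", "torch.distributed.launch",
        f"--nproc_per_node={n_gpu}",
        "run_distributed_eval.py",  # assumed to be run from the examples/seq2seq dir
        "--model_name", "Helsinki-NLP/opus-mt-en-ro",
        "--save_dir", "/tmp/dist_eval",  # illustrative; test_bleu.json is written here
        "--data_dir", "test_data/wmt_en_ro",
        "--num_beams", "2",
        "--task", "translation",
    ]
    subprocess.run(cmd, check=True)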
@@ -297,6 +297,22 @@ def require_ray(test_case):
     return test_case


+def get_gpu_count():
+    """
+    Return the number of available gpus (regardless of whether torch or tf is used)
+    """
+    if _torch_available:
+        import torch
+
+        return torch.cuda.device_count()
+    elif _tf_available:
+        import tensorflow as tf
+
+        return len(tf.config.list_physical_devices("GPU"))
+    else:
+        return 0
+
+
 def get_tests_dir(append_path=None):
     """
     Args:
...
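A quick usage sketch of the new helper: _torch_available and _tf_available are module-level flags inside testing_utils, so callers only touch the public function and get 0 when no framework can see a GPU.

    from transformers.testing_utils import get_gpu_count

    n_gpu = get_gpu_count()
    if n_gpu == 0:
        print("no GPUs visible (or neither torch nor tf is installed)")
    else:
        print(f"detected {n_gpu} GPU(s)")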