import os from pathlib import Path import pytest import torch import torch.distributed from megatron.core.utils import is_te_min_version from tests.unit_tests.dist_checkpointing import TempNamedDir from tests.unit_tests.test_utilities import Utils def pytest_sessionfinish(session, exitstatus): if exitstatus == 5: session.exitstatus = 0 @pytest.fixture(scope="session", autouse=True) def cleanup(): yield if torch.distributed.is_initialized(): torch.distributed.barrier() torch.distributed.destroy_process_group() @pytest.fixture(scope="function", autouse=True) def set_env(): if is_te_min_version("1.3"): os.environ['NVTE_FLASH_ATTN'] = '0' os.environ['NVTE_FUSED_ATTN'] = '0' @pytest.fixture(scope="session") def tmp_path_dist_ckpt(tmp_path_factory) -> Path: """Common directory for saving the checkpoint. Can't use pytest `tmp_path_factory` directly because directory must be shared between processes. """ tmp_dir = tmp_path_factory.mktemp('ignored', numbered=False) tmp_dir = tmp_dir.parent.parent / 'tmp_dist_ckpt' if Utils.rank == 0: with TempNamedDir(tmp_dir, sync=False): yield tmp_dir else: yield tmp_dir