Unverified commit 8321f682, authored by Benjamin Lefaudeux, committed by GitHub
Browse files

[fix] Hopeful Circleci hangfix - teardown if raising exception (#280)

* timeout on the process join, expose a hanging process
* make sure that teardown is always called
parent fb8d9137
...@@ -144,7 +144,8 @@ def spawn_for_all_world_sizes(test_func: Callable, world_sizes: List[int] = get_ ...@@ -144,7 +144,8 @@ def spawn_for_all_world_sizes(test_func: Callable, world_sizes: List[int] = get_
for world_size in world_sizes: for world_size in world_sizes:
filename = tempfile.mkstemp()[1] filename = tempfile.mkstemp()[1]
mp.spawn(test_func, args=(world_size, filename, *args), nprocs=world_size, join=True) # type: ignore context = mp.spawn(test_func, args=(world_size, filename, *args), nprocs=world_size, join=False) # type: ignore
context.join(timeout=60.0)
def worker_process(rank: int, world_size: int, filename: str, func: Callable, args: Any, error_queue: Any) -> None: def worker_process(rank: int, world_size: int, filename: str, func: Callable, args: Any, error_queue: Any) -> None:
...@@ -159,19 +160,19 @@ def worker_process(rank: int, world_size: int, filename: str, func: Callable, ar ...@@ -159,19 +160,19 @@ def worker_process(rank: int, world_size: int, filename: str, func: Callable, ar
initialize_model_parallel(1, world_size, **kwargs) initialize_model_parallel(1, world_size, **kwargs)
try: try:
func(*args) func(*args)
teardown()
except BaseException as e: except BaseException as e:
# Make sure that the group is properly destroyed, even for tests which check for exceptions being raised
teardown()
# If the function raises 'Skipped', this indicates pytest.skip(), so # If the function raises 'Skipped', this indicates pytest.skip(), so
# forward it to parent so we can call pytest.skip() there # forward it to parent so we can call pytest.skip() there
if e.__class__.__name__ == "Skipped": if e.__class__.__name__ == "Skipped":
error_queue.put(str(e)) error_queue.put(str(e))
return return
# Make sure that the group is properly destroyed, even for tests which check for exceptions being raised
teardown()
raise e raise e
teardown()
def teardown() -> None: def teardown() -> None:
destroy_model_parallel() destroy_model_parallel()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment