test_parallel_backend.py

# -*- coding: utf-8 -*-

"""
Tests the parallel backend
"""
import faulthandler
import itertools
import multiprocessing
import os
import random
import subprocess
import sys
import textwrap
import threading
import unittest

import numpy as np

from numba import jit, vectorize, guvectorize, set_num_threads
from numba.tests.support import (temp_directory, override_config, TestCase, tag,
                                 skip_parfors_unsupported, linux_only)

import queue as t_queue
from numba.testing.main import _TIMEOUT as _RUNNER_TIMEOUT
from numba.core import config


_TEST_TIMEOUT = _RUNNER_TIMEOUT - 60.


# Check which backends are available
# TODO: Put this in a subprocess so the address space is kept clean
try:
    # Check it's a compatible TBB before loading it
    from numba.np.ufunc.parallel import _check_tbb_version_compatible
    _check_tbb_version_compatible()
    from numba.np.ufunc import tbbpool    # noqa: F401
    _HAVE_TBB_POOL = True
except ImportError:
    _HAVE_TBB_POOL = False

try:
    from numba.np.ufunc import omppool
    _HAVE_OMP_POOL = True
except ImportError:
    _HAVE_OMP_POOL = False

try:
    import scipy.linalg.cython_lapack    # noqa: F401
    _HAVE_LAPACK = True
except ImportError:
    _HAVE_LAPACK = False

# test skipping decorators
skip_no_omp = unittest.skipUnless(_HAVE_OMP_POOL, "OpenMP threadpool required")
skip_no_tbb = unittest.skipUnless(_HAVE_TBB_POOL, "TBB threadpool required")

_gnuomp = _HAVE_OMP_POOL and omppool.openmp_vendor == "GNU"
skip_unless_gnu_omp = unittest.skipUnless(_gnuomp, "GNU OpenMP only tests")

_windows = sys.platform.startswith('win')
_osx = sys.platform.startswith('darwin')
_32bit = sys.maxsize <= 2 ** 32
_parfors_unsupported = _32bit

_HAVE_OS_FORK = not _windows


# some functions to jit

def foo(n, v):
    return np.ones(n) + v


if _HAVE_LAPACK:
    def linalg(n, v):
        x = np.dot(np.ones((n, n)), np.ones((n, n)))
        return x + np.arange(n) + v
else:
    def linalg(n, v):
        # no way to trigger MKL without the lapack bindings.
        return np.arange(n) + v


def ufunc_foo(a, b):
    return a + b


def gufunc_foo(a, b, out):
    out[0] = a + b


class runnable(object):
    def __init__(self, **options):
        self._options = options


class jit_runner(runnable):

    def __call__(self):
        cfunc = jit(**self._options)(foo)
        a = 4
        b = 10
        expected = foo(a, b)
        got = cfunc(a, b)
        np.testing.assert_allclose(expected, got)


class mask_runner(object):
    def __init__(self, runner, mask, **options):
        self.runner = runner
        self.mask = mask

    def __call__(self):
        if self.mask:
            # Tests are all run in isolated subprocesses, so we
            # don't have to worry about this affecting other tests
            set_num_threads(self.mask)
        self.runner()


class linalg_runner(runnable):

    def __call__(self):
        cfunc = jit(**self._options)(linalg)
        a = 4
        b = 10
        expected = linalg(a, b)
        got = cfunc(a, b)
        np.testing.assert_allclose(expected, got)


class vectorize_runner(runnable):

    def __call__(self):
        cfunc = vectorize(['(f4, f4)'], **self._options)(ufunc_foo)
        a = b = np.random.random(10).astype(np.float32)
        expected = ufunc_foo(a, b)
        got = cfunc(a, b)
        np.testing.assert_allclose(expected, got)


class guvectorize_runner(runnable):

    def __call__(self):
        sig = ['(f4, f4, f4[:])']
        cfunc = guvectorize(sig, '(),()->()', **self._options)(gufunc_foo)
        a = b = np.random.random(10).astype(np.float32)
        expected = ufunc_foo(a, b)
        got = cfunc(a, b)
        np.testing.assert_allclose(expected, got)


def chooser(fnlist, **kwargs):
    q = kwargs.get('queue')
    try:
        faulthandler.enable()
        for _ in range(int(len(fnlist) * 1.5)):
            fn = random.choice(fnlist)
            fn()
    except Exception as e:
        q.put(e)


def compile_factory(parallel_class, queue_impl):
    def run_compile(fnlist):
        q = queue_impl()
        kws = {'queue': q}
        ths = [parallel_class(target=chooser, args=(fnlist,), kwargs=kws)
               for i in range(4)]
        for th in ths:
            th.start()
        for th in ths:
            th.join()
        if not q.empty():
            errors = []
            while not q.empty():
                errors.append(q.get(False))
            _msg = "Error(s) occurred in delegated runner:\n%s"
            raise RuntimeError(_msg % '\n'.join([repr(x) for x in errors]))
    return run_compile


# workers
_thread_class = threading.Thread


class _proc_class_impl(object):

    def __init__(self, method):
        self._method = method

    def __call__(self, *args, **kwargs):
        ctx = multiprocessing.get_context(self._method)
        return ctx.Process(*args, **kwargs)


def _get_mp_classes(method):
    if method == 'default':
        method = None
    ctx = multiprocessing.get_context(method)
    proc = _proc_class_impl(method)
    queue = ctx.Queue
    return proc, queue


thread_impl = compile_factory(_thread_class, t_queue.Queue)
spawn_proc_impl = compile_factory(*_get_mp_classes('spawn'))
if not _windows:
    fork_proc_impl = compile_factory(*_get_mp_classes('fork'))
    forkserver_proc_impl = compile_factory(*_get_mp_classes('forkserver'))

# this is duplication as Py27, linux uses fork, windows uses spawn, it however
# is kept like this so that when tests fail it's less confusing!
default_proc_impl = compile_factory(*_get_mp_classes('default'))


class TestParallelBackendBase(TestCase):
    """
    Base class for testing the parallel backends
    """

    all_impls = [
        jit_runner(nopython=True),
        jit_runner(nopython=True, cache=True),
        jit_runner(nopython=True, nogil=True),
        linalg_runner(nopython=True),
        linalg_runner(nopython=True, nogil=True),
        vectorize_runner(nopython=True),
        vectorize_runner(nopython=True, target='parallel'),
        vectorize_runner(nopython=True, target='parallel', cache=True),
        guvectorize_runner(nopython=True),
        guvectorize_runner(nopython=True, target='parallel'),
        guvectorize_runner(nopython=True, target='parallel', cache=True),
    ]

    if not _parfors_unsupported:
        parfor_impls = [
            jit_runner(nopython=True, parallel=True),
            jit_runner(nopython=True, parallel=True, cache=True),
            linalg_runner(nopython=True, parallel=True),
            linalg_runner(nopython=True, parallel=True, cache=True),
        ]
        all_impls.extend(parfor_impls)

    if config.NUMBA_NUM_THREADS < 2:
        # Not enough cores
        masks = []
    else:
        masks = [1, 2]

    mask_impls = []
    for impl in all_impls:
        for mask in masks:
            mask_impls.append(mask_runner(impl, mask))

    parallelism = ['threading', 'random']
    parallelism.append('multiprocessing_spawn')
    if _HAVE_OS_FORK:
        parallelism.append('multiprocessing_fork')
        parallelism.append('multiprocessing_forkserver')

    runners = {
        'concurrent_jit': [
            jit_runner(nopython=True, parallel=(not _parfors_unsupported)),
        ],
        'concurrent_vectorize': [
            vectorize_runner(nopython=True, target='parallel'),
        ],
        'concurrent_guvectorize': [
            guvectorize_runner(nopython=True, target='parallel'),
        ],
        'concurrent_mix_use': all_impls,
        'concurrent_mix_use_masks': mask_impls,
    }

    safe_backends = {'omp', 'tbb'}

    def run_compile(self, fnlist, parallelism='threading'):
        self._cache_dir = temp_directory(self.__class__.__name__)
        with override_config('CACHE_DIR', self._cache_dir):
            if parallelism == 'threading':
                thread_impl(fnlist)
            elif parallelism == 'multiprocessing_fork':
                fork_proc_impl(fnlist)
            elif parallelism == 'multiprocessing_forkserver':
                forkserver_proc_impl(fnlist)
            elif parallelism == 'multiprocessing_spawn':
                spawn_proc_impl(fnlist)
            elif parallelism == 'multiprocessing_default':
                default_proc_impl(fnlist)
            elif parallelism == 'random':
                ps = [thread_impl, spawn_proc_impl]
                if _HAVE_OS_FORK:
                    ps.append(fork_proc_impl)
                    ps.append(forkserver_proc_impl)

                random.shuffle(ps)
                for impl in ps:
                    impl(fnlist)
            else:
                raise ValueError(
                    'Unknown parallelism supplied %s' % parallelism)


_specific_backends = config.THREADING_LAYER in ('omp', 'tbb', 'workqueue')


@unittest.skipUnless(_specific_backends, "Threading layer not explicit")
class TestParallelBackend(TestParallelBackendBase):
    """ These are like the numba.tests.test_threadsafety tests but designed
    instead to torture the parallel backend.
    If a suitable backend is supplied via NUMBA_THREADING_LAYER these tests
    can be run directly. This test class cannot be run using the multiprocessing
    option to the test runner (i.e. `./runtests -m`) as daemon processes cannot
    have children.
    """

    # NOTE: All tests are generated based on what a platform supports concurrent
    # execution wise from Python, irrespective of whether the native libraries
    # can actually handle the behaviour present.
    @classmethod
    def generate(cls):
        for p in cls.parallelism:
            for name, impl in cls.runners.items():
                methname = "test_" + p + '_' + name

                def methgen(impl, p):
                    def test_method(self):
                        selfproc = multiprocessing.current_process()
                        # daemonized processes cannot have children
                        if selfproc.daemon:
                            _msg = 'daemonized processes cannot have children'
                            self.skipTest(_msg)
                        else:
                            self.run_compile(impl, parallelism=p)
                    return test_method
                fn = methgen(impl, p)
                fn.__name__ = methname
                setattr(cls, methname, fn)


TestParallelBackend.generate()


class TestInSubprocess(object):
    backends = {'tbb': skip_no_tbb,
                'omp': skip_no_omp,
                'workqueue': unittest.skipIf(False, '')}

    def run_cmd(self, cmdline, env):
        popen = subprocess.Popen(cmdline,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 env=env)
        # finish in _TEST_TIMEOUT seconds or kill it
        timeout = threading.Timer(_TEST_TIMEOUT, popen.kill)
        try:
            timeout.start()
            out, err = popen.communicate()
            if popen.returncode != 0:
                raise AssertionError(
                    "process failed with code %s: stderr follows\n%s\n" %
                    (popen.returncode, err.decode()))
            return out.decode(), err.decode()
        finally:
            timeout.cancel()
        return None, None

    def run_test_in_separate_process(self, test, threading_layer):
        env_copy = os.environ.copy()
        env_copy['NUMBA_THREADING_LAYER'] = str(threading_layer)
        cmdline = [sys.executable, "-m", "numba.runtests", test]
        return self.run_cmd(cmdline, env_copy)


class TestSpecificBackend(TestInSubprocess, TestParallelBackendBase):
    """
    This is quite contrived, for each test in the TestParallelBackend tests it
    generates a test that will run the TestParallelBackend test in a new python
    process with an environment modified to ensure a specific threadsafe backend
    is used. This is with view of testing the backends independently and in an
    isolated manner such that if they hang/crash/have issues, it doesn't kill
    the test suite.
    """
    _DEBUG = False

    @classmethod
    def _inject(cls, p, name, backend, backend_guard):
        themod = cls.__module__
        thecls = TestParallelBackend.__name__
        methname = "test_" + p + '_' + name
        injected_method = '%s.%s.%s' % (themod, thecls, methname)

        def test_template(self):
            o, e = self.run_test_in_separate_process(injected_method, backend)
            if self._DEBUG:
                print('stdout:\n "%s"\n stderr:\n "%s"' % (o, e))
            self.assertIn('OK', e)
            self.assertTrue('FAIL' not in e)
            self.assertTrue('ERROR' not in e)
        injected_test = "test_%s_%s_%s" % (p, name, backend)
        # Mark as long_running
        setattr(cls, injected_test,
                tag('long_running')(backend_guard(test_template)))

    @classmethod
    def generate(cls):
        for backend, backend_guard in cls.backends.items():
            for p in cls.parallelism:
                for name in cls.runners.keys():
                    # handle known problem cases...

                    # GNU OpenMP is not fork safe
                    if (p in ('multiprocessing_fork', 'random') and
                        backend == 'omp' and
                            sys.platform.startswith('linux')):
                        continue

                    # workqueue is not thread safe
                    if (p in ('threading', 'random') and
                            backend == 'workqueue'):
                        continue

                    cls._inject(p, name, backend, backend_guard)


TestSpecificBackend.generate()


class ThreadLayerTestHelper(TestCase):
    """
    Helper class for running an isolated piece of code based on a template
    """
    # sys path injection and separate usecase module to make sure everything
    # is importable by children of multiprocessing
    _here = "%r" % os.path.dirname(__file__)

    template = """if 1:
    import sys
    sys.path.insert(0, "%(here)r")
    import multiprocessing
    import numpy as np
    from numba import njit
    import numba
    try:
        import threading_backend_usecases
    except ImportError as e:
        print("DEBUG:", sys.path)
        raise e
    import os

    sigterm_handler = threading_backend_usecases.sigterm_handler
    busy_func = threading_backend_usecases.busy_func

    def the_test():
        %%s

    if __name__ == "__main__":
        the_test()
    """ % {'here': _here}

    def run_cmd(self, cmdline, env=None):
        if env is None:
            env = os.environ.copy()
            env['NUMBA_THREADING_LAYER'] = str("omp")
        popen = subprocess.Popen(cmdline,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 env=env)
        # finish in _TEST_TIMEOUT seconds or kill it
        timeout = threading.Timer(_TEST_TIMEOUT, popen.kill)
        try:
            timeout.start()
            out, err = popen.communicate()
            if popen.returncode != 0:
                raise AssertionError(
                    "process failed with code %s: stderr follows\n%s\n" %
                    (popen.returncode, err.decode()))
        finally:
            timeout.cancel()
        return out.decode(), err.decode()


@skip_parfors_unsupported
class TestThreadingLayerSelection(ThreadLayerTestHelper):
    """
    Checks that numba.threading_layer() reports correctly.
    """
    _DEBUG = False

    backends = {'tbb': skip_no_tbb,
                'omp': skip_no_omp,
                'workqueue': unittest.skipIf(False, '')}

    @classmethod
    def _inject(cls, backend, backend_guard):

        def test_template(self):
            body = """if 1:
                X = np.arange(1000000.)
                Y = np.arange(1000000.)
                Z = busy_func(X, Y)
                assert numba.threading_layer() == '%s'
            """
            runme = self.template % (body % backend)
            cmdline = [sys.executable, '-c', runme]
            env = os.environ.copy()
            env['NUMBA_THREADING_LAYER'] = str(backend)
            out, err = self.run_cmd(cmdline, env=env)
            if self._DEBUG:
                print(out, err)
        injected_test = "test_threading_layer_selector_%s" % backend
        setattr(cls, injected_test,
                tag("important")(backend_guard(test_template)))

    @classmethod
    def generate(cls):
        for backend, backend_guard in cls.backends.items():
            cls._inject(backend, backend_guard)


TestThreadingLayerSelection.generate()


@skip_parfors_unsupported
class TestThreadingLayerPriority(ThreadLayerTestHelper):

    def each_env_var(self, env_var: str):
        """Test setting priority via env var NUMBA_THREADING_LAYER_PRIORITY.
        """
        env = os.environ.copy()
        env['NUMBA_THREADING_LAYER'] = 'default'
        env['NUMBA_THREADING_LAYER_PRIORITY'] = env_var

        code = f"""
                import numba

                # trigger threading layer decision
                # hence catching invalid THREADING_LAYER_PRIORITY
                @numba.jit(
                    'float64[::1](float64[::1], float64[::1])',
                    nopython=True,
                    parallel=True,
                )
                def plus(x, y):
                    return x + y

                captured_envvar = list("{env_var}".split())
                assert numba.config.THREADING_LAYER_PRIORITY == \
                    captured_envvar, "priority mismatch"
                assert numba.threading_layer() == captured_envvar[0],\
                    "selected backend mismatch"
                """
        cmd = [
            sys.executable,
            '-c',
            textwrap.dedent(code),
        ]
        self.run_cmd(cmd, env=env)

    @skip_no_omp
    @skip_no_tbb
    def test_valid_env_var(self):
        default = ['tbb', 'omp', 'workqueue']
        for p in itertools.permutations(default):
            env_var = ' '.join(p)
            self.each_env_var(env_var)

    @skip_no_omp
    @skip_no_tbb
    def test_invalid_env_var(self):
        env_var = 'tbb omp workqueue notvalidhere'
        with self.assertRaises(AssertionError) as raises:
            self.each_env_var(env_var)
        for msg in (
            "THREADING_LAYER_PRIORITY invalid:",
            "It must be a permutation of"
        ):
            self.assertIn(f"{msg}", str(raises.exception))

    @skip_no_omp
    def test_omp(self):
        for env_var in ("omp tbb workqueue", "omp workqueue tbb"):
            self.each_env_var(env_var)

    @skip_no_tbb
    def test_tbb(self):
        for env_var in ("tbb omp workqueue", "tbb workqueue omp"):
            self.each_env_var(env_var)

    def test_workqueue(self):
        for env_var in ("workqueue tbb omp", "workqueue omp tbb"):
            self.each_env_var(env_var)


@skip_parfors_unsupported
class TestMiscBackendIssues(ThreadLayerTestHelper):
    """
    Checks fixes for the issues with threading backends implementation
    """
    _DEBUG = False

    @skip_no_omp
    def test_omp_stack_overflow(self):
        """
        Tests that OMP does not overflow stack
        """
        runme = """if 1:
            from numba import vectorize, threading_layer
            import numpy as np

            @vectorize(['f4(f4,f4,f4,f4,f4,f4,f4,f4)'], target='parallel')
            def foo(a, b, c, d, e, f, g, h):
                return a+b+c+d+e+f+g+h

            x = np.ones(2**20, np.float32)
            foo(*([x]*8))
            assert threading_layer() == "omp", "omp not found"
        """
        cmdline = [sys.executable, '-c', runme]
        env = os.environ.copy()
        env['NUMBA_THREADING_LAYER'] = "omp"
        env['OMP_STACKSIZE'] = "100K"
        self.run_cmd(cmdline, env=env)

    @skip_no_tbb
    def test_single_thread_tbb(self):
        """
        Tests that TBB works well with single thread
        https://github.com/numba/numba/issues/3440
        """
        runme = """if 1:
            from numba import njit, prange, threading_layer

            @njit(parallel=True)
            def foo(n):
                acc = 0
                for i in prange(n):
                    acc += i
                return acc

            foo(100)
            assert threading_layer() == "tbb", "tbb not found"
        """
        cmdline = [sys.executable, '-c', runme]
        env = os.environ.copy()
        env['NUMBA_THREADING_LAYER'] = "tbb"
        env['NUMBA_NUM_THREADS'] = "1"
        self.run_cmd(cmdline, env=env)

    def test_workqueue_aborts_on_nested_parallelism(self):
        """
        Tests workqueue raises sigabrt if a nested parallel call is performed
        """
        runme = """if 1:
            from numba import njit, prange
            import numpy as np

            @njit(parallel=True)
            def nested(x):
                for i in prange(len(x)):
                    x[i] += 1


            @njit(parallel=True)
            def main():
                Z = np.zeros((5, 10))
                for i in prange(Z.shape[0]):
                    nested(Z[i])
                return Z

            main()
        """
        cmdline = [sys.executable, '-c', runme]
        env = os.environ.copy()
        env['NUMBA_THREADING_LAYER'] = "workqueue"
        env['NUMBA_NUM_THREADS'] = "4"

        try:
            out, err = self.run_cmd(cmdline, env=env)
        except AssertionError as e:
            if self._DEBUG:
                print(out, err)
            e_msg = str(e)
            self.assertIn("failed with code", e_msg)
            # raised a SIGABRT, but the value is platform specific so just check
            # the error message
            expected = ("Numba workqueue threading layer is terminating: "
                        "Concurrent access has been detected.")
            self.assertIn(expected, e_msg)

    @unittest.skipUnless(_HAVE_OS_FORK, "Test needs fork(2)")
    def test_workqueue_handles_fork_from_non_main_thread(self):
        # For context see #7872, but essentially the multiprocessing pool
        # implementation has a number of Python threads for handling the worker
        # processes, one of which calls fork(2), this results in a fork from a
        # non-main thread.

        runme = """if 1:
            from numba import njit, prange, threading_layer
            import numpy as np
            import multiprocessing

            if __name__ == "__main__":
                # Need for force fork context (OSX default is "spawn")
                multiprocessing.set_start_method('fork')

                @njit(parallel=True)
                def func(x):
                    return 10. * x

                arr = np.arange(2.)

                # run in single process to start Numba's thread pool
                np.testing.assert_allclose(func(arr), func.py_func(arr))

                # now run in a multiprocessing pool to get a fork from a
                # non-main thread
                with multiprocessing.Pool(10) as p:
                    result = p.map(func, [arr])
                np.testing.assert_allclose(result,
                                           func.py_func(np.expand_dims(arr, 0)))

                assert threading_layer() == "workqueue"
        """
        cmdline = [sys.executable, '-c', runme]
        env = os.environ.copy()
        env['NUMBA_THREADING_LAYER'] = "workqueue"
        env['NUMBA_NUM_THREADS'] = "4"

        self.run_cmd(cmdline, env=env)


# 32bit or windows py27 (not that this runs on windows)
@skip_parfors_unsupported
@skip_unless_gnu_omp
class TestForkSafetyIssues(ThreadLayerTestHelper):
    """
    Checks Numba's behaviour in various situations involving GNU OpenMP and fork
    """
    _DEBUG = False

    def test_check_threading_layer_is_gnu(self):
        runme = """if 1:
            from numba.np.ufunc import omppool
            assert omppool.openmp_vendor == 'GNU'
            """
        cmdline = [sys.executable, '-c', runme]
        out, err = self.run_cmd(cmdline)

    def test_par_parent_os_fork_par_child(self):
        """
        Whilst normally valid, this actually isn't for Numba invariant of OpenMP
        Checks SIGABRT is received.
        """
        body = """if 1:
            X = np.arange(1000000.)
            Y = np.arange(1000000.)
            Z = busy_func(X, Y)
            pid = os.fork()
            if pid  == 0:
                Z = busy_func(X, Y)
            else:
                os.wait()
        """
        runme = self.template % body
        cmdline = [sys.executable, '-c', runme]
        try:
            out, err = self.run_cmd(cmdline)
        except AssertionError as e:
            self.assertIn("failed with code -6", str(e))

    def test_par_parent_implicit_mp_fork_par_child(self):
        """
        Implicit use of multiprocessing fork context.
        Does this:
        1. Start with OpenMP
        2. Fork to processes using OpenMP (this is invalid)
        3. Joins fork
        4. Check the exception pushed onto the queue that is a result of
           catching SIGTERM coming from the C++ aborting on illegal fork
           pattern for GNU OpenMP
        """
        body = """if 1:
            mp = multiprocessing.get_context('fork')
            X = np.arange(1000000.)
            Y = np.arange(1000000.)
            q = mp.Queue()

            # Start OpenMP runtime on parent via parallel function
            Z = busy_func(X, Y, q)

            # fork() underneath with no exec, will abort
            proc = mp.Process(target = busy_func, args=(X, Y, q))
            proc.start()

            err = q.get()
            assert "Caught SIGTERM" in str(err)
        """
        runme = self.template % body
        cmdline = [sys.executable, '-c', runme]
        out, err = self.run_cmd(cmdline)
        if self._DEBUG:
            print(out, err)

    @linux_only
    def test_par_parent_explicit_mp_fork_par_child(self):
        """
        Explicit use of multiprocessing fork context.
        Does this:
        1. Start with OpenMP
        2. Fork to processes using OpenMP (this is invalid)
        3. Joins fork
        4. Check the exception pushed onto the queue that is a result of
           catching SIGTERM coming from the C++ aborting on illegal fork
           pattern for GNU OpenMP
        """
        body = """if 1:
            X = np.arange(1000000.)
            Y = np.arange(1000000.)
            ctx = multiprocessing.get_context('fork')
            q = ctx.Queue()

            # Start OpenMP runtime on parent via parallel function
            Z = busy_func(X, Y, q)

            # fork() underneath with no exec, will abort
            proc = ctx.Process(target = busy_func, args=(X, Y, q))
            proc.start()
            proc.join()

            err = q.get()
            assert "Caught SIGTERM" in str(err)
        """
        runme = self.template % body
        cmdline = [sys.executable, '-c', runme]
        out, err = self.run_cmd(cmdline)
        if self._DEBUG:
            print(out, err)

    def test_par_parent_mp_spawn_par_child_par_parent(self):
        """
        Explicit use of multiprocessing spawn, this is safe.
        Does this:
        1. Start with OpenMP
        2. Spawn to processes using OpenMP
        3. Join spawns
        4. Run some more OpenMP
        """
        body = """if 1:
            X = np.arange(1000000.)
            Y = np.arange(1000000.)
            ctx = multiprocessing.get_context('spawn')
            q = ctx.Queue()

            # Start OpenMP runtime and run on parent via parallel function
            Z = busy_func(X, Y, q)
            procs = []
            for x in range(20): # start a lot to try and get overlap
                ## fork() + exec() to run some OpenMP on children
                proc = ctx.Process(target = busy_func, args=(X, Y, q))
                procs.append(proc)
                sys.stdout.flush()
                sys.stderr.flush()
                proc.start()

            [p.join() for p in procs]

            try:
                q.get(False)
            except multiprocessing.queues.Empty:
                pass
            else:
                raise RuntimeError("Queue was not empty")

            # Run some more OpenMP on parent
            Z = busy_func(X, Y, q)
        """
        runme = self.template % body
        cmdline = [sys.executable, '-c', runme]
        out, err = self.run_cmd(cmdline)
        if self._DEBUG:
            print(out, err)

    def test_serial_parent_implicit_mp_fork_par_child_then_par_parent(self):
        """
        Implicit use of multiprocessing (will be fork, but cannot declare that
        in Py2.7 as there's no process launch context).
        Does this:
        1. Start with no OpenMP
        2. Fork to processes using OpenMP
        3. Join forks
        4. Run some OpenMP
        """
        body = """if 1:
            X = np.arange(1000000.)
            Y = np.arange(1000000.)
            q = multiprocessing.Queue()

            # this is ok
            procs = []
            for x in range(10):
                # fork() underneath with but no OpenMP in parent, this is ok
                proc = multiprocessing.Process(target = busy_func,
                                               args=(X, Y, q))
                procs.append(proc)
                proc.start()

            [p.join() for p in procs]

            # and this is still ok as the OpenMP happened in forks
            Z = busy_func(X, Y, q)
            try:
                q.get(False)
            except multiprocessing.queues.Empty:
                pass
            else:
                raise RuntimeError("Queue was not empty")
        """
        runme = self.template % body
        cmdline = [sys.executable, '-c', runme]
        out, err = self.run_cmd(cmdline)
        if self._DEBUG:
            print(out, err)

    @linux_only
    def test_serial_parent_explicit_mp_fork_par_child_then_par_parent(self):
        """
        Explicit use of multiprocessing 'fork'.
        Does this:
        1. Start with no OpenMP
        2. Fork to processes using OpenMP
        3. Join forks
        4. Run some OpenMP
        """
        body = """if 1:
            X = np.arange(1000000.)
            Y = np.arange(1000000.)
            ctx = multiprocessing.get_context('fork')
            q = ctx.Queue()

            # this is ok
            procs = []
            for x in range(10):
                # fork() underneath with but no OpenMP in parent, this is ok
                proc = ctx.Process(target = busy_func, args=(X, Y, q))
                procs.append(proc)
                proc.start()

            [p.join() for p in procs]

            # and this is still ok as the OpenMP happened in forks
            Z = busy_func(X, Y, q)
            try:
                q.get(False)
            except multiprocessing.queues.Empty:
                pass
            else:
                raise RuntimeError("Queue was not empty")
        """
        runme = self.template % body
        cmdline = [sys.executable, '-c', runme]
        out, err = self.run_cmd(cmdline)
        if self._DEBUG:
            print(out, err)


@skip_parfors_unsupported
@skip_no_tbb
class TestTBBSpecificIssues(ThreadLayerTestHelper):

    _DEBUG = False

    @linux_only # os.fork required.
    def test_fork_from_non_main_thread(self):
        # See issue #5973 and PR #6208 for original context.
        # See issue #6963 for context on the following comments:
        #
        # Important things to note:
        # 1. Compilation of code containing an objmode block will result in the
        #    use of and `ObjModeLiftedWith` as the dispatcher. This inherits
        #    from `LiftedCode` which handles the serialization. In that
        #    serialization is a call to uuid.uuid1() which causes a fork_exec in
        #    CPython internals.
        # 2. The selected parallel backend thread pool is started during the
        #    compilation of a function that has `parallel=True`.
        # 3. The TBB backend can handle forks from the main thread, it will
        #    safely reinitialise after so doing. If a fork occurs from a
        #    non-main thread it will warn and the state is invalid in the child
        #    process.
        #
        # Due to 1. and 2. the `obj_mode_func` function separated out and is
        # `njit` decorated. This means during type inference of `work` it will
        # trigger a standard compilation of the function and the thread pools
        # won't have started yet as the parallelisation compiler passes for
        # `work` won't yet have run. This mitigates the fork() call from 1.
        # occurring after 2. The result of this is that 3. can be tested using
        # the threading etc herein with the state being known as the above
        # described, i.e. the TBB threading layer has not experienced a fork().

        runme = """if 1:
            import threading
            import numba
            numba.config.THREADING_LAYER='tbb'
            from numba import njit, prange, objmode
            from numba.core.serialize import PickleCallableByPath
            import os

            e_running = threading.Event()
            e_proceed = threading.Event()

            def indirect_core():
                e_running.set()
                # wait for forker() to have forked
                while not e_proceed.isSet():
                    pass

            indirect = PickleCallableByPath(indirect_core)

            @njit
            def obj_mode_func():
                with objmode():
                    indirect()

            @njit(parallel=True, nogil=True)
            def work():
                acc = 0
                for x in prange(10):
                    acc += x
                obj_mode_func()
                return acc

            def runner():
                work()

            def forker():
                # wait for the jit function to say it's running
                while not e_running.isSet():
                    pass
                # then fork
                os.fork()
                # now fork is done signal the runner to proceed to exit
                e_proceed.set()

            numba_runner = threading.Thread(target=runner,)
            fork_runner =  threading.Thread(target=forker,)

            threads = (numba_runner, fork_runner)
            for t in threads:
                t.start()
            for t in threads:
                t.join()
        """

        cmdline = [sys.executable, '-c', runme]
        out, err = self.run_cmd(cmdline)
        # assert error message printed on stderr
        msg_head = "Attempted to fork from a non-main thread, the TBB library"
        self.assertIn(msg_head, err)

        if self._DEBUG:
            print("OUT:", out)
            print("ERR:", err)

    @linux_only # fork required.
    def test_lifetime_of_task_scheduler_handle(self):

        self.skip_if_no_external_compiler() # external compiler needed

        # See PR #7280 for context.
        BROKEN_COMPILERS = 'SKIP: COMPILATION FAILED'
        runme = """if 1:
            import ctypes
            import sys
            import multiprocessing as mp
            from tempfile import TemporaryDirectory, NamedTemporaryFile
            from numba.pycc.platform import Toolchain, external_compiler_works
            from numba import njit, prange, threading_layer
            import faulthandler
            faulthandler.enable()
            if not external_compiler_works():
                raise AssertionError('External compilers are not found.')
            with TemporaryDirectory() as tmpdir:
                with NamedTemporaryFile(dir=tmpdir) as tmpfile:
                    try:
                        src = \"\"\"
                        #define TBB_PREVIEW_WAITING_FOR_WORKERS 1
                        #include <tbb/tbb.h>
                        static tbb::task_scheduler_handle tsh;
                        extern "C"
                        {
                        void launch(void)
                        {
                            tsh = tbb::task_scheduler_handle::get();
                        }
                        }
                        \"\"\"
                        cxxfile = f"{tmpfile.name}.cxx"
                        with open(cxxfile, 'wt') as f:
                            f.write(src)
                        tc = Toolchain()
                        object_files = tc.compile_objects([cxxfile,],
                                                           output_dir=tmpdir)
                        dso_name = f"{tmpfile.name}.so"
                        tc.link_shared(dso_name, object_files,
                                       libraries=['tbb',],
                                       export_symbols=['launch'])
                        # Load into the process, it doesn't matter whether the
                        # DSO exists on disk once it's loaded in.
                        DLL = ctypes.CDLL(dso_name)
                    except Exception as e:
                        # Something is broken in compilation, could be one of
                        # many things including, but not limited to: missing tbb
                        # headers, incorrect permissions, compilers that don't
                        # work for the above
                        print(e)
                        print('BROKEN_COMPILERS')
                        sys.exit(0)

                    # Do the test, launch this library and also execute a
                    # function with the TBB threading layer.

                    DLL.launch()

                    @njit(parallel=True)
                    def foo(n):
                        acc = 0
                        for i in prange(n):
                            acc += i
                        return acc

                    foo(1)

            # Check the threading layer used was TBB
            assert threading_layer() == 'tbb'

            # Use mp context for a controlled version of fork, this triggers the
            # reported bug.

            ctx = mp.get_context('fork')
            def nowork():
                pass
            p = ctx.Process(target=nowork)
            p.start()
            p.join(10)
            print("SUCCESS")
            """.replace('BROKEN_COMPILERS', BROKEN_COMPILERS)

        cmdline = [sys.executable, '-c', runme]
        env = os.environ.copy()
        env['NUMBA_THREADING_LAYER'] = 'tbb'
        out, err = self.run_cmd(cmdline, env=env)

        if BROKEN_COMPILERS in out:
            self.skipTest("Compilation of DSO failed. Check output for details")
        else:
            self.assertIn("SUCCESS", out)

        if self._DEBUG:
            print("OUT:", out)
            print("ERR:", err)


@skip_parfors_unsupported
class TestInitSafetyIssues(TestCase):

    _DEBUG = False

    def run_cmd(self, cmdline):
        popen = subprocess.Popen(cmdline,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,)
        # finish in _TEST_TIMEOUT seconds or kill it
        timeout = threading.Timer(_TEST_TIMEOUT, popen.kill)
        try:
            timeout.start()
            out, err = popen.communicate()
            if popen.returncode != 0:
                raise AssertionError(
                    "process failed with code %s: stderr follows\n%s\n" %
                    (popen.returncode, err.decode()))
        finally:
            timeout.cancel()
        return out.decode(), err.decode()

    @linux_only # only linux can leak semaphores
    def test_orphaned_semaphore(self):
        # sys path injection and separate usecase module to make sure everything
        # is importable by children of multiprocessing

        test_file = os.path.join(os.path.dirname(__file__),
                                 "orphaned_semaphore_usecase.py")
        cmdline = [sys.executable, test_file]
        out, err = self.run_cmd(cmdline)

        # assert no semaphore leaks reported on stderr
        self.assertNotIn("leaked semaphore", err)

        if self._DEBUG:
            print("OUT:", out)
            print("ERR:", err)

    def test_lazy_lock_init(self):
        # checks based on https://github.com/numba/numba/pull/5724
        # looking for "lazy" process lock initialisation so as to avoid setting
        # a multiprocessing context as part of import.
        for meth in ('fork', 'spawn', 'forkserver'):
            # if a context is available on the host check it can be set as the
            # start method in a separate process
            try:
                multiprocessing.get_context(meth)
            except ValueError:
                continue
            cmd = ("import numba; import multiprocessing;"
                   "multiprocessing.set_start_method('{}');"
                   "print(multiprocessing.get_context().get_start_method())")
            cmdline = [sys.executable, "-c", cmd.format(meth)]
            out, err = self.run_cmd(cmdline)
            if self._DEBUG:
                print("OUT:", out)
                print("ERR:", err)
            self.assertIn(meth, out)


@skip_parfors_unsupported
@skip_no_omp
class TestOpenMPVendors(TestCase):

    def test_vendors(self):
        """
        Checks the OpenMP vendor strings are correct
        """
        expected = dict()
        expected['win32'] = "MS"
        expected['darwin'] = "Intel"
        expected['linux'] = "GNU"

        # only check OS that are supported, custom toolchains may well work as
        # may other OS
        for k in expected.keys():
            if sys.platform.startswith(k):
                self.assertEqual(expected[k], omppool.openmp_vendor)


if __name__ == '__main__':
    unittest.main()