timsort.py

"""
Timsort implementation.  Mostly adapted from CPython's listobject.c.

For more information, see listsort.txt in CPython's source tree.
"""


import collections

from numba.core import types


# Record returned by make_timsort_impl(): the compile (wrapping) function,
# followed by every subroutine, in the order the fields are listed here.
TimsortImplementation = collections.namedtuple(
    'TimsortImplementation',
    # The compile function itself
    'compile '
    # All subroutines exercised by test_sort
    'count_run binarysort gallop_left gallop_right '
    'merge_init merge_append merge_pop '
    'merge_compute_minrun merge_lo merge_hi merge_at '
    'merge_force_collapse merge_collapse '
    # The top-level functions
    'run_timsort run_timsort_with_values'
)


# The maximum number of entries in a MergeState's pending-runs stack.
# This is enough to sort arrays of size up to about
#    32 * phi ** MAX_MERGE_PENDING
# where phi ~= 1.618.  85 is far more than enough: it is good for an array
# with 2**64 elements.
# NOTE: merge_init() and merge_init_with_values() preallocate the pending
# stack with exactly this many slots, and merge_append() asserts that the
# stack never grows beyond it.
MAX_MERGE_PENDING  = 85

# When we get into galloping mode, we stay there until both runs win less
# often than MIN_GALLOP consecutive times.  See listsort.txt for more info.
# This is also the initial value of a MergeState's min_gallop.
MIN_GALLOP = 7

# Upper bound on the initial size of the temp arrays.  merge_getmem() grows
# them (by doubling) when a merge needs more room.
MERGESTATE_TEMP_SIZE = 256

# A MergeState is an immutable record carrying the state of a sort:
#  - *min_gallop*: integer threshold controlling when galloping mode starts
#  - *keys*: temp area used when merging keys
#  - *values*: temp area used when merging values (the same object as
#    *keys* when there are no separate values)
#  - *pending*: stack of pending runs still to be merged
#  - *n*: number of entries of *pending* currently in use
MergeState = collections.namedtuple(
    'MergeState', 'min_gallop keys values pending n')


# A MergeRun describes one run: its first index and its number of items.
MergeRun = collections.namedtuple('MergeRun', 'start size')


def make_timsort_impl(wrap, make_temp_area):
    """
    Build a timsort implementation and return it as a TimsortImplementation
    namedtuple.

    *wrap* is a one-argument callable applied to *make_temp_area* and, as a
    decorator, to every subroutine defined in this function; whatever it
    returns is what the subroutines call and what ends up in the result.

    *make_temp_area(container, n)* is called with the keys (or values)
    container and a size, and its result is used as the temp area for
    merges.  It is indexed and passed to len(), so presumably it must
    return a container of (at least) *n* items compatible with the one
    given -- confirm against the callers.
    """

    make_temp_area = wrap(make_temp_area)
    # Integer type used for indices and sizes stored in the merge state
    intp = types.intp
    zero = intp(0)

    @wrap
    def has_values(keys, values):
        """
        Tell whether *values* is a separate container from *keys*.

        A non-keyed sort passes the very same object for both, in which case
        there are no values to move alongside the keys.
        """
        same_object = values is keys
        return not same_object

    @wrap
    def merge_init(keys):
        """
        Create the initial MergeState for sorting *keys* alone.

        The temp area holds half of the keys plus one, capped at
        MERGESTATE_TEMP_SIZE, and is used for both the keys and values slots.
        The pending stack is preallocated with MAX_MERGE_PENDING empty runs.
        """
        half_plus_one = len(keys) // 2 + 1
        scratch = make_temp_area(keys, min(half_plus_one, MERGESTATE_TEMP_SIZE))
        run_stack = [MergeRun(zero, zero)] * MAX_MERGE_PENDING
        # Same object in both slots: this is how has_values() later tells
        # that there are no separate values.
        return MergeState(intp(MIN_GALLOP), scratch, scratch, run_stack, zero)

    @wrap
    def merge_init_with_values(keys, values):
        """
        Create the initial MergeState for sorting *keys* together with
        *values*.

        Two temp areas of the same size are allocated (one per container);
        the size is half of the keys plus one, capped at
        MERGESTATE_TEMP_SIZE.  The pending stack is preallocated with
        MAX_MERGE_PENDING empty runs.
        """
        scratch_len = min(len(keys) // 2 + 1, MERGESTATE_TEMP_SIZE)
        scratch_keys = make_temp_area(keys, scratch_len)
        scratch_values = make_temp_area(values, scratch_len)
        run_stack = [MergeRun(zero, zero)] * MAX_MERGE_PENDING
        return MergeState(intp(MIN_GALLOP), scratch_keys, scratch_values,
                          run_stack, zero)

    @wrap
    def merge_append(ms, run):
        """
        Push *run* on top of the pending-runs stack.

        The stack list is updated in place; a new MergeState with the
        incremented stack length is returned.
        """
        top = ms.n
        assert top < MAX_MERGE_PENDING
        stack = ms.pending
        stack[top] = run
        return MergeState(ms.min_gallop, ms.keys, ms.values, stack, top + 1)

    @wrap
    def merge_pop(ms):
        """
        Drop the topmost run from the pending-runs stack.

        Only the stack length changes: the slot itself is left as is.  A new
        MergeState is returned.
        """
        min_gallop, temp_keys, temp_values, stack, depth = ms
        return MergeState(min_gallop, temp_keys, temp_values, stack, depth - 1)

    @wrap
    def merge_getmem(ms, need):
        """
        Make sure the temp areas can hold *need* items.

        If the current temp keys area is already large enough, *ms* is
        returned as is.  Otherwise new temp areas are allocated, their size
        being the current size doubled until it reaches *need*, and a new
        MergeState referring to them is returned.  The old contents are not
        carried over.
        """
        capacity = len(ms.keys)
        if capacity >= need:
            return ms
        # Grow geometrically rather than to the exact size requested
        while capacity < need:
            capacity <<= 1
        # Fresh areas instead of a resize: nobody cares about what the old
        # ones contained, so copying it would be wasted work.
        new_keys = make_temp_area(ms.keys, capacity)
        if not has_values(ms.keys, ms.values):
            # Non-keyed sort: keep sharing a single area
            new_values = new_keys
        else:
            new_values = make_temp_area(ms.values, capacity)
        return MergeState(ms.min_gallop, new_keys, new_values, ms.pending, ms.n)

    @wrap
    def merge_adjust_gallop(ms, new_gallop):
        """
        Return a copy of *ms* whose min_gallop is *new_gallop* (converted
        to intp); all the other members are carried over unchanged.
        """
        gallop = intp(new_gallop)
        return MergeState(gallop, ms.keys, ms.values, ms.pending, ms.n)


    @wrap
    def LT(a, b):
        """
        Return whether key *a* sorts strictly before key *b*.

        Every key comparison made by the sort goes through this function,
        which makes the comparison sites easy to spot.
        """
        is_less = a < b
        return is_less

    @wrap
    def binarysort(keys, values, lo, hi, start):
        """
        Sort the slice [lo, hi) in place by binary insertion.

        This is the method of choice for small slices: it makes few
        comparisons, although the amount of data movement can be quadratic
        in the number of elements.  The sort is stable.

        On entry, lo <= start <= hi must hold and [lo, start) must already
        be sorted (pass start == lo if nothing is known).  When *values* is
        a separate container, its items are moved along with the keys.
        """
        assert start >= lo and start <= hi
        keyed = has_values(keys, values)
        # A one-element prefix is always sorted
        first_unsorted = start
        if first_unsorted == lo:
            first_unsorted += 1
        for idx in range(first_unsorted, hi):
            pivot = keys[idx]
            # Bisect [lo, idx) for the insertion point of `pivot`.
            # NOTE: bisecting only beats a linear scan when comparing is
            # much more expensive than moving data.
            # Invariants:
            #   pivot >= everything in [lo, left)
            #   pivot  < everything in [right, idx)
            # (the second one holds vacuously at first).
            left = lo
            right = idx
            while left < right:
                mid = left + ((right - left) >> 1)
                if LT(pivot, keys[mid]):
                    right = mid
                else:
                    left = mid + 1

            # Now left == right, so `pivot` belongs at `left`.  Items equal
            # to `pivot` all end up on its left, which is what makes the
            # sort stable.
            # Shift [left, idx) one slot to the right (aka memmove()).
            pos = idx
            while pos > left:
                keys[pos] = keys[pos - 1]
                pos -= 1
            keys[left] = pivot
            if keyed:
                # Apply the very same move to the values
                moved = values[idx]
                pos = idx
                while pos > left:
                    values[pos] = values[pos - 1]
                    pos -= 1
                values[left] = moved


    @wrap
    def count_run(keys, lo, hi):
        """
        Measure the run starting at index lo within the slice [lo, hi).
        lo < hi is required on entry.

        A run is either the longest non-descending sequence

            keys[lo] <= keys[lo + 1] <= keys[lo + 2] <= ...

        or the longest strictly descending sequence

            keys[lo] > keys[lo + 1] > keys[lo + 2] > ...

        A tuple (length, descending) is returned, *descending* being False
        in the first case and True in the second one.

        "Descending" is strict on purpose: the caller reverses such a run
        in place, and that keeps the sort stable only if the run holds no
        equal elements.
        """
        assert lo < hi
        if lo + 1 == hi:
            # A single element is a (trivial) ascending run
            return 1, False
        end = lo + 2
        if LT(keys[lo + 1], keys[lo]):
            # Strictly descending: go on while each key is below the previous
            while end < hi and LT(keys[end], keys[end - 1]):
                end += 1
            return end - lo, True
        # Non-descending: go on until a key is below the previous one
        while end < hi and not LT(keys[end], keys[end - 1]):
            end += 1
        return end - lo, False


    @wrap
    def gallop_left(key, a, start, stop, hint):
        """
        Locate the proper position of key in the sorted slice a[start:stop];
        if the slice contains an element equal to key, return the position
        immediately to the left of the leftmost equal element.
        [gallop_right() does the same except returns the position to the
        right of the rightmost equal element (if any).]

        a[start:stop] must be sorted and non-empty (stop > start).

        "hint" is an index at which to begin the search, start <= hint < stop.
        The closer hint is to the final result, the faster this runs.

        The return value is the int k in start..stop such that

            a[k-1] < key <= a[k]

        pretending that a[start-1] is minus infinity and a[stop] is plus infinity.
        IOW, key belongs at index k; or, IOW, the k - start elements of
        a[start:k] should precede key, and the stop - k elements of a[k:stop]
        should follow key.

        See listsort.txt for info on the method.
        """
        assert stop > start
        assert hint >= start and hint < stop

        # First, gallop from the hint to find a "good" subinterval for bisecting
        lastofs = 0
        ofs = 1
        if LT(a[hint], key):
            # a[hint] < key => gallop right, until
            #                  a[hint + lastofs] < key <= a[hint + ofs]
            maxofs = stop - hint
            while ofs < maxofs:
                if LT(a[hint + ofs], key):
                    lastofs = ofs
                    ofs = (ofs << 1) + 1
                    if ofs <= 0:
                        # Int overflow (only possible with fixed-width
                        # integers; Python ints never wrap around)
                        ofs = maxofs
                else:
                    # key <= a[hint + ofs]
                    break
            if ofs > maxofs:
                ofs = maxofs
            # Translate back to absolute indices into a
            lastofs += hint
            ofs += hint
        else:
            # key <= a[hint] => gallop left, until
            #                   a[hint - ofs] < key <= a[hint - lastofs]
            maxofs = hint - start + 1
            while ofs < maxofs:
                if LT(a[hint - ofs], key):
                    break
                else:
                    # key <= a[hint - ofs]
                    lastofs = ofs
                    ofs = (ofs << 1) + 1
                    if ofs <= 0:
                        # Int overflow (see above)
                        ofs = maxofs
            if ofs > maxofs:
                ofs = maxofs
            # Translate back to absolute indices into a; the offsets were
            # counted leftwards, so the two bounds also swap roles.
            lastofs, ofs = hint - ofs, hint - lastofs

        assert start - 1 <= lastofs and lastofs < ofs and ofs <= stop
        # Now a[lastofs] < key <= a[ofs], so key belongs somewhere to the
        # right of lastofs but no farther right than ofs.  Do a binary
        # search, with invariant a[lastofs-1] < key <= a[ofs].
        lastofs += 1
        while lastofs < ofs:
            m = lastofs + ((ofs - lastofs) >> 1)
            if LT(a[m], key):
                # a[m] < key
                lastofs = m + 1
            else:
                # key <= a[m]
                ofs = m
        # Now lastofs == ofs, so a[ofs - 1] < key <= a[ofs]
        return ofs


    @wrap
    def gallop_right(key, a, start, stop, hint):
        """
        Exactly like gallop_left(), except that if key already exists in a[start:stop],
        finds the position immediately to the right of the rightmost equal value.

        a[start:stop] must be sorted and non-empty (stop > start), and
        "hint" is the index at which to begin the search,
        start <= hint < stop.

        The return value is the int k in start..stop such that

            a[k-1] <= key < a[k]

        pretending that a[start-1] is minus infinity and a[stop] is plus
        infinity.

        The code duplication is massive, but this is enough different given that
        we're sticking to "<" comparisons that it's much harder to follow if
        written as one routine with yet another "left or right?" flag.
        """
        assert stop > start
        assert hint >= start and hint < stop

        # First, gallop from the hint to find a "good" subinterval for bisecting
        lastofs = 0
        ofs = 1
        if LT(key, a[hint]):
            # key < a[hint] => gallop left, until
            #                  a[hint - ofs] <= key < a[hint - lastofs]
            maxofs = hint - start + 1
            while ofs < maxofs:
                if LT(key, a[hint - ofs]):
                    lastofs = ofs
                    ofs = (ofs << 1) + 1
                    if ofs <= 0:
                        # Int overflow (only possible with fixed-width
                        # integers; Python ints never wrap around)
                        ofs = maxofs
                else:
                    # a[hint - ofs] <= key
                    break
            if ofs > maxofs:
                ofs = maxofs
            # Translate back to absolute indices into a; the offsets were
            # counted leftwards, so the two bounds also swap roles.
            lastofs, ofs = hint - ofs, hint - lastofs
        else:
            # a[hint] <= key -- gallop right, until
            # a[hint + lastofs] <= key < a[hint + ofs]
            maxofs = stop - hint
            while ofs < maxofs:
                if LT(key, a[hint + ofs]):
                    break
                else:
                    # a[hint + ofs] <= key
                    lastofs = ofs
                    ofs = (ofs << 1) + 1
                    if ofs <= 0:
                        # Int overflow (see above)
                        ofs = maxofs
            if ofs > maxofs:
                ofs = maxofs
            # Translate back to absolute indices into a
            lastofs += hint
            ofs += hint

        assert start - 1 <= lastofs and lastofs < ofs and ofs <= stop
        # Now a[lastofs] <= key < a[ofs], so key belongs somewhere to the
        # right of lastofs but no farther right than ofs.  Do a binary
        # search, with invariant a[lastofs-1] <= key < a[ofs].
        lastofs += 1
        while lastofs < ofs:
            m = lastofs + ((ofs - lastofs) >> 1)
            if LT(key, a[m]):
                # key < a[m]
                ofs = m
            else:
                # a[m] <= key
                lastofs = m + 1
        # Now lastofs == ofs, so a[ofs - 1] <= key < a[ofs]
        return ofs


    @wrap
    def merge_compute_minrun(n):
        """
        Compute the minimum run length to use for sorting *n* items;
        natural runs shorter than this get extended by binary insertion.

        - If n < 64, n itself is returned (too small for anything fancy).
        - If n is an exact power of 2, 32 is returned.
        - Otherwise the result is an int k, 32 <= k <= 64, such that n/k is
          close to, but strictly less than, an exact power of 2.

        See listsort.txt for more info.
        """
        assert n >= 0
        # Keep the top bits of n (a value below 64), and add 1 if any of
        # the bits shifted out was set.
        dropped_bit = 0
        top_bits = n
        while top_bits >= 64:
            dropped_bit = dropped_bit | (top_bits & 1)
            top_bits = top_bits >> 1
        return top_bits + dropped_bit


    @wrap
    def sortslice_copy(dest_keys, dest_values, dest_start,
                       src_keys, src_values, src_start,
                       nitems):
        """
        Copy *nitems* keys (and values, if separate from the keys) from
        index *src_start* of the source to index *dest_start* of the
        destination, one item at a time and by increasing index.

        This is an upwards memcpy(): when both sides are the same buffer,
        it is only safe if the destination starts below the source.
        """
        assert src_start >= 0
        assert dest_start >= 0
        shift = dest_start - src_start
        src_stop = src_start + nitems
        for src in range(src_start, src_stop):
            dest_keys[src + shift] = src_keys[src]
        if has_values(src_keys, src_values):
            for src in range(src_start, src_stop):
                dest_values[src + shift] = src_values[src]

    @wrap
    def sortslice_copy_down(dest_keys, dest_values, dest_start,
                            src_keys, src_values, src_start,
                            nitems):
        """
        Copy *nitems* keys (and values, if separate from the keys), going
        downwards: the first item copied goes from index *src_start* to
        index *dest_start*, the next one from *src_start* - 1 to
        *dest_start* - 1, and so on.

        This is a downwards memcpy(): when both sides are the same buffer,
        it is only safe if the destination ends above the source.
        """
        assert src_start >= 0
        assert dest_start >= 0
        shift = dest_start - src_start
        src_stop = src_start - nitems
        for src in range(src_start, src_stop, -1):
            dest_keys[src + shift] = src_keys[src]
        if has_values(src_keys, src_values):
            for src in range(src_start, src_stop, -1):
                dest_values[src + shift] = src_values[src]


    # Set this to 0 to disable galloping mode in merge_lo() and merge_hi()
    # (for debugging or performance comparison); merges then only advance
    # one element at a time.
    DO_GALLOP = 1

    @wrap
    def merge_lo(ms, keys, values, ssa, na, ssb, nb):
        """
        Merge the na elements starting at ssa with the nb elements starting at
        ssb = ssa + na in a stable way, in-place.  na and nb must be > 0,
        and should have na <= nb. See listsort.txt for more info.

        Run A (the left one) is first copied into the MergeState's temp area,
        which is grown if needed; the merge then fills keys (and values) from
        left to right, starting at the original ssa.

        Arguments:
        - *ms*: the current MergeState
        - *keys*, *values*: the containers being sorted (the same object
          when there are no separate values)
        - *ssa*, *na*: start index and length of run A
        - *ssb*, *nb*: start index and length of run B

        An updated MergeState is returned (with possibly a different min_gallop
        or larger temp arrays).

        NOTE: compared to CPython's timsort, the requirement that
            "Must also have that keys[ssa + na - 1] belongs at the end of the merge"

        is removed. This makes the code a bit simpler and easier to reason about.
        """
        assert na > 0 and nb > 0 and na <= nb
        assert ssb == ssa + na
        # First copy [ssa, ssa + na) into the temp space
        ms = merge_getmem(ms, na)
        sortslice_copy(ms.keys, ms.values, 0,
                       keys, values, ssa,
                       na)
        # From here on, run A is read from the temp area (indexed from 0)
        # while run B is still read from its original place.
        a_keys = ms.keys
        a_values = ms.values
        b_keys = keys
        b_values = values
        dest = ssa
        ssa = 0

        # The temp areas are distinct objects only when there are separate
        # values (see merge_init() and merge_getmem()).
        _has_values = has_values(a_keys, a_values)
        min_gallop = ms.min_gallop

        # Now start merging into the space freed at [dest, ...), i.e. where
        # run A used to be.

        while nb > 0 and na > 0:
            # Do the straightforward thing until (if ever) one run
            # appears to win consistently.
            acount = 0
            bcount = 0

            while True:
                # B's element is taken only if strictly smaller: on ties A
                # (the left run) goes first, which keeps the merge stable.
                if LT(b_keys[ssb], a_keys[ssa]):
                    keys[dest] = b_keys[ssb]
                    if _has_values:
                        values[dest] = b_values[ssb]
                    dest += 1
                    ssb += 1
                    nb -= 1
                    if nb == 0:
                        break
                    # It's a B run
                    bcount += 1
                    acount = 0
                    if bcount >= min_gallop:
                        break
                else:
                    keys[dest] = a_keys[ssa]
                    if _has_values:
                        values[dest] = a_values[ssa]
                    dest += 1
                    ssa += 1
                    na -= 1
                    if na == 0:
                        break
                    # It's a A run
                    acount += 1
                    bcount = 0
                    if acount >= min_gallop:
                        break

            # One run is winning so consistently that galloping may
            # be a huge win.  So try that, and continue galloping until
            # (if ever) neither run appears to be winning consistently
            # anymore.
            if DO_GALLOP and na > 0 and nb > 0:
                min_gallop += 1

                # NOTE(review): this loop is guarded by the constant
                # MIN_GALLOP, not by the adaptive min_gallop.  If the linear
                # phase above stopped with a count below MIN_GALLOP, no
                # galloping happens in this round, yet min_gallop is still
                # incremented twice.
                while acount >= MIN_GALLOP or bcount >= MIN_GALLOP:
                    # As long as we gallop without leaving this loop, make
                    # the heuristic more likely
                    # (the comparison yields 0 or 1, so min_gallop never
                    # goes below 1)
                    min_gallop -= min_gallop > 1

                    # Gallop in A to find where keys[ssb] should end up
                    k = gallop_right(b_keys[ssb], a_keys, ssa, ssa + na, ssa)
                    # k is an index, make it a size
                    k -= ssa
                    acount = k
                    if k > 0:
                        # Copy everything from A before k
                        sortslice_copy(keys, values, dest,
                                       a_keys, a_values, ssa,
                                       k)
                        dest += k
                        ssa += k
                        na -= k
                        if na == 0:
                            # Finished merging
                            break
                    # Copy keys[ssb]
                    keys[dest] = b_keys[ssb]
                    if _has_values:
                        values[dest] = b_values[ssb]
                    dest += 1
                    ssb += 1
                    nb -= 1
                    if nb == 0:
                        # Finished merging
                        break

                    # Gallop in B to find where keys[ssa] should end up
                    k = gallop_left(a_keys[ssa], b_keys, ssb, ssb + nb, ssb)
                    # k is an index, make it a size
                    k -= ssb
                    bcount = k
                    if k > 0:
                        # Copy everything from B before k
                        # NOTE: source and dest are the same buffer, but the
                        # destination index is below the source index
                        sortslice_copy(keys, values, dest,
                                       b_keys, b_values, ssb,
                                       k)
                        dest += k
                        ssb += k
                        nb -= k
                        if nb == 0:
                            # Finished merging
                            break
                    # Copy keys[ssa]
                    keys[dest] = a_keys[ssa]
                    if _has_values:
                        values[dest] = a_values[ssa]
                    dest += 1
                    ssa += 1
                    na -= 1
                    if na == 0:
                        # Finished merging
                        break

                # Penalize it for leaving galloping mode
                min_gallop += 1

        # Merge finished, now handle the remaining areas
        if nb == 0:
            # Only A remaining to copy at the end of the destination area
            sortslice_copy(keys, values, dest,
                           a_keys, a_values, ssa,
                           na)
        else:
            assert na == 0
            assert dest == ssb
            # B's tail is already at the right place, do nothing

        # Store the adapted threshold back into the (immutable) merge state
        return merge_adjust_gallop(ms, min_gallop)


    @wrap
    def merge_hi(ms, keys, values, ssa, na, ssb, nb):
        """
        Merge the na elements starting at ssa with the nb elements starting at
        ssb = ssa + na in a stable way, in-place.  na and nb must be > 0,
        and should have na >= nb.  See listsort.txt for more info.

        Run B (the right one) is first copied into the MergeState's temp
        area, which is grown if needed; the merge then fills keys (and
        values) from right to left, starting at the last index of run B.

        Arguments:
        - *ms*: the current MergeState
        - *keys*, *values*: the containers being sorted (the same object
          when there are no separate values)
        - *ssa*, *na*: start index and length of run A
        - *ssb*, *nb*: start index and length of run B

        An updated MergeState is returned (with possibly a different min_gallop
        or larger temp arrays).

        NOTE: compared to CPython's timsort, the requirement that
            "Must also have that keys[ssa + na - 1] belongs at the end of the merge"

        is removed. This makes the code a bit simpler and easier to reason about.
        """
        assert na > 0 and nb > 0 and na >= nb
        assert ssb == ssa + na
        # First copy [ssb, ssb + nb) into the temp space
        ms = merge_getmem(ms, nb)
        sortslice_copy(ms.keys, ms.values, 0,
                       keys, values, ssb,
                       nb)
        # From here on, run B is read from the temp area (indexed from 0)
        # while run A is still read from its original place.
        a_keys = keys
        a_values = values
        b_keys = ms.keys
        b_values = ms.values

        # Now start merging *in descending order* into the space left
        # from [..., ssb + nb).
        # ssa and ssb now designate the *last* remaining element of each run.
        dest = ssb + nb - 1
        ssb = nb - 1
        ssa = ssa + na - 1

        # The temp areas are distinct objects only when there are separate
        # values (see merge_init() and merge_getmem()).
        _has_values = has_values(b_keys, b_values)
        min_gallop = ms.min_gallop

        while nb > 0 and na > 0:
            # Do the straightforward thing until (if ever) one run
            # appears to win consistently.
            acount = 0
            bcount = 0

            while True:
                # A's element is taken only if strictly larger: on ties B
                # (the right run) is placed last, which keeps the merge stable.
                if LT(b_keys[ssb], a_keys[ssa]):
                    # We merge in descending order, so copy the larger value
                    keys[dest] = a_keys[ssa]
                    if _has_values:
                        values[dest] = a_values[ssa]
                    dest -= 1
                    ssa -= 1
                    na -= 1
                    if na == 0:
                        break
                    # It's a A run
                    acount += 1
                    bcount = 0
                    if acount >= min_gallop:
                        break
                else:
                    keys[dest] = b_keys[ssb]
                    if _has_values:
                        values[dest] = b_values[ssb]
                    dest -= 1
                    ssb -= 1
                    nb -= 1
                    if nb == 0:
                        break
                    # It's a B run
                    bcount += 1
                    acount = 0
                    if bcount >= min_gallop:
                        break

            # One run is winning so consistently that galloping may
            # be a huge win.  So try that, and continue galloping until
            # (if ever) neither run appears to be winning consistently
            # anymore.
            if DO_GALLOP and na > 0 and nb > 0:
                min_gallop += 1

                # NOTE(review): this loop is guarded by the constant
                # MIN_GALLOP, not by the adaptive min_gallop.  If the linear
                # phase above stopped with a count below MIN_GALLOP, no
                # galloping happens in this round, yet min_gallop is still
                # incremented twice.
                while acount >= MIN_GALLOP or bcount >= MIN_GALLOP:
                    # As long as we gallop without leaving this loop, make
                    # the heuristic more likely
                    # (the comparison yields 0 or 1, so min_gallop never
                    # goes below 1)
                    min_gallop -= min_gallop > 1

                    # Gallop in A to find where keys[ssb] should end up
                    k = gallop_right(b_keys[ssb], a_keys, ssa - na + 1, ssa + 1, ssa)
                    # k is an index, make it a size from the end
                    k = ssa + 1 - k
                    acount = k
                    if k > 0:
                        # Copy everything from A after k.
                        # Destination and source are the same buffer, and destination
                        # index is greater, so copy from the end to the start.
                        sortslice_copy_down(keys, values, dest,
                                            a_keys, a_values, ssa,
                                            k)
                        dest -= k
                        ssa -= k
                        na -= k
                        if na == 0:
                            # Finished merging
                            break
                    # Copy keys[ssb]
                    keys[dest] = b_keys[ssb]
                    if _has_values:
                        values[dest] = b_values[ssb]
                    dest -= 1
                    ssb -= 1
                    nb -= 1
                    if nb == 0:
                        # Finished merging
                        break

                    # Gallop in B to find where keys[ssa] should end up
                    k = gallop_left(a_keys[ssa], b_keys, ssb - nb + 1, ssb + 1, ssb)
                    # k is an index, make it a size from the end
                    k = ssb + 1 - k
                    bcount = k
                    if k > 0:
                        # Copy the last k remaining elements of B (those at
                        # or after the index found above)
                        sortslice_copy_down(keys, values, dest,
                                            b_keys, b_values, ssb,
                                            k)
                        dest -= k
                        ssb -= k
                        nb -= k
                        if nb == 0:
                            # Finished merging
                            break
                    # Copy keys[ssa]
                    keys[dest] = a_keys[ssa]
                    if _has_values:
                        values[dest] = a_values[ssa]
                    dest -= 1
                    ssa -= 1
                    na -= 1
                    if na == 0:
                        # Finished merging
                        break

                # Penalize it for leaving galloping mode
                min_gallop += 1

        # Merge finished, now handle the remaining areas
        if na == 0:
            # Only B remaining to copy at the front of the destination area
            # (B is read from the temp area, so an upwards copy is fine)
            sortslice_copy(keys, values, dest - nb + 1,
                           b_keys, b_values, ssb - nb + 1,
                           nb)
        else:
            assert nb == 0
            assert dest == ssa
            # A's front is already at the right place, do nothing

        # Store the adapted threshold back into the (immutable) merge state
        return merge_adjust_gallop(ms, min_gallop)


    @wrap
    def merge_at(ms, keys, values, i):
        """
        Merge the two runs at stack indices i and i+1.

        *i* must designate the second-last or the third-last run on the
        stack.  The stack entry at *i* is replaced with the combined run and
        the stack shrinks by one.  Elements of the first run that are
        already in place at its start, and elements of the second run that
        are already in place at its end, are left out of the actual merge.

        An updated MergeState is returned.
        """
        n = ms.n
        assert n >= 2
        assert i >= 0
        assert i == n - 2 or i == n - 3

        ssa, na = ms.pending[i]
        ssb, nb = ms.pending[i + 1]
        assert na > 0 and nb > 0
        assert ssa + na == ssb

        # Record the length of the combined runs; if i is the 3rd-last
        # run now, also slide over the last run (which isn't involved
        # in this merge).  The current run i+1 goes away in any case.
        ms.pending[i] = MergeRun(ssa, na + nb)
        if i == n - 3:
            ms.pending[i + 1] = ms.pending[i + 2]
        ms = merge_pop(ms)

        # Where does b start in a?  Elements in a before that can be
        # ignored (already in place).
        k = gallop_right(keys[ssb], keys, ssa, ssa + na, ssa)
        # [k, ssa + na) remains to be merged
        na -= k - ssa
        ssa = k
        if na == 0:
            return ms

        # Where does a end in b?  Elements in b after that can be
        # ignored (already in place).
        k = gallop_left(keys[ssa + na - 1], keys, ssb, ssb + nb, ssb + nb - 1)
        # [ssb, k) remains to be merged
        nb = k - ssb
        if nb == 0:
            # With a consistent ordering this cannot happen, since the
            # first element of b sorts strictly before the last remaining
            # element of a.  Comparisons that do not define a total order
            # (e.g. NaN keys) can however yield k == ssb; in that case
            # there is nothing to merge, and merge_hi() would fail its
            # `nb > 0` assertion.
            return ms

        # Merge what remains of the runs, using a temp array with
        # min(na, nb) elements.
        if na <= nb:
            return merge_lo(ms, keys, values, ssa, na, ssb, nb)
        else:
            return merge_hi(ms, keys, values, ssa, na, ssb, nb)


    @wrap
    def merge_collapse(ms, keys, values):
        """
        Examine the stack of runs waiting to be merged, merging adjacent runs
        until the stack invariants are re-established:

        1. len[-3] > len[-2] + len[-1]
        2. len[-2] > len[-1]

        Invariant 1 is also checked one level deeper in the stack (on
        len[-4], len[-3] and len[-2]).

        An updated MergeState is returned.

        See listsort.txt for more info.
        """
        while ms.n > 1:
            pending = ms.pending
            # Index of the second-last run
            n = ms.n - 2
            if ((n > 0 and pending[n-1].size <= pending[n].size + pending[n+1].size) or
                (n > 1 and pending[n-2].size <= pending[n-1].size + pending[n].size)):
                # Invariant 1 is violated: merge the second-last run with
                # the smaller of its two neighbours.
                if pending[n - 1].size < pending[n + 1].size:
                    # Merge smaller one first
                    n -= 1
                ms = merge_at(ms, keys, values, n)
            elif pending[n].size <= pending[n + 1].size:
                # Invariant 2 is violated.  The comparison must be `<=`:
                # two top runs of equal size do not satisfy the strict
                # inequality either, and merging them right away is what
                # keeps the merges balanced when all runs have the same
                # length.
                ms = merge_at(ms, keys, values, n)
            else:
                break
        return ms

    @wrap
    def merge_force_collapse(ms, keys, values):
        """
        Merge the pending runs, whatever the stack invariants say, until a
        single run is left.  This is the final step of the mergesort.

        An updated MergeState is returned.
        """
        while ms.n > 1:
            runs = ms.pending
            idx = ms.n - 2
            # With three runs or more, merge the second-last run with the
            # smaller of its two neighbours.
            if idx > 0 and runs[idx - 1].size < runs[idx + 1].size:
                idx -= 1
            ms = merge_at(ms, keys, values, idx)
        return ms


    @wrap
    def reverse_slice(keys, values, start, stop):
        """
        Reverse the slice [start, stop) of *keys* in place, and the same
        slice of *values* when it is a separate container.
        """
        # Number of swaps to do; zero or less for a slice shorter than 2
        nswaps = (stop - start) >> 1
        for offset in range(nswaps):
            front = start + offset
            back = stop - 1 - offset
            tmp = keys[front]
            keys[front] = keys[back]
            keys[back] = tmp
        if has_values(keys, values):
            for offset in range(nswaps):
                front = start + offset
                back = stop - 1 - offset
                tmp_val = values[front]
                values[front] = values[back]
                values[back] = tmp_val


    @wrap
    def run_timsort_with_mergestate(ms, keys, values):
        """
        Sort *keys* in place (together with *values*, when it is a separate
        container), starting from the given initial MergeState.

        Nothing is returned.
        """
        total = len(keys)
        if total < 2:
            # Nothing to sort
            return

        # Single left-to-right pass over the data: find the natural runs,
        # extending the short ones to minrun elements.
        minrun = merge_compute_minrun(total)

        pos = zero
        while pos < total:
            left = total - pos
            run_len, descending = count_run(keys, pos, total)
            if descending:
                # Turn the descending run into an ascending one
                reverse_slice(keys, values, pos, pos + run_len)
            if run_len < minrun:
                # Too short: extend to min(minrun, number of items left)
                target = min(minrun, left)
                binarysort(keys, values, pos, pos + target, pos + run_len)
                run_len = target
            # Stack the run, then merge as far as the invariants require
            ms = merge_append(ms, MergeRun(pos, run_len))
            ms = merge_collapse(ms, keys, values)
            # Move on to the next run
            pos += run_len

        # Every initial run is on the stack now: merge them all.
        ms = merge_force_collapse(ms, keys, values)
        assert ms.n == 1
        assert ms.pending[0] == (0, len(keys))


    @wrap
    def run_timsort(keys):
        """
        Sort *keys* in place using timsort.
        """
        state = merge_init(keys)
        # Passing the keys as their own values means "no separate values"
        run_timsort_with_mergestate(state, keys, keys)


    @wrap
    def run_timsort_with_values(keys, values):
        """
        Sort *keys* in place using timsort, moving the items of *values*
        along with their keys.
        """
        state = merge_init_with_values(keys, values)
        run_timsort_with_mergestate(state, keys, values)

    # Bundle the wrapped subroutines.  The arguments are positional, so
    # their order must match the field order of TimsortImplementation; the
    # 'compile' field is `wrap` itself.
    return TimsortImplementation(
        wrap,
        count_run, binarysort, gallop_left, gallop_right,
        merge_init, merge_append, merge_pop,
        merge_compute_minrun, merge_lo, merge_hi, merge_at,
        merge_force_collapse, merge_collapse,
        run_timsort, run_timsort_with_values)


def make_py_timsort(*args):
    """
    Build a timsort implementation whose functions are left as plain
    Python functions.  *args* is passed on to make_timsort_impl().
    """
    def identity(func):
        return func

    return make_timsort_impl(identity, *args)

def make_jit_timsort(*args):
    """
    Build a timsort implementation whose functions are all compiled with
    numba's jit(nopython=True).  *args* is passed on to make_timsort_impl().
    """
    from numba import jit

    def compile_nopython(func):
        return jit(nopython=True)(func)

    return make_timsort_impl(compile_nopython, *args)