struct Slice(T)
  protected def self.intro_sort!(a, n)
    return if n < 2
    quick_sort_for_intro_sort!(a, n, (n.bit_length - 1) * 2)
    insertion_sort!(a, n)
  end

  protected def self.quick_sort_for_intro_sort!(a, n, d)
    while n > 16
      if d == 0
        heap_sort!(a, n)
        return
      end
      d -= 1
      center_median!(a, n)
      c = partition_for_quick_sort!(a, n)
      quick_sort_for_intro_sort!(c, n - (c - a), d)
      n = c - a
    end
  end

  protected def self.heap_sort!(a, n)
    (n // 2).downto 0 do |p|
      heapify!(a, p, n)
    end
    while n > 1
      n -= 1
      a.value, a[n] = a[n], a.value
      heapify!(a, 0, n)
    end
  end

  protected def self.heapify!(a, p, n)
    v, c = a[p], p
    while c < (n - 1) // 2
      c = 2 * (c + 1)
      c -= 1 if cmp(a[c], a[c - 1]) < 0
      break unless cmp(v, a[c]) <= 0
      a[p] = a[c]
      p = c
    end
    if n & 1 == 0 && c == n // 2 - 1
      c = 2 * c + 1
      if cmp(v, a[c]) < 0
        a[p] = a[c]
        p = c
      end
    end
    a[p] = v
  end

  protected def self.center_median!(a, n)
    b, c = a + n // 2, a + n - 1
    if cmp(a.value, b.value) <= 0
      if cmp(b.value, c.value) <= 0
        return
      elsif cmp(a.value, c.value) <= 0
        b.value, c.value = c.value, b.value
      else
        a.value, b.value, c.value = c.value, a.value, b.value
      end
    elsif cmp(a.value, c.value) <= 0
      a.value, b.value = b.value, a.value
    elsif cmp(b.value, c.value) <= 0
      a.value, b.value, c.value = b.value, c.value, a.value
    else
      a.value, c.value = c.value, a.value
    end
  end

  protected def self.partition_for_quick_sort!(a, n)
    v, l, r = a[n // 2], a + 1, a + n - 1
    loop do
      while cmp(l.value, v) < 0
        l += 1
      end
      r -= 1
      while cmp(v, r.value) < 0
        r -= 1
      end
      return l unless l < r
      l.value, r.value = r.value, l.value
      l += 1
    end
  end

  protected def self.insertion_sort!(a, n)
    (1...n).each do |i|
      l = a + i
      v = l.value
      p = l - 1
      while l > a && cmp(v, p.value) < 0
        l.value = p.value
        l, p = p, p - 1
      end
      l.value = v
    end
  end

  protected def self.intro_sort!(a, n, comp)
    return if n < 2
    quick_sort_for_intro_sort!(a, n, (n.bit_length - 1) * 2, comp)
    insertion_sort!(a, n, comp)
  end

  protected def self.quick_sort_for_intro_sort!(a, n, d, comp)
    while n > 16
      if d == 0
        heap_sort!(a, n, comp)
        return
      end
      d -= 1
      center_median!(a, n, comp)
      c = partition_for_quick_sort!(a, n, comp)
      quick_sort_for_intro_sort!(c, n - (c - a), d, comp)
      n = c - a
    end
  end

  protected def self.heap_sort!(a, n, comp)
    (n // 2).downto 0 do |p|
      heapify!(a, p, n, comp)
    end
    while n > 1
      n -= 1
      a.value, a[n] = a[n], a.value
      heapify!(a, 0, n, comp)
    end
  end

  protected def self.heapify!(a, p, n, comp)
    v, c = a[p], p
    while c < (n - 1) // 2
      c = 2 * (c + 1)
      c -= 1 if cmp(a[c], a[c - 1], comp) < 0
      break unless cmp(v, a[c], comp) <= 0
      a[p] = a[c]
      p = c
    end
    if n & 1 == 0 && c == n // 2 - 1
      c = 2 * c + 1
      if cmp(v, a[c], comp) < 0
        a[p] = a[c]
        p = c
      end
    end
    a[p] = v
  end

  protected def self.center_median!(a, n, comp)
    b, c = a + n // 2, a + n - 1
    if cmp(a.value, b.value, comp) <= 0
      if cmp(b.value, c.value, comp) <= 0
        return
      elsif cmp(a.value, c.value, comp) <= 0
        b.value, c.value = c.value, b.value
      else
        a.value, b.value, c.value = c.value, a.value, b.value
      end
    elsif cmp(a.value, c.value, comp) <= 0
      a.value, b.value = b.value, a.value
    elsif cmp(b.value, c.value, comp) <= 0
      a.value, b.value, c.value = b.value, c.value, a.value
    else
      a.value, c.value = c.value, a.value
    end
  end

  protected def self.partition_for_quick_sort!(a, n, comp)
    v, l, r = a[n // 2], a + 1, a + n - 1
    loop do
      while l < a + n && cmp(l.value, v, comp) < 0
        l += 1
      end
      r -= 1
      while r >= a && cmp(v, r.value, comp) < 0
        r -= 1
      end
      return l unless l < r
      l.value, r.value = r.value, l.value
      l += 1
    end
  end

  protected def self.insertion_sort!(a, n, comp)
    (1...n).each do |i|
      l = a + i
      v = l.value
      p = l - 1
      while l > a && cmp(v, p.value, comp) < 0
        l.value = p.value
        l, p = p, p - 1
      end
      l.value = v
    end
  end

  protected def self.cmp(v1, v2)
    v = v1 <=> v2
    raise ArgumentError.new("Comparison of #{v1} and #{v2} failed") if v.nil?
    v
  end

  protected def self.cmp(v1, v2, block)
    v = block.call(v1, v2)
    raise ArgumentError.new("Comparison of #{v1} and #{v2} failed") if v.nil?
    v
  end

  # The stable sort implementation is ported from Rust.
  # https://github.com/rust-lang/rust/blob/507bff92fadf1f25a830da5065a5a87113345163/library/alloc/src/slice.rs
  #
  # Rust License (MIT):
  #
  # Permission is hereby granted, free of charge, to any
  # person obtaining a copy of this software and associated
  # documentation files (the "Software"), to deal in the
  # Software without restriction, including without
  # limitation the rights to use, copy, modify, merge,
  # publish, distribute, sublicense, and/or sell copies of
  # the Software, and to permit persons to whom the Software
  # is furnished to do so, subject to the following
  # conditions:
  #
  # The above copyright notice and this permission notice
  # shall be included in all copies or substantial portions
  # of the Software.
  #
  # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
  # ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
  # TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
  # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
  # SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
  # IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  # DEALINGS IN THE SOFTWARE.

  # Slices of up to this length get sorted using insertion sort.
  private MAX_INSERTION = 10

  # Very short runs are extended using insertion sort to span at least this many elements.
  private MIN_RUN = 10

  # This merge sort borrows some (but not all) ideas from TimSort, which is described in detail
  # [here](http://svn.python.org/projects/python/trunk/Objects/listsort.txt).
  #
  # The algorithm identifies strictly descending and non-descending subsequences, which are called
  # natural runs. There is a stack of pending runs yet to be merged. Each newly found run is pushed
  # onto the stack, and then some pairs of adjacent runs are merged until these two invariants are
  # satisfied:
  #
  # 1. for every `i` in `1..runs.len()`: `runs[i - 1].len > runs[i].len`
  # 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len`
  #
  # The invariants ensure that the total running time is `O(n * log(n))` worst-case.
  protected def self.merge_sort!(v : Slice(T)) forall T
    size = v.size

    # Short arrays get sorted in-place via insertion sort to avoid allocations.
    if size <= MAX_INSERTION
      if size >= 2
        (size - 1).downto(0) { |i| insert_head!(v[i..]) }
      end
      return
    end

    # Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it
    # shallow copies of the contents of `v` without risking the dtors running on copies if
    # `is_less` panics. When merging two sorted runs, this buffer holds a copy of the shorter run,
    # which will always have length at most `len / 2`.
    buf = Pointer(T).malloc(size // 2)

    # In order to identify natural runs in `v`, we traverse it backwards. That might seem like a
    # strange decision, but consider the fact that merges more often go in the opposite direction
    # (forwards). According to benchmarks, merging forwards is slightly faster than merging
    # backwards. To conclude, identifying runs by traversing backwards improves performance.
    runs = [] of Range(Int32, Int32)
    last = size
    while last > 0
      # Find the next natural run, and reverse it if it's strictly descending.
      start = last - 1
      if start > 0
        start -= 1
        if cmp(v[start + 1], v[start]) < 0
          while start > 0 && cmp(v[start], v[start - 1]) < 0
            start -= 1
          end
          v[start...last].reverse!
        else
          while start > 0 && cmp(v[start], v[start - 1]) > 0
            start -= 1
          end
        end
      end

      # Insert some more elements into the run if it's too short. Insertion sort is faster than
      # merge sort on short sequences, so this significantly improves performance.
      while start > 0 && last - start < MIN_RUN
        start -= 1
        insert_head!(v[start...last])
      end

      # Push this run onto the stack.
      runs.push(start...last)
      last = start

      # Merge some pairs of adjacent runs to satisfy the invariants.
      while r = collapse(runs)
        left = runs[r + 1]
        right = runs[r]
        merge!(v[left.begin...right.end], left.size, buf)
        runs[r] = left.begin...right.end
        runs.delete_at(r + 1)
      end
    end
  end

  # Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
  #
  # This is the integral subroutine of insertion sort.
  protected def self.insert_head!(v)
    if v.size >= 2 && cmp(v[1], v[0]) < 0
      x, v[0] = v[0], v[1]
      (2...v.size).each do |i|
        if cmp(v[i], x) < 0
          v[i - 1] = v[i]
        else
          v[i - 1] = x
          return
        end
      end
      v[v.size - 1] = x
    end
  end

  # Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `buf` as temporary storage, and
  # stores the result into `v[..]`.
  protected def self.merge!(v, mid, buf)
    size = v.size

    if mid <= size - mid
      # The left run is shorter.
      buf.copy_from(v.to_unsafe, mid)

      left = 0
      right = mid
      out = v.to_unsafe

      while left < mid && right < size
        # Consume the lesser side.
        # If equal, prefer the left run to maintain stability.
        if cmp(v[right], buf[left]) < 0
          out.value = v[right]
          out += 1
          right += 1
        else
          out.value = buf[left]
          out += 1
          left += 1
        end
      end

      out.copy_from(buf + left, mid - left)
    else
      # The right run is shorter.
      buf.copy_from((v + mid).to_unsafe, size - mid)

      left = mid
      right = size - mid
      out = v.to_unsafe + size

      while left > 0 && right > 0
        # Consume the greater side.
        # If equal, prefer the right run to maintain stability.
        if cmp(buf[right - 1], v[left - 1]) < 0
          left -= 1
          out -= 1
          out.value = v[left]
        else
          right -= 1
          out -= 1
          out.value = buf[right]
        end
      end

      (v + left).copy_from(buf, right)
    end
  end

  # This merge sort borrows some (but not all) ideas from TimSort, which is described in detail
  # [here](http://svn.python.org/projects/python/trunk/Objects/listsort.txt).
  #
  # The algorithm identifies strictly descending and non-descending subsequences, which are called
  # natural runs. There is a stack of pending runs yet to be merged. Each newly found run is pushed
  # onto the stack, and then some pairs of adjacent runs are merged until these two invariants are
  # satisfied:
  #
  # 1. for every `i` in `1..runs.len()`: `runs[i - 1].len > runs[i].len`
  # 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len`
  #
  # The invariants ensure that the total running time is `O(n * log(n))` worst-case.
  protected def self.merge_sort!(v : Slice(T), comp) forall T
    size = v.size

    # Short arrays get sorted in-place via insertion sort to avoid allocations.
    if size <= MAX_INSERTION
      if size >= 2
        (size - 1).downto(0) { |i| insert_head!(v[i..], comp) }
      end
      return
    end

    # Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it
    # shallow copies of the contents of `v` without risking the dtors running on copies if
    # `is_less` panics. When merging two sorted runs, this buffer holds a copy of the shorter run,
    # which will always have length at most `len / 2`.
    buf = Pointer(T).malloc(size // 2)

    # In order to identify natural runs in `v`, we traverse it backwards. That might seem like a
    # strange decision, but consider the fact that merges more often go in the opposite direction
    # (forwards). According to benchmarks, merging forwards is slightly faster than merging
    # backwards. To conclude, identifying runs by traversing backwards improves performance.
    runs = [] of Range(Int32, Int32)
    last = size
    while last > 0
      # Find the next natural run, and reverse it if it's strictly descending.
      start = last - 1
      if start > 0
        start -= 1
        if cmp(v[start + 1], v[start], comp) < 0
          while start > 0 && cmp(v[start], v[start - 1], comp) < 0
            start -= 1
          end
          v[start...last].reverse!
        else
          while start > 0 && cmp(v[start], v[start - 1], comp) > 0
            start -= 1
          end
        end
      end

      # Insert some more elements into the run if it's too short. Insertion sort is faster than
      # merge sort on short sequences, so this significantly improves performance.
      while start > 0 && last - start < MIN_RUN
        start -= 1
        insert_head!(v[start...last], comp)
      end

      # Push this run onto the stack.
      runs.push(start...last)
      last = start

      # Merge some pairs of adjacent runs to satisfy the invariants.
      while r = collapse(runs)
        left = runs[r + 1]
        right = runs[r]
        merge!(v[left.begin...right.end], left.size, buf, comp)
        runs[r] = left.begin...right.end
        runs.delete_at(r + 1)
      end
    end
  end

  # Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
  #
  # This is the integral subroutine of insertion sort.
  protected def self.insert_head!(v, comp)
    if v.size >= 2 && cmp(v[1], v[0], comp) < 0
      x, v[0] = v[0], v[1]
      (2...v.size).each do |i|
        if cmp(v[i], x, comp) < 0
          v[i - 1] = v[i]
        else
          v[i - 1] = x
          return
        end
      end
      v[v.size - 1] = x
    end
  end

  # Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `buf` as temporary storage, and
  # stores the result into `v[..]`.
  protected def self.merge!(v, mid, buf, comp)
    size = v.size

    if mid <= size - mid
      # The left run is shorter.
      buf.copy_from(v.to_unsafe, mid)

      left = 0
      right = mid
      out = v.to_unsafe

      while left < mid && right < size
        # Consume the lesser side.
        # If equal, prefer the left run to maintain stability.
        if cmp(v[right], buf[left], comp) < 0
          out.value = v[right]
          out += 1
          right += 1
        else
          out.value = buf[left]
          out += 1
          left += 1
        end
      end

      out.copy_from(buf + left, mid - left)
    else
      # The right run is shorter.
      buf.copy_from((v + mid).to_unsafe, size - mid)

      left = mid
      right = size - mid
      out = v.to_unsafe + size

      while left > 0 && right > 0
        # Consume the greater side.
        # If equal, prefer the right run to maintain stability.
        if cmp(buf[right - 1], v[left - 1], comp) < 0
          left -= 1
          out -= 1
          out.value = v[left]
        else
          right -= 1
          out -= 1
          out.value = buf[right]
        end
      end

      (v + left).copy_from(buf, right)
    end
  end

  # Examines the stack of runs and identifies the next pair of runs to merge. More specifically,
  # if `r` is returned, that means `runs[r]` and `runs[r + 1]` must be merged next. If the
  # algorithm should continue building a new run instead, `nil` is returned.
  #
  # TimSort is infamous for its buggy implementations, as described here:
  # http://envisage-project.eu/timsort-specification-and-verification/
  #
  # The gist of the story is: we must enforce the invariants on the top four runs on the stack.
  # Enforcing them on just top three is not sufficient to ensure that the invariants will still
  # hold for *all* runs in the stack.
  #
  # This function correctly checks invariants for the top four runs. Additionally, if the top
  # run starts at index 0, it will always demand a merge operation until the stack is fully
  # collapsed, in order to complete the sort.
  @[AlwaysInline]
  protected def self.collapse(runs)
    n = runs.size
    if n >= 2 &&
       (runs[n - 1].begin == 0 ||
       runs[n - 2].size <= runs[n - 1].size ||
       (n >= 3 && runs[n - 3].size <= runs[n - 2].size + runs[n - 1].size) ||
       (n >= 4 && runs[n - 4].size <= runs[n - 3].size + runs[n - 2].size))
      n >= 3 && runs[n - 3].size < runs[n - 1].size ? n - 3 : n - 2
    end
  end
end