# Source port of Dragonbox's reference implementation in C++.
#
# The following is their license:
#
#   Copyright 2020-2021 Junekey Jeon
#
#   The contents of this file may be used under the terms of
#   the Apache License v2.0 with LLVM Exceptions.
#
#      (See accompanying file LICENSE-Apache or copy at
#       https://llvm.org/foundation/relicensing/LICENSE.txt)
#
#   Alternatively, the contents of this file may be used under the terms of
#   the Boost Software License, Version 1.0.
#      (See accompanying file LICENSE-Boost or copy at
#       https://www.boost.org/LICENSE_1_0.txt)
#
#   Unless required by applicable law or agreed to in writing, this software
#   is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#   KIND, either express or implied.
module Float::Printer::Dragonbox
  # Current revision: https://github.com/jk-jeon/dragonbox/tree/33a9e021290d529bcb41773be2c7c3c91726a9cb
  #
  # Assumes the following policies:
  #
  # * `jkj::dragonbox::policy::sign::ignore`
  # * `jkj::dragonbox::policy::trailing_zero::ignore`
  # * `jkj::dragonbox::policy::decimal_to_binary_rounding::nearest_to_even` (default)
  # * `jkj::dragonbox::policy::binary_to_decimal_rounding::to_even` (default)
  # * `jkj::dragonbox::policy::cache::full` (default)

  # Utilities for wide unsigned integer arithmetic.
  private module WUInt
    # TODO: use built-in integer type
    record UInt128, high : UInt64, low : UInt64 do
      def unsafe_add!(n : UInt64) : self
        sum = @low &+ n
        @high &+= (sum < @low ? 1 : 0)
        @low = sum
        self
      end
    end

    def self.umul64(x : UInt32, y : UInt32) : UInt64
      x.to_u64 &* y
    end

    # Get 128-bit result of multiplication of two 64-bit unsigned integers.
    def self.umul128(x : UInt64, y : UInt64) : UInt128
      a = (x >> 32).to_u32!
      b = x.to_u32!
      c = (y >> 32).to_u32!
      d = y.to_u32!

      ac = umul64(a, c)
      bc = umul64(b, c)
      ad = umul64(a, d)
      bd = umul64(b, d)

      intermediate = (bd >> 32) &+ ad.to_u32! &+ bc.to_u32!

      UInt128.new(
        high: ac &+ (intermediate >> 32) &+ (ad >> 32) &+ (bc >> 32),
        low: (intermediate << 32) &+ bd.to_u32!,
      )
    end

    def self.umul128_upper64(x : UInt64, y : UInt64) : UInt64
      a = (x >> 32).to_u32!
      b = x.to_u32!
      c = (y >> 32).to_u32!
      d = y.to_u32!

      ac = umul64(a, c)
      bc = umul64(b, c)
      ad = umul64(a, d)
      bd = umul64(b, d)

      intermediate = (bd >> 32) &+ ad.to_u32! &+ bc.to_u32!
      ac &+ (intermediate >> 32) &+ (ad >> 32) &+ (bc >> 32)
    end

    # Get upper 128-bits of multiplication of a 64-bit unsigned integer and a 128-bit unsigned integer.
    def self.umul192_upper128(x : UInt64, y : UInt128) : UInt128
      r = umul128(x, y.high)
      r.unsafe_add!(umul128_upper64(x, y.low))
      r
    end

    # Get upper 64-bits of multiplication of a 32-bit unsigned integer and a 64-bit unsigned integer.
    def self.umul96_upper64(x : UInt32, y : UInt64) : UInt64
      yh = (y >> 32).to_u32!
      yl = y.to_u32!

      xyh = umul64(x, yh)
      xyl = umul64(x, yl)

      xyh &+ (xyl >> 32)
    end

    # Get lower 128-bits of multiplication of a 64-bit unsigned integer and a 128-bit unsigned integer.
    def self.umul192_lower128(x : UInt64, y : UInt128) : UInt128
      high = x &* y.high
      high_low = umul128(x, y.low)
      UInt128.new(
        high: high &+ high_low.high,
        low: high_low.low,
      )
    end

    # Get lower 64-bits of multiplication of a 32-bit unsigned integer and a 64-bit unsigned integer.
    def self.umul96_lower64(x : UInt32, y : UInt64) : UInt64
      y &* x
    end
  end

  # Utilities for fast log computation.
  private module Log
    def self.floor_log10_pow2(e : Int)
      # Precondition: `-2620 <= e <= 2620`
      (e &* 315653) >> 20
    end

    def self.floor_log2_pow10(e : Int)
      # Precondition: `-1233 <= e <= 1233`
      (e &* 1741647) >> 19
    end

    def self.floor_log10_pow2_minus_log10_4_over_3(e : Int)
      # Precondition: `-2985 <= e <= 2936`
      (e &* 631305 &- 261663) >> 21
    end
  end

  # Utilities for fast divisibility tests.
  private module Div
    CACHED_POWERS_OF_5_TABLE_U32 = [
      {0x00000001_u32, 0xffffffff_u32},
      {0xcccccccd_u32, 0x33333333_u32},
      {0xc28f5c29_u32, 0x0a3d70a3_u32},
      {0x26e978d5_u32, 0x020c49ba_u32},
      {0x3afb7e91_u32, 0x0068db8b_u32},
      {0x0bcbe61d_u32, 0x0014f8b5_u32},
      {0x68c26139_u32, 0x000431bd_u32},
      {0xae8d46a5_u32, 0x0000d6bf_u32},
      {0x22e90e21_u32, 0x00002af3_u32},
      {0x3a2e9c6d_u32, 0x00000897_u32},
      {0x3ed61f49_u32, 0x000001b7_u32},
      {0x0c913975_u32, 0x00000057_u32},
      {0xcf503eb1_u32, 0x00000011_u32},
      {0xf6433fbd_u32, 0x00000003_u32},
      {0x3140a659_u32, 0x00000002_u32},
      {0x70402145_u32, 0x00000009_u32},
      {0x7cd9a041_u32, 0x00000001_u32},
      {0xe5c5200d_u32, 0x00000001_u32},
      {0xfac10669_u32, 0x00000005_u32},
      {0x6559ce15_u32, 0x00000001_u32},
      {0xaddec2d1_u32, 0x00000002_u32},
      {0x892c8d5d_u32, 0x00000003_u32},
      {0x1b6f4f79_u32, 0x00000001_u32},
      {0x6be30fe5_u32, 0x00000001_u32},
    ]

    CACHED_POWERS_OF_5_TABLE_U64 = [
      {0x0000000000000001_u64, 0xffffffffffffffff_u64},
      {0xcccccccccccccccd_u64, 0x3333333333333333_u64},
      {0x8f5c28f5c28f5c29_u64, 0x0a3d70a3d70a3d70_u64},
      {0x1cac083126e978d5_u64, 0x020c49ba5e353f7c_u64},
      {0xd288ce703afb7e91_u64, 0x0068db8bac710cb2_u64},
      {0x5d4e8fb00bcbe61d_u64, 0x0014f8b588e368f0_u64},
      {0x790fb65668c26139_u64, 0x000431bde82d7b63_u64},
      {0xe5032477ae8d46a5_u64, 0x0000d6bf94d5e57a_u64},
      {0xc767074b22e90e21_u64, 0x00002af31dc46118_u64},
      {0x8e47ce423a2e9c6d_u64, 0x0000089705f4136b_u64},
      {0x4fa7f60d3ed61f49_u64, 0x000001b7cdfd9d7b_u64},
      {0x0fee64690c913975_u64, 0x00000057f5ff85e5_u64},
      {0x3662e0e1cf503eb1_u64, 0x000000119799812d_u64},
      {0xa47a2cf9f6433fbd_u64, 0x0000000384b84d09_u64},
      {0x54186f653140a659_u64, 0x00000000b424dc35_u64},
      {0x7738164770402145_u64, 0x0000000024075f3d_u64},
      {0xe4a4d1417cd9a041_u64, 0x000000000734aca5_u64},
      {0xc75429d9e5c5200d_u64, 0x000000000170ef54_u64},
      {0xc1773b91fac10669_u64, 0x000000000049c977_u64},
      {0x26b172506559ce15_u64, 0x00000000000ec1e4_u64},
      {0xd489e3a9addec2d1_u64, 0x000000000002f394_u64},
      {0x90e860bb892c8d5d_u64, 0x000000000000971d_u64},
      {0x502e79bf1b6f4f79_u64, 0x0000000000001e39_u64},
      {0xdcd618596be30fe5_u64, 0x000000000000060b_u64},
    ]

    module DIVIDE_BY_POW10_INFO_F32
      MAGIC_NUMBER = 6554_u32
      SHIFT_AMOUNT =       16
    end

    module DIVIDE_BY_POW10_INFO_F64
      MAGIC_NUMBER = 656_u32
      SHIFT_AMOUNT =      16
    end

    # N == 1
    def self.check_divisibility_and_divide_by_pow10_k1(n : UInt32)
      n &*= DIVIDE_BY_POW10_INFO_F32::MAGIC_NUMBER

      # Mask for the lowest (divisibility_check_bits)-bits.
      divisibility_check_bits = DIVIDE_BY_POW10_INFO_F32::SHIFT_AMOUNT
      comparison_mask = ~(UInt32::MAX << divisibility_check_bits)
      result = n & comparison_mask < DIVIDE_BY_POW10_INFO_F32::MAGIC_NUMBER

      n >>= divisibility_check_bits
      {n, result}
    end

    # N == 2
    def self.check_divisibility_and_divide_by_pow10_k2(n : UInt32)
      n &*= DIVIDE_BY_POW10_INFO_F64::MAGIC_NUMBER

      # Mask for the lowest (divisibility_check_bits)-bits.
      divisibility_check_bits = DIVIDE_BY_POW10_INFO_F64::SHIFT_AMOUNT
      comparison_mask = ~(UInt32::MAX << divisibility_check_bits)
      result = n & comparison_mask < DIVIDE_BY_POW10_INFO_F64::MAGIC_NUMBER

      n >>= divisibility_check_bits
      {n, result}
    end
  end

  private module ImplInfoMethods(D)
    def extract_exponent_bits(u : D::CarrierUInt)
      exponent_bits_mask = ~(UInt32::MAX << D::EXPONENT_BITS)
      ((u >> D::SIGNIFICAND_BITS) & exponent_bits_mask).to_u32!
    end

    def remove_exponent_bits(u : D::CarrierUInt, exponent_bits)
      u ^ (D::CarrierUInt.new!(exponent_bits) << D::SIGNIFICAND_BITS)
    end

    def remove_sign_bit_and_shift(u : D::CarrierUInt)
      u << 1
    end

    def check_divisibility_and_divide_by_pow10(n : UInt32)
      {% if D::KAPPA == 1 %}
        Div.check_divisibility_and_divide_by_pow10_k1(n)
      {% elsif D::KAPPA == 2 %}
        Div.check_divisibility_and_divide_by_pow10_k2(n)
      {% else %}
        {% raise "Expected kappa == 1 or kappa == 2" %}
      {% end %}
    end
  end

  private module ImplInfo_Float32
    extend ImplInfoMethods(self)

    SIGNIFICAND_BITS =   23
    EXPONENT_BITS    =    8
    MIN_EXPONENT     = -126
    MAX_EXPONENT     =  127
    EXPONENT_BIAS    = -127
    DECIMAL_DIGITS   =    9

    alias CarrierUInt = UInt32
    CARRIER_BITS = 32

    KAPPA =   1
    MIN_K = -31
    # MAX_K = 46
    CACHE_BITS = 64

    SHORTER_INTERVAL_TIE_LOWER_THRESHOLD                = -35
    SHORTER_INTERVAL_TIE_UPPER_THRESHOLD                = -35
    CASE_SHORTER_INTERVAL_LEFT_ENDPOINT_LOWER_THRESHOLD =   2
    CASE_SHORTER_INTERVAL_LEFT_ENDPOINT_UPPER_THRESHOLD =   3

    BIG_DIVISOR   = 10_u32 ** (KAPPA + 1)
    SMALL_DIVISOR = 10_u32 ** KAPPA
  end

  private module ImplInfo_Float64
    extend ImplInfoMethods(self)

    SIGNIFICAND_BITS =    52
    EXPONENT_BITS    =    11
    MIN_EXPONENT     = -1022
    MAX_EXPONENT     =  1023
    EXPONENT_BIAS    = -1023
    DECIMAL_DIGITS   =    17

    alias CarrierUInt = UInt64
    CARRIER_BITS = 64

    KAPPA =    2
    MIN_K = -292
    # MAX_K = 326
    CACHE_BITS = 128

    SHORTER_INTERVAL_TIE_LOWER_THRESHOLD                = -77
    SHORTER_INTERVAL_TIE_UPPER_THRESHOLD                = -77
    CASE_SHORTER_INTERVAL_LEFT_ENDPOINT_LOWER_THRESHOLD =   2
    CASE_SHORTER_INTERVAL_LEFT_ENDPOINT_UPPER_THRESHOLD =   3

    BIG_DIVISOR   = 10_u32 ** (KAPPA + 1)
    SMALL_DIVISOR = 10_u32 ** KAPPA
  end

  private module Impl(F, ImplInfo)
    def self.prefer_round_down?(significand)
      significand % 2 != 0
    end

    def self.compute_nearest_normal(two_fc, exponent, is_closed)
      # Step 1: Schubfach multiplier calculation

      # Compute k and beta.
      minus_k = Log.floor_log10_pow2(exponent) - ImplInfo::KAPPA
      cache = ImplInfo.get_cache(-minus_k)
      beta = exponent + Log.floor_log2_pow10(-minus_k)

      # Compute zi and deltai.
      # 10^kappa <= deltai < 10^(kappa + 1)
      deltai = compute_delta(cache, beta)
      # For the case of binary32, the result of integer check is not correct for
      # 29711844 * 2^-82
      # = 6.1442653300000000008655037797566933477355632930994033813476... * 10^-18
      # and 29711844 * 2^-81
      # = 1.2288530660000000001731007559513386695471126586198806762695... * 10^-17,
      # and they are the unique counterexamples. However, since 29711844 is even,
      # this does not cause any problem for the endpoints calculations; it can only
      # cause a problem when we need to perform integer check for the center.
      # Fortunately, with these inputs, that branch is never executed, so we are fine.
      zi, is_z_integer = compute_mul((two_fc | 1) << beta, cache)

      # Step 2: Try larger divisor
      big_divisor = ImplInfo::BIG_DIVISOR
      small_divisor = ImplInfo::SMALL_DIVISOR

      significand = zi // big_divisor
      r = (zi - significand * big_divisor).to_u32!

      case r
      when .>(deltai)
        # do nothing
      when .<(deltai)
        # Exclude the right endpoint if necessary.
        if r == 0 && is_z_integer && !is_closed
          significand -= 1
          r = big_divisor
        else
          ret_exponent = minus_k + ImplInfo::KAPPA + 1
          return {significand, ret_exponent}
        end
      else
        # r == deltai; compare fractional parts.
        xi_parity, x_is_integer = compute_mul_parity(two_fc - 1, cache, beta)

        if xi_parity || (x_is_integer && is_closed)
          ret_exponent = minus_k + ImplInfo::KAPPA + 1
          return {significand, ret_exponent}
        end
      end

      # Step 3: Find the significand with the smaller divisor
      significand *= 10
      ret_exponent = minus_k + ImplInfo::KAPPA

      dist = r - deltai // 2 + small_divisor // 2
      approx_y_parity = ((dist ^ (small_divisor // 2)) & 1) != 0

      # Is dist divisible by 10^kappa?
      dist, divisible_by_small_divisor = ImplInfo.check_divisibility_and_divide_by_pow10(dist)

      # Add dist / 10^kappa to the significand.
      significand += dist

      if divisible_by_small_divisor
        # Check z^(f) >= epsilon^(f).
        # We have either yi == zi - epsiloni or yi == (zi - epsiloni) - 1,
        # where yi == zi - epsiloni if and only if z^(f) >= epsilon^(f).
        # Since there are only 2 possibilities, we only need to care about the parity.
        # Also, zi and r should have the same parity since the divisor
        # is an even number.
        yi_parity, is_y_integer = compute_mul_parity(two_fc, cache, beta)
        if yi_parity != approx_y_parity
          significand -= 1
        elsif prefer_round_down?(significand) && is_y_integer
          # If z^(f) >= epsilon^(f), we might have a tie
          # when z^(f) == epsilon^(f), or equivalently, when y is an integer.
          # For tie-to-up case, we can just choose the upper one.
          significand -= 1
        end
      end

      {significand, ret_exponent}
    end

    def self.compute_nearest_shorter(exponent)
      # Compute k and beta.
      minus_k = Log.floor_log10_pow2_minus_log10_4_over_3(exponent)
      beta = exponent + Log.floor_log2_pow10(-minus_k)

      # Compute xi and zi.
      cache = ImplInfo.get_cache(-minus_k)

      xi = compute_left_endpoint_for_shorter_interval_case(cache, beta)
      zi = compute_right_endpoint_for_shorter_interval_case(cache, beta)

      # If we don't accept the left endpoint or
      # if the left endpoint is not an integer, increase it.
      xi += 1 if !is_left_endpoint_integer_shorter_interval?(exponent)

      # Try bigger divisor.
      significand = zi // 10

      # If succeed, return.
      if significand * 10 >= xi
        ret_exponent = minus_k + 1
        return {significand, ret_exponent}
      end

      # Otherwise, compute the round-up of y
      significand = compute_round_up_for_shorter_interval_case(cache, beta)
      ret_exponent = minus_k

      # When tie occurs, choose one of them according to the rule.
      if prefer_round_down?(significand) && (ImplInfo::SHORTER_INTERVAL_TIE_LOWER_THRESHOLD <= exponent <= ImplInfo::SHORTER_INTERVAL_TIE_UPPER_THRESHOLD)
        significand -= 1
      elsif significand < xi
        significand += 1
      end

      {significand, ret_exponent}
    end

    def self.compute_mul(u, cache) # : {result: ImplInfo::CarrierUInt, is_integer: Bool}
      {% if F == Float32 %}
        r = WUInt.umul96_upper64(u, cache)
        {
          ImplInfo::CarrierUInt.new!(r >> 32),
          ImplInfo::CarrierUInt.new!(r) == 0,
        }
      {% else %}
        # F == Float64
        r = WUInt.umul192_upper128(u, cache)
        {r.high, r.low == 0}
      {% end %}
    end

    def self.compute_delta(cache, beta) : UInt32
      {% if F == Float32 %}
        (cache >> (ImplInfo::CACHE_BITS - 1 - beta)).to_u32!
      {% else %}
        # F == Float64
        (cache.high >> (ImplInfo::CARRIER_BITS - 1 - beta)).to_u32!
      {% end %}
    end

    def self.compute_mul_parity(two_f, cache, beta) # : {parity: Bool, is_integer: Bool}
      {% if F == Float32 %}
        r = WUInt.umul96_lower64(two_f, cache)
        {
          ((r >> (64 - beta)) & 1) != 0,
          UInt32.new!(r >> (32 - beta)) == 0,
        }
      {% else %}
        # F == Float64
        r = WUInt.umul192_lower128(two_f, cache)
        {
          ((r.high >> (64 - beta)) & 1) != 0,
          (r.high << beta) | (r.low >> (64 - beta)) == 0,
        }
      {% end %}
    end

    def self.compute_left_endpoint_for_shorter_interval_case(cache, beta)
      significand_bits = ImplInfo::SIGNIFICAND_BITS

      ImplInfo::CarrierUInt.new!(
        {% if F == Float32 %}
          (cache - (cache >> (significand_bits + 2))) >> (ImplInfo::CACHE_BITS - significand_bits - 1 - beta)
        {% else %}
          # F == Float64
          (cache.high - (cache.high >> (significand_bits + 2))) >> (ImplInfo::CARRIER_BITS - significand_bits - 1 - beta)
        {% end %}
      )
    end

    def self.compute_right_endpoint_for_shorter_interval_case(cache, beta)
      significand_bits = ImplInfo::SIGNIFICAND_BITS

      ImplInfo::CarrierUInt.new!(
        {% if F == Float32 %}
          (cache + (cache >> (significand_bits + 1))) >> (ImplInfo::CACHE_BITS - significand_bits - 1 - beta)
        {% else %}
          # F == Float64
          (cache.high + (cache.high >> (significand_bits + 1))) >> (ImplInfo::CARRIER_BITS - significand_bits - 1 - beta)
        {% end %}
      )
    end

    def self.compute_round_up_for_shorter_interval_case(cache, beta)
      significand_bits = ImplInfo::SIGNIFICAND_BITS

      {% if F == Float32 %}
        (ImplInfo::CarrierUInt.new!(cache >> (ImplInfo::CACHE_BITS - significand_bits - 2 - beta)) + 1) // 2
      {% else %}
        # F == Float64
        ((cache.high >> (ImplInfo::CARRIER_BITS - significand_bits - 2 - beta)) + 1) // 2
      {% end %}
    end

    def self.is_left_endpoint_integer_shorter_interval?(exponent)
      ImplInfo::CASE_SHORTER_INTERVAL_LEFT_ENDPOINT_LOWER_THRESHOLD <=
        exponent <= ImplInfo::CASE_SHORTER_INTERVAL_LEFT_ENDPOINT_UPPER_THRESHOLD
    end

    def self.to_decimal(signed_significand_bits, exponent_bits)
      two_fc = ImplInfo.remove_sign_bit_and_shift(signed_significand_bits)
      exponent = exponent_bits.to_i

      # Is the input a normal number?
      if exponent != 0
        exponent += ImplInfo::EXPONENT_BIAS - ImplInfo::SIGNIFICAND_BITS

        # Shorter interval case; proceed like Schubfach.
        # One might think this condition is wrong, since when exponent_bits == 1
        # and two_fc == 0, the interval is actually regular. However, it turns out
        # that this seemingly wrong condition is actually fine, because the end
        # result is anyway the same.
        #
        # [binary32]
        # (fc-1/2) * 2^e = 1.175'494'28... * 10^-38
        # (fc-1/4) * 2^e = 1.175'494'31... * 10^-38
        #    fc    * 2^e = 1.175'494'35... * 10^-38
        # (fc+1/2) * 2^e = 1.175'494'42... * 10^-38
        #
        # Hence, shorter_interval_case will return 1.175'494'4 * 10^-38.
        # 1.175'494'3 * 10^-38 is also a correct shortest representation that will
        # be rejected if we assume shorter interval, but 1.175'494'4 * 10^-38 is
        # closer to the true value so it doesn't matter.
        #
        # [binary64]
        # (fc-1/2) * 2^e = 2.225'073'858'507'201'13... * 10^-308
        # (fc-1/4) * 2^e = 2.225'073'858'507'201'25... * 10^-308
        #    fc    * 2^e = 2.225'073'858'507'201'38... * 10^-308
        # (fc+1/2) * 2^e = 2.225'073'858'507'201'63... * 10^-308
        #
        # Hence, shorter_interval_case will return 2.225'073'858'507'201'4 * 10^-308.
        # This is indeed of the shortest length, and it is the unique one
        # closest to the true value among valid representations of the same length.
        return compute_nearest_shorter(exponent) if two_fc == 0

        two_fc |= two_fc.class.new(1) << (ImplInfo::SIGNIFICAND_BITS + 1)
      else # Is the input a subnormal number?
        exponent = ImplInfo::MIN_EXPONENT - ImplInfo::SIGNIFICAND_BITS
      end

      compute_nearest_normal(two_fc, exponent, signed_significand_bits % 2 == 0)
    end
  end

  {% for f, uint in {Float32 => UInt32, Float64 => UInt64} %}
    # Provides a decimal representation of *x*.
    #
    # Returns a `Tuple` of `{significand, decimal_exponent}` such that
    # `x == significand * 10.0 ** decimal_exponent`. This decimal representation
    # is the shortest possible while still maintaining the round-trip guarantee.
    # There may be trailing zeros in `significand`.
    def self.to_decimal(x : {{ f }}) : Tuple({{ uint }}, Int32)
      br = x.unsafe_as({{ uint }})
      exponent_bits = ImplInfo_{{ f }}.extract_exponent_bits(br)
      s = ImplInfo_{{ f }}.remove_exponent_bits(br, exponent_bits)
      Impl({{ f }}, ImplInfo_{{ f }}).to_decimal(s, exponent_bits)
    end
  {% end %}
end

require "./dragonbox_cache"