require "./float_common" module Float::FastFloat # the limb width: we want efficient multiplication of double the bits in # limb, or for 64-bit limbs, at least 64-bit multiplication where we can # extract the high and low parts efficiently. this is every 64-bit # architecture except for sparc, which emulates 128-bit multiplication. # we might have platforms where `CHAR_BIT` is not 8, so let's avoid # doing `8 * sizeof(limb)`. {% if flag?(:bits64) %} alias Limb = UInt64 LIMB_BITS = 64 {% else %} alias Limb = UInt32 LIMB_BITS = 32 {% end %} alias LimbSpan = Slice(Limb) # number of bits in a bigint. this needs to be at least the number # of bits required to store the largest bigint, which is # `log2(10**(digits + max_exp))`, or `log2(10**(767 + 342))`, or # ~3600 bits, so we round to 4000. BIGINT_BITS = 4000 {% begin %} BIGINT_LIMBS = {{ BIGINT_BITS // LIMB_BITS }} {% end %} # vector-like type that is allocated on the stack. the entire # buffer is pre-allocated, and only the length changes. # NOTE(crystal): Deviates a lot from the original implementation to reuse # `Indexable` as much as possible. Contrast with `Crystal::SmallDeque` and # `Crystal::Tracing::BufferIO` struct Stackvec(Size) include Indexable::Mutable(Limb) @data = uninitialized Limb[Size] # we never need more than 150 limbs @length = 0_u16 def unsafe_fetch(index : Int) : Limb @data.to_unsafe[index] end def unsafe_put(index : Int, value : Limb) : Limb @data.to_unsafe[index] = value end def size : Int32 @length.to_i32! end def to_unsafe : Limb* @data.to_unsafe end def to_slice : LimbSpan LimbSpan.new(@data.to_unsafe, @length) end def initialize end # create stack vector from existing limb span. def initialize(s : LimbSpan) try_extend(s) end # index from the end of the container def rindex(index : Int) : Limb rindex = @length &- index &- 1 @data.to_unsafe[rindex] end # set the length, without bounds checking. def size=(@length : UInt16) : UInt16 length end def capacity : Int32 Size.to_i32! end # append item to vector, without bounds checking. def push_unchecked(value : Limb) : Nil @data.to_unsafe[@length] = value @length &+= 1 end # append item to vector, returning if item was added def try_push(value : Limb) : Bool if size < capacity push_unchecked(value) true else false end end # add items to the vector, from a span, without bounds checking def extend_unchecked(s : LimbSpan) : Nil ptr = @data.to_unsafe + @length s.to_unsafe.copy_to(ptr, s.size) @length &+= s.size end # try to add items to the vector, returning if items were added def try_extend(s : LimbSpan) : Bool if size &+ s.size <= capacity extend_unchecked(s) true else false end end # resize the vector, without bounds checking # if the new size is longer than the vector, assign value to each # appended item. def resize_unchecked(new_len : UInt16, value : Limb) : Nil if new_len > @length count = new_len &- @length first = @data.to_unsafe + @length count.times { |i| first[i] = value } @length = new_len else @length = new_len end end # try to resize the vector, returning if the vector was resized. def try_resize(new_len : UInt16, value : Limb) : Bool if new_len > capacity false else resize_unchecked(new_len, value) true end end # check if any limbs are non-zero after the given index. # this needs to be done in reverse order, since the index # is relative to the most significant limbs. def nonzero?(index : Int) : Bool while index < size if rindex(index) != 0 return true end index &+= 1 end false end # normalize the big integer, so most-significant zero limbs are removed. def normalize : Nil while @length > 0 && rindex(0) == 0 @length &-= 1 end end end # NOTE(crystal): returns also *truncated* by value (ditto below) def self.empty_hi64 : {UInt64, Bool} truncated = false {0_u64, truncated} end def self.uint64_hi64(r0 : UInt64) : {UInt64, Bool} truncated = false shl = r0.leading_zeros_count {r0.unsafe_shl(shl), truncated} end def self.uint64_hi64(r0 : UInt64, r1 : UInt64) : {UInt64, Bool} shl = r0.leading_zeros_count if shl == 0 truncated = r1 != 0 {r0, truncated} else shr = 64 &- shl truncated = r1.unsafe_shl(shl) != 0 {r0.unsafe_shl(shl) | r1.unsafe_shr(shr), truncated} end end def self.uint32_hi64(r0 : UInt32) : {UInt64, Bool} uint64_hi64(r0.to_u64!) end def self.uint32_hi64(r0 : UInt32, r1 : UInt32) : {UInt64, Bool} x0 = r0.to_u64! x1 = r1.to_u64! uint64_hi64(x0.unsafe_shl(32) | x1) end def self.uint32_hi64(r0 : UInt32, r1 : UInt32, r2 : UInt32) : {UInt64, Bool} x0 = r0.to_u64! x1 = r1.to_u64! x2 = r2.to_u64! uint64_hi64(x0, x1.unsafe_shl(32) | x2) end # add two small integers, checking for overflow. # we want an efficient operation. # NOTE(crystal): returns also *overflow* by value def self.scalar_add(x : Limb, y : Limb) : {Limb, Bool} z = x &+ y overflow = z < x {z, overflow} end # multiply two small integers, getting both the high and low bits. # NOTE(crystal): passes *carry* in and out by value def self.scalar_mul(x : Limb, y : Limb, carry : Limb) : {Limb, Limb} {% if Limb == UInt64 %} z = x.to_u128! &* y.to_u128! &+ carry carry = z.unsafe_shr(LIMB_BITS).to_u64! {z.to_u64!, carry} {% else %} z = x.to_u64! &* y.to_u64! &+ carry carry = z.unsafe_shr(LIMB_BITS).to_u32! {z.to_u32!, carry} {% end %} end # add scalar value to bigint starting from offset. # used in grade school multiplication def self.small_add_from(vec : Stackvec(Size)*, y : Limb, start : Int) : Bool forall Size index = start carry = y while carry != 0 && index < vec.value.size x, overflow = scalar_add(vec.value.unsafe_fetch(index), carry) vec.value.unsafe_put(index, x) carry = Limb.new!(overflow ? 1 : 0) index &+= 1 end if carry != 0 fastfloat_try vec.value.try_push(carry) end true end # add scalar value to bigint. def self.small_add(vec : Stackvec(Size)*, y : Limb) : Bool forall Size small_add_from(vec, y, 0) end # multiply bigint by scalar value. def self.small_mul(vec : Stackvec(Size)*, y : Limb) : Bool forall Size carry = Limb.zero i = 0 while i < vec.value.size xi = vec.value.unsafe_fetch(i) z, carry = scalar_mul(xi, y, carry) vec.value.unsafe_put(i, z) i &+= 1 end if carry != 0 fastfloat_try vec.value.try_push(carry) end true end # add bigint to bigint starting from index. # used in grade school multiplication def self.large_add_from(x : Stackvec(Size)*, y : LimbSpan, start : Int) : Bool forall Size # the effective x buffer is from `xstart..x.len()`, so exit early # if we can't get that current range. if x.value.size < start || y.size > x.value.size &- start fastfloat_try x.value.try_resize((y.size &+ start).to_u16!, 0) end carry = false index = 0 while index < y.size xi = x.value.unsafe_fetch(index &+ start) yi = y.unsafe_fetch(index) c2 = false xi, c1 = scalar_add(xi, yi) if carry xi, c2 = scalar_add(xi, 1) end x.value.unsafe_put(index &+ start, xi) carry = c1 || c2 index &+= 1 end # handle overflow if carry fastfloat_try small_add_from(x, 1, y.size &+ start) end true end # add bigint to bigint. def self.large_add_from(x : Stackvec(Size)*, y : LimbSpan) : Bool forall Size large_add_from(x, y, 0) end # grade-school multiplication algorithm def self.long_mul(x : Stackvec(Size)*, y : LimbSpan) : Bool forall Size xs = x.value.to_slice z = Stackvec(Size).new(xs) zs = z.to_slice if y.size != 0 y0 = y.unsafe_fetch(0) fastfloat_try small_mul(x, y0) (1...y.size).each do |index| yi = y.unsafe_fetch(index) zi = Stackvec(Size).new if yi != 0 # re-use the same buffer throughout zi.size = 0 fastfloat_try zi.try_extend(zs) fastfloat_try small_mul(pointerof(zi), yi) zis = zi.to_slice fastfloat_try large_add_from(x, zis, index) end end end x.value.normalize true end # grade-school multiplication algorithm def self.large_mul(x : Stackvec(Size)*, y : LimbSpan) : Bool forall Size if y.size == 1 fastfloat_try small_mul(x, y.unsafe_fetch(0)) else fastfloat_try long_mul(x, y) end true end module Pow5Tables LARGE_STEP = 135_u32 {% if compare_versions(Crystal::VERSION, "1.16.0") < 0 %} SMALL_POWER_OF_5 = [ 1_u64, 5_u64, 25_u64, 125_u64, 625_u64, 3125_u64, 15625_u64, 78125_u64, 390625_u64, 1953125_u64, 9765625_u64, 48828125_u64, 244140625_u64, 1220703125_u64, 6103515625_u64, 30517578125_u64, 152587890625_u64, 762939453125_u64, 3814697265625_u64, 19073486328125_u64, 95367431640625_u64, 476837158203125_u64, 2384185791015625_u64, 11920928955078125_u64, 59604644775390625_u64, 298023223876953125_u64, 1490116119384765625_u64, 7450580596923828125_u64, ] {% if Limb == UInt64 %} LARGE_POWER_OF_5 = Slice[ 1414648277510068013_u64, 9180637584431281687_u64, 4539964771860779200_u64, 10482974169319127550_u64, 198276706040285095_u64, ] {% else %} LARGE_POWER_OF_5 = Slice[ 4279965485_u32, 329373468_u32, 4020270615_u32, 2137533757_u32, 4287402176_u32, 1057042919_u32, 1071430142_u32, 2440757623_u32, 381945767_u32, 46164893_u32, ] {% end %} {% else %} SMALL_POWER_OF_5 = Slice(UInt64).literal( 1_u64, 5_u64, 25_u64, 125_u64, 625_u64, 3125_u64, 15625_u64, 78125_u64, 390625_u64, 1953125_u64, 9765625_u64, 48828125_u64, 244140625_u64, 1220703125_u64, 6103515625_u64, 30517578125_u64, 152587890625_u64, 762939453125_u64, 3814697265625_u64, 19073486328125_u64, 95367431640625_u64, 476837158203125_u64, 2384185791015625_u64, 11920928955078125_u64, 59604644775390625_u64, 298023223876953125_u64, 1490116119384765625_u64, 7450580596923828125_u64, ) {% if Limb == UInt64 %} LARGE_POWER_OF_5 = Slice(UInt64).literal( 1414648277510068013_u64, 9180637584431281687_u64, 4539964771860779200_u64, 10482974169319127550_u64, 198276706040285095_u64, ) {% else %} LARGE_POWER_OF_5 = Slice(UInt32).literal( 4279965485_u32, 329373468_u32, 4020270615_u32, 2137533757_u32, 4287402176_u32, 1057042919_u32, 1071430142_u32, 2440757623_u32, 381945767_u32, 46164893_u32, ) {% end %} {% end %} end # big integer type. implements a small subset of big integer # arithmetic, using simple algorithms since asymptotically # faster algorithms are slower for a small number of limbs. # all operations assume the big-integer is normalized. # NOTE(crystal): contrast with ::BigInt struct Bigint # storage of the limbs, in little-endian order. @vec = Stackvec(BIGINT_LIMBS).new def initialize end def initialize(value : UInt64) {% if Limb == UInt64 %} @vec.push_unchecked(value) {% else %} @vec.push_unchecked(value.to_u32!) @vec.push_unchecked(value.unsafe_shr(32).to_u32!) {% end %} @vec.normalize end # get the high 64 bits from the vector, and if bits were truncated. # this is to get the significant digits for the float. # NOTE(crystal): returns also *truncated* by value def hi64 : {UInt64, Bool} {% if Limb == UInt64 %} if @vec.empty? FastFloat.empty_hi64 elsif @vec.size == 1 FastFloat.uint64_hi64(@vec.rindex(0)) else result, truncated = FastFloat.uint64_hi64(@vec.rindex(0), @vec.rindex(1)) truncated ||= @vec.nonzero?(2) {result, truncated} end {% else %} if @vec.empty? FastFloat.empty_hi64 elsif @vec.size == 1 FastFloat.uint32_hi64(@vec.rindex(0)) elsif @vec.size == 2 FastFloat.uint32_hi64(@vec.rindex(0), @vec.rindex(1)) else result, truncated = FastFloat.uint32_hi64(@vec.rindex(0), @vec.rindex(1), @vec.rindex(2)) truncated ||= @vec.nonzero?(3) {result, truncated} end {% end %} end # compare two big integers, returning the large value. # assumes both are normalized. if the return value is # negative, other is larger, if the return value is # positive, this is larger, otherwise they are equal. # the limbs are stored in little-endian order, so we # must compare the limbs in ever order. def compare(other : Bigint*) : Int32 if @vec.size > other.value.@vec.size 1 elsif @vec.size < other.value.@vec.size -1 else index = @vec.size while index > 0 xi = @vec.unsafe_fetch(index &- 1) yi = other.value.@vec.unsafe_fetch(index &- 1) if xi > yi return 1 elsif xi < yi return -1 end index &-= 1 end 0 end end # shift left each limb n bits, carrying over to the new limb # returns true if we were able to shift all the digits. def shl_bits(n : Int) : Bool # Internally, for each item, we shift left by n, and add the previous # right shifted limb-bits. # For example, we transform (for u8) shifted left 2, to: # b10100100 b01000010 # b10 b10010001 b00001000 shl = n shr = LIMB_BITS &- n prev = Limb.zero index = 0 while index < @vec.size xi = @vec.unsafe_fetch(index) @vec.unsafe_put(index, xi.unsafe_shl(shl) | prev.unsafe_shr(shr)) prev = xi index &+= 1 end carry = prev.unsafe_shr(shr) if carry != 0 return @vec.try_push(carry) end true end # move the limbs left by `n` limbs. def shl_limbs(n : Int) : Bool if n &+ @vec.size > @vec.capacity false elsif !@vec.empty? # move limbs dst = @vec.to_unsafe + n src = @vec.to_unsafe src.move_to(dst, @vec.size) # fill in empty limbs first = @vec.to_unsafe n.times { |i| first[i] = 0 } @vec.size = (@vec.size &+ n).to_u16! true else true end end # move the limbs left by `n` bits. def shl(n : Int) : Bool rem = n.unsafe_mod(LIMB_BITS) div = n.unsafe_div(LIMB_BITS) if rem != 0 FastFloat.fastfloat_try shl_bits(rem) end if div != 0 FastFloat.fastfloat_try shl_limbs(div) end true end # get the number of leading zeros in the bigint. def ctlz : Int32 if @vec.empty? 0 else @vec.rindex(0).leading_zeros_count.to_i32! end end # get the number of bits in the bigint. def bit_length : Int32 lz = ctlz (LIMB_BITS &* @vec.size &- lz).to_i32! end def mul(y : Limb) : Bool FastFloat.small_mul(pointerof(@vec), y) end def add(y : Limb) : Bool FastFloat.small_add(pointerof(@vec), y) end # multiply as if by 2 raised to a power. def pow2(exp : UInt32) : Bool shl(exp) end # multiply as if by 5 raised to a power. def pow5(exp : UInt32) : Bool # multiply by a power of 5 large = Pow5Tables::LARGE_POWER_OF_5 while exp >= Pow5Tables::LARGE_STEP FastFloat.fastfloat_try FastFloat.large_mul(pointerof(@vec), large) exp &-= Pow5Tables::LARGE_STEP end small_step = {{ Limb == UInt64 ? 27_u32 : 13_u32 }} max_native = {{ Limb == UInt64 ? 7450580596923828125_u64 : 1220703125_u32 }} while exp >= small_step FastFloat.fastfloat_try FastFloat.small_mul(pointerof(@vec), max_native) exp &-= small_step end if exp != 0 FastFloat.fastfloat_try FastFloat.small_mul(pointerof(@vec), Limb.new!(Pow5Tables::SMALL_POWER_OF_5.unsafe_fetch(exp))) end true end # multiply as if by 10 raised to a power. def pow10(exp : UInt32) : Bool FastFloat.fastfloat_try pow5(exp) pow2(exp) end end end