class String
  # Returns the UTF-16 encoding of this string.
  #
  # Invalid chars (in the range U+D800..U+DFFF) are encoded with the
  # unicode replacement char value `0xfffd`.
  #
  # The `UInt16` following the end of the returned slice (but not included in
  # it) is guaranteed to be zero. This allows passing the result of this
  # method into C functions that expect a null-terminated `UInt16*`.
  #
  # ```
  # "hi 𐂥".to_utf16 # => Slice[104_u16, 105_u16, 32_u16, 55296_u16, 56485_u16]
  # ```
  def to_utf16 : Slice(UInt16)
    if ascii_only?
      # Fast path: size == bytesize, so every byte maps to exactly one UInt16
      # and no decoding is required. One extra element is allocated for the
      # trailing null.
      byte_count = bytesize
      widened = Slice(UInt16).new(byte_count + 1) do |i|
        i == byte_count ? 0_u16 : to_unsafe[i].to_u16
      end

      # The trailing null is not part of the returned slice
      return widened[0, byte_count]
    end

    # size < bytesize, so we need to count how many UInt16 code units are
    # required: chars outside the BMP take two (a surrogate pair).
    u16_size = 0
    each_char do |char|
      u16_size += char.ord < 0x1_0000 ? 1 : 2
    end

    # Allocate one extra code unit for the trailing null
    slice = Slice(UInt16).new(u16_size + 1)

    appender = slice.to_unsafe.appender
    each_char do |char|
      ord = char.ord
      if ord < 0x1_0000
        # One UInt16 is enough
        appender << ord.to_u16!
      else
        # Needs surrogate pair
        ord &-= 0x1_0000
        appender << 0xd800_u16 &+ ((ord >> 10) & 0x3ff) # Keep top 10 bits
        appender << 0xdc00_u16 &+ (ord & 0x3ff)         # Keep low 10 bits
      end
    end

    # Append the null terminator
    appender << 0_u16

    # The trailing null is not part of the returned slice
    slice[0, u16_size]
  end

  # Decodes the given *slice* UTF-16 sequence into a String.
  #
  # Invalid values are encoded using the unicode replacement char with
  # codepoint `0xfffd`.
  #
  # If *truncate_at_null* is true, only the characters up to and not including
  # the first null character are copied.
  #
  # ```
  # slice = Slice[104_u16, 105_u16, 32_u16, 55296_u16, 56485_u16]
  # String.from_utf16(slice) # => "hi 𐂥"
  #
  # slice = UInt16.slice(102, 111, 111, 0, 98, 97, 114)
  # String.from_utf16(slice, truncate_at_null: true) # => "foo"
  # ```
  def self.from_utf16(slice : Slice(UInt16), *, truncate_at_null : Bool = false) : String
    # First pass: measure the UTF-8 byte size and char count so the string
    # buffer can be allocated exactly once.
    bytesize = 0
    size = 0

    each_utf16_char(slice, truncate_at_null: truncate_at_null) do |char|
      bytesize += char.bytesize
      size += 1
    end

    # Second pass: write the UTF-8 bytes of every decoded char.
    String.new(bytesize) do |buffer|
      each_utf16_char(slice, truncate_at_null: truncate_at_null) do |char|
        char.each_byte do |byte|
          buffer.value = byte
          buffer += 1
        end
      end
      {bytesize, size}
    end
  end

  # Decodes the given null-terminated UTF-16 sequence at *pointer* into a
  # String and returns it together with the pointer just past the terminating
  # zero value, so consecutive strings can be read one after another.
  #
  # ```
  # slice = Slice[104_u16, 105_u16, 0_u16, 55296_u16, 56485_u16, 0_u16]
  # String.from_utf16(slice) # => "hi\u0000𐂥\u0000"
  # pointer = slice.to_unsafe
  # string, pointer = String.from_utf16(pointer)
  # string # => "hi"
  # string, pointer = String.from_utf16(pointer)
  # string # => "𐂥"
  # ```
  #
  # Invalid values are encoded using the unicode replacement char with
  # codepoint `0xfffd`.
  def self.from_utf16(pointer : Pointer(UInt16)) : {String, Pointer(UInt16)}
    # First pass: measure the UTF-8 byte size and char count.
    bytesize = 0
    size = 0

    each_utf16_char(pointer) do |char|
      bytesize += char.bytesize
      size += 1
    end

    # Second pass: write the bytes; the iterator returns the pointer to the
    # terminating null, which is captured for the return value.
    string = String.new(bytesize) do |buffer|
      pointer = each_utf16_char(pointer) do |char|
        char.each_byte do |byte|
          buffer.value = byte
          buffer += 1
        end
      end
      {bytesize, size}
    end

    # Skip over the null terminator
    {string, pointer + 1}
  end

  # :nodoc:
  #
  # Yields each decoded char in the given slice.
  #
  # Unpaired surrogates are yielded as the replacement char `0xfffd`. If
  # *truncate_at_null* is true, iteration stops at the first zero code unit.
  def self.each_utf16_char(slice : Slice(UInt16), *, truncate_at_null : Bool = false, &)
    i = 0
    while i < slice.size
      unit = slice[i].to_i
      break if truncate_at_null && unit == 0

      if unit < 0xd800 || unit >= 0xe000
        # Not a surrogate: a single code unit is the codepoint
        codepoint = unit
      elsif unit < 0xdc00 &&
            (i + 1) < slice.size &&
            0xdc00 <= slice[i + 1] <= 0xdfff
        # Surrogate pair. Equivalent to
        #   ((high - 0xd800) << 10) + (low - 0xdc00) + 0x10000
        # with the constants folded:
        #   (0xd800 << 10) + 0xdc00 - 0x10000 == 0x35fdc00
        codepoint = (unit << 10) &+ slice[i + 1] &- 0x35fdc00
        i += 1
      else
        # Unpaired surrogate
        codepoint = 0xfffd
      end

      yield codepoint.unsafe_chr

      i += 1
    end
  end

  # Yields each decoded char in the given pointer, stopping at the first zero
  # code unit. Returns the pointer to that terminating zero.
  private def self.each_utf16_char(pointer : Pointer(UInt16), &) : Pointer(UInt16)
    loop do
      unit = pointer.value.to_i
      break if unit == 0

      if unit < 0xd800 || unit >= 0xe000
        # Not a surrogate: a single code unit is the codepoint
        codepoint = unit
      elsif unit < 0xdc00 &&
            0xdc00 <= (pointer + 1).value <= 0xdfff
        # Surrogate pair (see the slice overload for the constant derivation).
        # Reading `pointer + 1` is safe: the current unit is non-zero, so the
        # next element is at worst the null terminator.
        pointer = pointer + 1
        codepoint = (unit << 10) &+ pointer.value &- 0x35fdc00
      else
        # Unpaired surrogate
        codepoint = 0xfffd
      end

      yield codepoint.unsafe_chr

      pointer = pointer + 1
    end

    pointer
  end
end