# Well-formed UTF-8 byte sequences, keyed by the raw bytes (Array of Integer)
# and mapped to the escape notation of the code point those bytes encode.
# Entries sit at the edges of each lead-byte / continuation-byte range.
#
# NOTE(review): the values are single-quoted, so Ruby keeps the escape text
# literally (e.g. the 8 characters \u{0000}) instead of the decoded character.
# Presumably the consumer evaluates or prints that text -- confirm before
# switching to double quotes.
VALID_UTF8_BYTE_SEQUENCES = {
  # 1-byte sequences (U+0000..U+007F)
  [0x00] => '\u{0000}',
  [0x7F] => '\u{007F}',

  # 2-byte sequences (U+0080..U+07FF)
  [0xC2, 0x80] => '\u{0080}',
  [0xC2, 0xBF] => '\u{00BF}',
  [0xDF, 0x80] => '\u{07C0}',
  [0xDF, 0xBF] => '\u{07FF}',

  # 3-byte sequences, lead byte 0xE0 (second byte restricted to 0xA0..0xBF)
  [0xE0, 0xA0, 0x80] => '\u{0800}',
  [0xE0, 0xA0, 0xBF] => '\u{083F}',
  [0xE0, 0xBF, 0x80] => '\u{0FC0}',
  [0xE0, 0xBF, 0xBF] => '\u{0FFF}',

  # 3-byte sequences, lead byte 0xE1
  [0xE1, 0x80, 0x80] => '\u{1000}',
  [0xE1, 0x80, 0xBF] => '\u{103F}',
  [0xE1, 0x9F, 0x80] => '\u{17C0}',
  [0xE1, 0x9F, 0xBF] => '\u{17FF}',
  [0xE1, 0xA0, 0x80] => '\u{1800}',
  [0xE1, 0xA0, 0xBF] => '\u{183F}',
  [0xE1, 0xBF, 0x80] => '\u{1FC0}',
  [0xE1, 0xBF, 0xBF] => '\u{1FFF}',

  # 3-byte sequences, lead byte 0xED (second byte restricted to 0x80..0x9F,
  # i.e. everything below the surrogate range)
  [0xED, 0x80, 0x80] => '\u{D000}',
  [0xED, 0x80, 0xBF] => '\u{D03F}',
  [0xED, 0x9F, 0x80] => '\u{D7C0}',
  [0xED, 0x9F, 0xBF] => '\u{D7FF}',

  # 3-byte sequences, lead byte 0xEF
  [0xEF, 0x80, 0x80] => '\u{F000}',
  [0xEF, 0x80, 0xBF] => '\u{F03F}',
  [0xEF, 0x9F, 0x80] => '\u{F7C0}',
  [0xEF, 0x9F, 0xBF] => '\u{F7FF}',
  [0xEF, 0xA0, 0x80] => '\u{F800}',
  [0xEF, 0xA0, 0xBF] => '\u{F83F}',
  [0xEF, 0xBF, 0x80] => '\u{FFC0}',
  [0xEF, 0xBF, 0xBF] => '\u{FFFF}',

  # 4-byte sequences, lead byte 0xF0 (second byte restricted to 0x90..0xBF)
  [0xF0, 0x90, 0x80, 0x80] => '\u{10000}',
  [0xF0, 0x90, 0x80, 0xBF] => '\u{1003F}',
  [0xF0, 0x90, 0xBF, 0x80] => '\u{10FC0}',
  [0xF0, 0x90, 0xBF, 0xBF] => '\u{10FFF}',
  [0xF0, 0xBF, 0x80, 0x80] => '\u{3F000}',
  [0xF0, 0xBF, 0x80, 0xBF] => '\u{3F03F}',
  [0xF0, 0xBF, 0xBF, 0x80] => '\u{3FFC0}',
  [0xF0, 0xBF, 0xBF, 0xBF] => '\u{3FFFF}',

  # 4-byte sequences, lead byte 0xF1
  [0xF1, 0x80, 0x80, 0x80] => '\u{40000}',
  [0xF1, 0x80, 0x80, 0xBF] => '\u{4003F}',
  [0xF1, 0x80, 0xBF, 0x80] => '\u{40FC0}',
  [0xF1, 0x80, 0xBF, 0xBF] => '\u{40FFF}',
  [0xF1, 0x8F, 0x80, 0x80] => '\u{4F000}',
  [0xF1, 0x8F, 0x80, 0xBF] => '\u{4F03F}',
  [0xF1, 0x8F, 0xBF, 0x80] => '\u{4FFC0}',
  [0xF1, 0x8F, 0xBF, 0xBF] => '\u{4FFFF}',

  # 4-byte sequences, lead byte 0xF3
  [0xF3, 0x90, 0x80, 0x80] => '\u{D0000}',
  [0xF3, 0xBF, 0xBF, 0xBF] => '\u{FFFFF}',

  # 4-byte sequences, lead byte 0xF4 (second byte restricted to 0x80..0x8F,
  # which caps the range at U+10FFFF)
  [0xF4, 0x80, 0x80, 0x80] => '\u{100000}',
  [0xF4, 0x80, 0x80, 0xBF] => '\u{10003F}',
  [0xF4, 0x80, 0xBF, 0x80] => '\u{100FC0}',
  [0xF4, 0x80, 0xBF, 0xBF] => '\u{100FFF}',
  [0xF4, 0x8F, 0x80, 0x80] => '\u{10F000}',
  [0xF4, 0x8F, 0xBF, 0xBF] => '\u{10FFFF}',
}
# Ill-formed UTF-8 byte sequences (each an Array of Integer byte values),
# grouped by the reason they are rejected.
#
# Each category label is a full-line comment: a `#` placed on the same line as
# the elements would comment out everything after it, including the closing
# bracket of the array literal.
INVALID_UTF8_BYTE_SEQUENCES = [
  # non-starters: continuation bytes (0x80..0xBF) and 0xFE/0xFF cannot begin
  # a sequence
  [0x80],
  [0x8F],
  [0x90],
  [0x9F],
  [0xA0],
  [0xAF],
  [0xB0],
  [0xBF],
  [0xFE],
  [0xFF],

  # incomplete, 2-byte: lead byte with a missing or non-continuation follower
  [0xC2],
  [0xC2, 0x00],
  [0xC2, 0xC2],

  # overlong, 2-byte: 0xC0/0xC1 would encode code points below U+0080
  [0xC0, 0x80],
  [0xC1, 0xBF],

  # incomplete, 3-byte
  [0xE1],
  [0xE1, 0x00],
  [0xE1, 0xC2],
  [0xE1, 0x80],
  [0xE1, 0x80, 0x00],
  [0xE1, 0x80, 0xC2],

  # overlong, 3-byte: 0xE0 followed by 0x80..0x9F would encode below U+0800
  [0xE0, 0x80, 0x80],
  [0xE0, 0x9F, 0xBF],

  # surrogates: the encoded forms of U+D800 and U+DFFF
  [0xED, 0xA0, 0x80],
  [0xED, 0xBF, 0xBF],

  # incomplete, 4-byte
  [0xF1],
  [0xF1, 0x00],
  [0xF1, 0xC2],
  [0xF1, 0x80],
  [0xF1, 0x80, 0x00],
  [0xF1, 0x80, 0xC2],
  [0xF1, 0x80, 0x80],
  [0xF1, 0x80, 0x80, 0x00],
  [0xF1, 0x80, 0x80, 0xC2],

  # overlong, 4-byte: 0xF0 followed by 0x80..0x8F would encode below U+10000
  [0xF0, 0x80, 0x80, 0x80],
  [0xF0, 0x8F, 0xBF, 0xBF],

  # upper boundary, 4-byte: anything past U+10FFFF
  [0xF4, 0x90, 0x80, 0x80],
  [0xF5],

  # 5-byte (obsolete)
  [0xF8],
  [0xF8, 0x80, 0x80, 0x80, 0x80],

  # 6-byte (obsolete)
  [0xFC],
  [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80],
]