| // Copyright (c) 2020, the Dart project authors. Please see the AUTHORS file |
| // for details. All rights reserved. Use of this source code is governed by a |
| // BSD-style license that can be found in the LICENSE file. |
| |
| import 'dart:convert'; |
| import 'dart:typed_data'; |
| |
/// Builds an ISO-8859 style code page named [name].
///
/// Bytes 0x00–0x7F come from [_ascii], bytes 0x80–0x9F are left undefined
/// (see [_noControls]), and bytes 0xA0–0xFF are taken from [top], which
/// therefore has to supply the remaining 96 BMP characters.
CodePage _iso8859(String name, String top) =>
    CodePage._bmp(name, '$_ascii$_noControls$top');

/// The ISO-8859-2/Latin-2 (Eastern European) code page.
final CodePage latin2 = _iso8859('latin-2', _top8859_2);

/// The ISO-8859-3/Latin-3 (South European) code page.
final CodePage latin3 = _iso8859('latin-3', _top8859_3);

/// The ISO-8859-4/Latin-4 (North European) code page.
final CodePage latin4 = _iso8859('latin-4', _top8859_4);

/// The ISO-8859-5/Latin-Cyrillic code page.
final CodePage latinCyrillic = _iso8859('cyrillic', _top8859_5);

/// The ISO-8859-6/Latin-Arabic code page.
final CodePage latinArabic = _iso8859('arabic', _top8859_6);

/// The ISO-8859-7/Latin-Greek code page.
final CodePage latinGreek = _iso8859('greek', _top8859_7);

/// The ISO-8859-8/Latin-Hebrew code page.
final CodePage latinHebrew = _iso8859('hebrew', _top8859_8);

/// The ISO-8859-9/Latin-5 (Turkish) code page.
final CodePage latin5 = _iso8859('latin-5', _top8859_9);

/// The ISO-8859-10/Latin-6 (Nordic) code page.
final CodePage latin6 = _iso8859('latin-6', _top8859_10);

/// The ISO-8859-11/Latin-Thai code page.
final CodePage latinThai = _iso8859('tis620', _top8859_11);

/// The ISO-8859-13/Latin-7 (Baltic Rim) code page.
final CodePage latin7 = _iso8859('latin-7', _top8859_13);

/// The ISO-8859-14/Latin-8 (Celtic) code page.
final CodePage latin8 = _iso8859('latin-8', _top8859_14);

/// The ISO-8859-15/Latin-9 (Western European revised) code page.
final CodePage latin9 = _iso8859('latin-9', _top8859_15);

/// The ISO-8859-16/Latin-10 (South Eastern European) code page.
final CodePage latin10 = _iso8859('latin-10', _top8859_16);
| |
/// Characters in ISO-8859-2 above the ASCII and top control characters.
///
/// [latin2] appends this after [_ascii] and [_noControls], so the entries
/// become bytes 0xA0 through 0xFF, 32 per source line.
const _top8859_2 = '\xa0Ą˘Ł¤ĽŚ§¨ŠŞŤŹ\xadŽŻ°ą˛ł´ľśˇ¸šşťź˝žż'
    'ŔÁÂĂÄĹĆÇČÉĘËĚÍÎĎĐŃŇÓÔŐÖ×ŘŮÚŰÜÝŢß'
    'ŕáâăäĺćçčéęëěíîďđńňóôőö÷řůúűüýţ˙';
| |
/// Characters in ISO-8859-3 above the ASCII and top control characters.
///
/// [latin3] appends this after [_ascii] and [_noControls], so the entries
/// become bytes 0xA0 through 0xFF, 32 per source line.
///
/// `\uFFFD` marks the bytes ISO-8859-3 leaves unassigned: 0xA5, 0xAE, 0xBE,
/// 0xC3, 0xD0, 0xE3 and 0xF0. In particular the currency sign `¤` sits at
/// 0xA4 and the unassigned slot follows it at 0xA5.
const _top8859_3 = '\xa0Ħ˘£¤\uFFFDĤ§¨İŞĞĴ\xad\uFFFDŻ°ħ²³´µĥ·¸ışğĵ½\uFFFDż'
    'ÀÁÂ\uFFFDÄĊĈÇÈÉÊËÌÍÎÏ\uFFFDÑÒÓÔĠÖ×ĜÙÚÛÜŬŜß'
    'àáâ\uFFFDäċĉçèéêëìíîï\uFFFDñòóôġö÷ĝùúûüŭŝ˙';
| |
/// Characters in ISO-8859-4 above the ASCII and top control characters.
///
/// [latin4] appends this after [_ascii] and [_noControls], so the entries
/// become bytes 0xA0 through 0xFF, 32 per source line.
const _top8859_4 = '\xa0ĄĸŖ¤ĨĻ§¨ŠĒĢŦ\xadŽ¯°ą˛ŗ´ĩļˇ¸šēģŧŊžŋ'
    'ĀÁÂÃÄÅÆĮČÉĘËĖÍÎĪĐŅŌĶÔÕÖ×ØŲÚÛÜŨŪß'
    'āáâãäåæįčéęëėíîīđņōķôõö÷øųúûüũū˙';
| |
/// Characters in ISO-8859-5 above the ASCII and top control characters.
///
/// [latinCyrillic] appends this after [_ascii] and [_noControls], so the
/// entries become bytes 0xA0 through 0xFF, 32 per source line.
const _top8859_5 = '\xa0ЁЂЃЄЅІЇЈЉЊЋЌ\xadЎЏАБВГДЕЖЗИЙКЛМНОП'
    'РСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмноп'
    'рстуфхцчшщъыьэюя№ёђѓєѕіїјљњћќ§ўџ';
| |
/// Characters in ISO-8859-6 above the ASCII and top control characters.
///
/// [latinArabic] appends this after [_ascii] and [_noControls], so the
/// entries become bytes 0xA0 through 0xFF. Each source line holds eight
/// entries; `\uFFFD` marks a byte that has no character in this code page.
const _top8859_6 = '\xa0\uFFFD\uFFFD\uFFFD¤\uFFFD\uFFFD\uFFFD'
    '\uFFFD\uFFFD\uFFFD\uFFFD\u060c\xad\uFFFD\uFFFD'
    '\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD'
    '\uFFFD\uFFFD\uFFFD\u061b\uFFFD\uFFFD\uFFFD\u061f'
    '\uFFFD\u0621\u0622\u0623\u0624\u0625\u0626\u0627'
    '\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f'
    '\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637'
    '\u0638\u0639\u063a\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD'
    '\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647'
    '\u0648\u0649\u064a\u064b\u064c\u064d\u064e\u064f'
    '\u0650\u0651\u0652\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD'
    '\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD';
| |
/// Characters in ISO-8859-7 above the ASCII and top control characters.
///
/// [latinGreek] appends this after [_ascii] and [_noControls], so the
/// entries become bytes 0xA0 through 0xFF, 32 per source line.
/// `\uFFFD` marks a byte that has no character in this code page.
const _top8859_7 = '\xa0‘’£€₯¦§¨©ͺ«¬\xad\uFFFD―°±²³΄΅Ά·ΈΉΊ»Ό½ΎΏ'
    'ΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡ\uFFFDΣΤΥΦΧΨΩΪΫάέήί'
    'ΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώ\uFFFD';
| |
/// Characters in ISO-8859-8 above the ASCII and top control characters.
///
/// [latinHebrew] appends this after [_ascii] and [_noControls], so the
/// entries become bytes 0xA0 through 0xFF.
/// `\uFFFD` marks a byte that has no character in this code page.
const _top8859_8 = '\xa0\uFFFD¢£¤¥¦§¨©×«¬\xad®¯°±²³´µ¶·¸¹÷»¼½¾\uFFFD'
    '\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD'
    '\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD'
    '\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD'
    '\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD‗'
    '\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7'
    '\u05d8\u05d9\u05da\u05db\u05dc\u05dd\u05de\u05df'
    '\u05e0\u05e1\u05e2\u05e3\u05e4\u05e5\u05e6\u05e7'
    '\u05e8\u05e9\u05ea\uFFFD\uFFFD\u200e\u200f\uFFFD';
| |
/// Characters in ISO-8859-9 above the ASCII and top control characters.
///
/// [latin5] appends this after [_ascii] and [_noControls], so the entries
/// become bytes 0xA0 through 0xFF, 32 per source line.
const _top8859_9 = '\xa0¡¢£¤¥¦§¨©ª«¬\xad®¯°±²³´µ¶·¸¹º»¼½¾¿'
    'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏĞÑÒÓÔÕÖ×ØÙÚÛÜİŞß'
    'àáâãäåæçèéêëìíîïğñòóôõö÷øùúûüışÿ';
| |
/// Characters in ISO-8859-10 above the ASCII and top control characters.
///
/// [latin6] appends this after [_ascii] and [_noControls], so the entries
/// become bytes 0xA0 through 0xFF, 32 per source line.
const _top8859_10 = '\xa0ĄĒĢĪĨĶ§ĻĐŠŦŽ\xadŪŊ°ąēģīĩķ·ļđšŧž―ūŋ'
    'ĀÁÂÃÄÅÆĮČÉĘËĖÍÎÏÐŅŌÓÔÕÖŨØŲÚÛÜÝÞß'
    'āáâãäåæįčéęëėíîïðņōóôõöũøųúûüýþĸ';
| |
/// Characters in ISO-8859-11 above the ASCII and top control characters.
///
/// [latinThai] appends this after [_ascii] and [_noControls], so the
/// entries become the bytes from 0xA0 upwards.
/// `\uFFFD` marks a byte that has no character in this code page.
const _top8859_11 = '\xa0กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟ'
    'ภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู\uFFFD\uFFFD\uFFFD\uFFFD฿'
    'เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛\uFFFD\uFFFD\uFFFD\uFFFD';
| |
/// Characters in ISO-8859-13 above the ASCII and top control characters.
///
/// [latin7] appends this after [_ascii] and [_noControls], so the entries
/// become bytes 0xA0 through 0xFF, 32 per source line.
const _top8859_13 = '\xa0”¢£¤„¦§Ø©Ŗ«¬\xad®Æ°±²³“µ¶·ø¹ŗ»¼½¾æ'
    'ĄĮĀĆÄÅĘĒČÉŹĖĢĶĪĻŠŃŅÓŌÕÖ×ŲŁŚŪÜŻŽß'
    'ąįāćäåęēčéźėģķīļšńņóōõö÷ųłśūüżž’';
| |
/// Characters in ISO-8859-14 above the ASCII and top control characters.
///
/// [latin8] appends this after [_ascii] and [_noControls], so the entries
/// become bytes 0xA0 through 0xFF, 32 per source line.
const _top8859_14 = '\xa0Ḃḃ£ĊċḊ§Ẁ©ẂḋỲ\xad®ŸḞḟĠġṀṁ¶ṖẁṗẃṠỳẄẅṡ'
    'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏŴÑÒÓÔÕÖṪØÙÚÛÜÝŶß'
    'àáâãäåæçèéêëìíîïŵñòóôõöṫøùúûüýŷÿ';
| |
/// Characters in ISO-8859-15 above the ASCII and top control characters.
///
/// [latin9] appends this after [_ascii] and [_noControls], so the entries
/// become bytes 0xA0 through 0xFF, 32 per source line.
const _top8859_15 = '\xa0¡¢£€¥Š§š©ª«¬\xad®¯°±²³Žµ¶·ž¹º»ŒœŸ¿'
    'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß'
    'àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ';
| |
/// Characters in ISO-8859-16 above the ASCII and top control characters.
///
/// [latin10] appends this after [_ascii] and [_noControls], so the entries
/// become bytes 0xA0 through 0xFF, 32 per source line.
const _top8859_16 = '\xa0ĄąŁ€„Š§š©Ș«Ź\xadźŻ°±ČłŽ”¶·žčș»ŒœŸż'
    'ÀÁÂĂÄĆÆÇÈÉÊËÌÍÎÏĐŃÒÓÔŐÖŚŰÙÚÛÜĘȚß'
    'àáâăäćæçèéêëìíîïđńòóôőöśűùúûüęțÿ';
| |
/// Thirty-two U+FFFD entries, standing in for a run of 32 bytes that have
/// no character assigned.
///
/// Used at the start of [_ascii] (bytes 0x00–0x1F) and, in the ISO-8859
/// code pages, directly after it (bytes 0x80–0x9F).
const _noControls = '\uFFFD\uFFFD\uFFFD\uFFFD'
    '\uFFFD\uFFFD\uFFFD\uFFFD'
    '\uFFFD\uFFFD\uFFFD\uFFFD'
    '\uFFFD\uFFFD\uFFFD\uFFFD'
    '\uFFFD\uFFFD\uFFFD\uFFFD'
    '\uFFFD\uFFFD\uFFFD\uFFFD'
    '\uFFFD\uFFFD\uFFFD\uFFFD'
    '\uFFFD\uFFFD\uFFFD\uFFFD';
| |
/// ASCII characters without control characters. Shared by many code pages.
///
/// Supplies the entries for bytes 0x00–0x7F: the first 32 positions come
/// from [_noControls] and the final position (0x7F) is U+FFFD, so those
/// bytes count as undefined. The positions in between are the printable
/// ASCII characters from space to `~`.
const _ascii = '$_noControls'
    // ignore: missing_whitespace_between_adjacent_strings
    r""" !"#$%&'()*+,-./0123456789:;<=>?"""
    r'@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_'
    '`abcdefghijklmnopqrstuvwxyz{|}~\uFFFD';
| |
/// A byte-to-character mapping that can be used as an [Encoding].
///
/// A code page associates each byte value with at most one character,
/// so it can represent no more than 256 different characters.
class CodePage extends Encoding {
  /// The name of this code page, as reported by [Encoding.name].
  @override
  final String name;

  /// The converter from bytes of this code page to characters.
  @override
  final CodePageDecoder decoder;

  /// The converter from characters to bytes of this code page.
  ///
  /// Built from [decoder] the first time it is read and reused afterwards.
  @override
  late final CodePageEncoder encoder = decoder._createEncoder();

  /// Creates a code page with the given name and characters.
  ///
  /// The [characters] string must contain 256 code points (runes),
  /// ordered by the byte that represents each of them.
  ///
  /// A byte that the code page does not define should have
  /// U+FFFD (invalid character) at its position in [characters].
  ///
  /// The [name] is what [Encoding.name] returns.
  factory CodePage(String name, String characters) = CodePage._general;

  /// Creates a code page from any 256 code points.
  ///
  /// The [characters] must contain precisely 256 characters (code points).
  ///
  /// A U+FFFD (invalid character) entry in [characters] means that the
  /// corresponding byte has no definition in this code page.
  CodePage._general(this.name, String characters)
      : decoder = _createDecoder(characters);

  /// Creates a code page from characters of the basic multilingual plane.
  ///
  /// The basic multilingual plane (BMP) holds the first 65536 code points,
  /// so each of its characters is a single UTF-16 code unit, which makes
  /// some operations more efficient.
  ///
  /// The [characters] must contain precisely 256 code points from the BMP,
  /// meaning it should have length 256 and contain no surrogates.
  ///
  /// A U+FFFD (invalid character) entry in [characters] means that the
  /// corresponding byte has no definition in this code page.
  CodePage._bmp(this.name, String characters)
      : decoder = _BmpCodePageDecoder(characters);

  /// The character associated with a particular byte in this code page.
  ///
  /// The [byte] must be in the range 0..255.
  /// The returned value should be a Unicode scalar value
  /// (a non-surrogate code point).
  ///
  /// If the code page has no character defined for [byte],
  /// the Unicode invalid character (U+FFFD) should be returned instead.
  int operator [](int byte) {
    return decoder._char(byte);
  }

  /// Encodes [input] using `encoder.convert`.
  @override
  Uint8List encode(String input, {int? invalidCharacter}) {
    return encoder.convert(input, invalidCharacter: invalidCharacter);
  }

  /// Decodes [bytes] using `decoder.convert`.
  @override
  String decode(List<int> bytes, {bool allowInvalid = false}) {
    return decoder.convert(bytes, allowInvalid: allowInvalid);
  }
}
| |
/// A code page decoder, converts from bytes to characters.
///
/// A code page assigns characters to a subset of byte values.
/// The decoder converts those bytes back to their characters.
abstract class CodePageDecoder implements Converter<List<int>, String> {
  /// Decodes a sequence of bytes into a string using a code page.
  ///
  /// The code page assigns one character to each byte.
  /// Values in [input] must be bytes (integers in the range 0..255).
  ///
  /// If [allowInvalid] is true, non-byte values in [input],
  /// or byte values not defined as a character in the code page,
  /// are emitted as U+FFFD (the Unicode invalid character).
  /// If not true, the bytes must be valid and defined characters.
  @override
  String convert(List<int> input, {bool allowInvalid = false});

  /// Creates the encoder that maps this decoder's characters back to bytes.
  CodePageEncoder _createEncoder();

  /// The code point that this code page assigns to [byte].
  int _char(int byte);
}
| |
/// Picks the decoder implementation suited to [characters].
///
/// When every code point of [characters] is in the BMP the result is a
/// [_BmpCodePageDecoder]; otherwise it is a [_NonBmpCodePageDecoder].
///
/// Throws an [ArgumentError] unless [characters] holds exactly 256 code
/// points.
CodePageDecoder _createDecoder(String characters) {
  final codePoints = characters.runes.toList(growable: false);
  if (codePoints.length != 256) {
    throw ArgumentError.value(
        characters, 'characters', 'Must contain 256 characters');
  }
  // OR-ing all code points together exceeds 0xFFFF exactly when at least
  // one of them has a bit set above the BMP range.
  final combinedBits = codePoints.fold<int>(0, (bits, rune) => bits | rune);
  if (combinedBits <= 0xFFFF) {
    return _BmpCodePageDecoder(characters);
  }
  return _NonBmpCodePageDecoder._(Uint32List.fromList(codePoints));
}
| |
/// An input [ByteConversionSink] for decoders where each input byte can be
/// considered independently.
///
/// Every chunk is decoded on its own and the resulting string is passed
/// straight on to the output sink.
class _CodePageDecoderSink extends ByteConversionSink {
  /// Converts each incoming chunk of bytes to a string.
  final Converter<List<int>, String> _decoder;

  /// Receives the decoded strings.
  final Sink<String> _output;

  _CodePageDecoderSink(this._output, this._decoder);

  @override
  void add(List<int> chunk) => _output.add(_decoder.convert(chunk));

  @override
  void close() => _output.close();
}
| |
/// Code page with non-BMP characters.
///
/// The mapping is stored as code points indexed by byte value, with U+FFFD
/// marking the bytes that the code page does not define.
class _NonBmpCodePageDecoder extends Converter<List<int>, String>
    implements CodePageDecoder {
  final Uint32List _characters;

  /// Creates a decoder from a string of exactly 256 code points.
  ///
  /// Throws an [ArgumentError] if [characters] has any other length.
  _NonBmpCodePageDecoder(String characters) : this._(_buildMapping(characters));

  /// Wraps an existing code point table, expected to hold 256 entries.
  _NonBmpCodePageDecoder._(this._characters);

  @override
  int _char(int byte) => _characters[byte];

  /// Copies the code points of [characters] into a 256-entry table.
  static Uint32List _buildMapping(String characters) {
    var result = Uint32List(256);
    var i = 0;
    for (var char in characters.runes) {
      if (i >= 256) {
        throw ArgumentError.value(
            characters, 'characters', 'Must contain 256 characters');
      }
      result[i++] = char;
    }
    if (i < 256) {
      throw ArgumentError.value(
          characters, 'characters', 'Must contain 256 characters');
    }
    return result;
  }

  @override
  CodePageEncoder _createEncoder() {
    var result = <int, int>{};
    for (var i = 0; i < 256; i++) {
      var char = _characters[i];
      // U+FFFD marks an undefined byte, so it never gets a reverse mapping.
      if (char != 0xFFFD) {
        result[char] = i;
      }
    }
    return CodePageEncoder._(result);
  }

  /// Decodes [input] using this code page.
  ///
  /// With [allowInvalid] set, values outside 0..255 and bytes without a
  /// definition in the code page are decoded as U+FFFD.
  /// Otherwise either of them causes a [FormatException], which matches
  /// the contract of [CodePageDecoder.convert] and the behavior of the BMP
  /// decoder.
  @override
  String convert(List<int> input, {bool allowInvalid = false}) {
    var buffer = Uint32List(input.length);
    for (var i = 0; i < input.length; i++) {
      var byte = input[i];
      int character;
      if ((byte & 0xff) == byte) {
        character = _characters[byte];
        if (character == 0xFFFD && !allowInvalid) {
          throw FormatException('Not defined in this code page', input, i);
        }
      } else if (allowInvalid) {
        character = 0xFFFD;
      } else {
        throw FormatException('Not a byte', input, i);
      }
      buffer[i] = character;
    }
    return String.fromCharCodes(buffer);
  }

  @override
  Sink<List<int>> startChunkedConversion(Sink<String> sink) =>
      _CodePageDecoderSink(sink, this);
}
| |
/// Code page decoder for code pages whose characters are all in the BMP.
///
/// The mapping is a string of 256 UTF-16 code units indexed by byte value,
/// with U+FFFD marking the bytes that the code page does not define.
class _BmpCodePageDecoder extends Converter<List<int>, String>
    implements CodePageDecoder {
  final String _characters;

  /// Creates a decoder for the given table of characters.
  ///
  /// Throws an [ArgumentError] unless the table has length 256.
  _BmpCodePageDecoder(this._characters) {
    final length = _characters.length;
    if (length != 256) {
      throw ArgumentError.value(_characters, 'characters',
          'Must contain 256 characters. Was $length');
    }
  }

  @override
  int _char(int byte) => _characters.codeUnitAt(byte);

  @override
  String convert(List<int> bytes, {bool allowInvalid = false}) {
    return allowInvalid ? _convertAllowInvalid(bytes) : _convertStrict(bytes);
  }

  @override
  Sink<List<int>> startChunkedConversion(Sink<String> sink) {
    return _CodePageDecoderSink(sink, this);
  }

  /// Decodes [bytes], throwing a [FormatException] at the first value that
  /// is not a byte or that has no character in this code page.
  String _convertStrict(List<int> bytes) {
    final units = Uint16List(bytes.length);
    var index = 0;
    for (final byte in bytes) {
      if ((byte & 0xff) != byte) {
        throw FormatException('Not a byte value', bytes, index);
      }
      final unit = _characters.codeUnitAt(byte);
      if (unit == 0xFFFD) {
        throw FormatException('Not defined in this code page', bytes, index);
      }
      units[index++] = unit;
    }
    return String.fromCharCodes(units);
  }

  /// Decodes [bytes], emitting U+FFFD for values that are not bytes.
  ///
  /// Undefined bytes need no special handling here because their table
  /// entry already is U+FFFD.
  String _convertAllowInvalid(List<int> bytes) => String.fromCharCodes([
        for (final byte in bytes)
          (byte & 0xff) == byte ? _characters.codeUnitAt(byte) : 0xFFFD,
      ]);

  @override
  CodePageEncoder _createEncoder() => CodePageEncoder._bmp(_characters);
}
| |
/// Encoder for a code page.
///
/// Turns a string into bytes, where each byte is the one the code page
/// defines for the corresponding character.
class CodePageEncoder extends Converter<String, List<int>> {
  /// Maps a code point to the byte that represents it.
  final Map<int, int> _encoding;

  /// Creates an encoder from a table of BMP characters indexed by byte.
  CodePageEncoder._bmp(String characters)
      : _encoding = _createBmpEncoding(characters);

  /// Creates an encoder from a ready-made code point to byte mapping.
  CodePageEncoder._(this._encoding);

  /// Inverts [characters], leaving out U+FFFD entries (undefined bytes).
  static Map<int, int> _createBmpEncoding(String characters) => {
        for (var byte = 0; byte < characters.length; byte++)
          if (characters.codeUnitAt(byte) != 0xFFFD)
            characters.codeUnitAt(byte): byte,
      };

  /// Converts input to the byte encoding in this code page.
  ///
  /// If [invalidCharacter] is supplied, it must be a byte value
  /// (in the range 0..255).
  ///
  /// Characters of [input] that this code page cannot represent are
  /// replaced by the [invalidCharacter] byte. When no [invalidCharacter]
  /// was supplied, such a character causes a [FormatException] instead.
  @override
  Uint8List convert(String input, {int? invalidCharacter}) {
    if (invalidCharacter != null) {
      RangeError.checkValueInInterval(
          invalidCharacter, 0, 255, 'invalidCharacter');
    }
    final length = input.length;
    final bytes = Uint8List(length);
    var written = 0;
    var index = 0;
    while (index < length) {
      final start = index;
      final unit = input.codeUnitAt(index++);
      var encoded = _encoding[unit];
      if (encoded == null && (unit & 0xFC00) == 0xD800 && index < length) {
        // A lead surrogate: if a trail surrogate follows, the pair is
        // consumed and looked up as a single supplementary code point.
        final trail = input.codeUnitAt(index);
        if ((trail & 0xFC00) == 0xDC00) {
          index++;
          encoded =
              _encoding[0x10000 + ((unit & 0x3ff) << 10) + (trail & 0x3ff)];
        }
      }
      bytes[written++] = encoded ??
          invalidCharacter ??
          (throw FormatException(
              'Not a character in this code page', input, start));
    }
    // A surrogate pair yields a single byte, so fewer than `length` bytes
    // may have been written.
    return Uint8List.sublistView(bytes, 0, written);
  }
}