| // Copyright (c) 2024, the Dart project authors. Please see the AUTHORS file |
| // for details. All rights reserved. Use of this source code is governed by a |
| // BSD-style license that can be found in the LICENSE file. |
| |
| // ---------------------------------------------------------------------- |
| // Code to create the URI scanner table used by `uri.dart`. |
| // |
| // This file exists in case someone, some day, will want to change the |
| // representation of the tables, maybe if Dart gets `Uint8List` literals. |
| // It should not otherwise be necessary to re-generate the tables. |
| // |
| // The table is stored in the `uri.dart` file as a 1-byte string literal. |
| // This script generates the string literal and prints it on stdout. |
| // If passed the `-u filename` flag, it instead updates the file directly. |
| // The file should be the `sdk/lib/core/uri.dart` file, which contains markers |
| // showing where to insert the generated code. |
| |
| import "dart:convert" show LineSplitter; |
| import "dart:io"; |
| import "dart:typed_data"; |
| |
// Slots of the position array. A scanner transition may record the
// current input position into one of these slots.

/// Slot for the position of the `:` that ends the scheme.
const int _schemeEndIndex = 1;

/// Slot for the position of the character just before the host name.
const int _hostStartIndex = 2;

/// Slot for the position of the `:` that precedes a port value.
const int _portStartIndex = 3;

/// Slot for the position of the first character of the path.
const int _pathStartIndex = 4;

/// Slot for the position of the `?` that precedes the query.
const int _queryStartIndex = 5;

/// Slot for the position of the `#` that precedes the fragment.
const int _fragmentStartIndex = 6;

/// Slot for a position at which the URI was found to be "non-simple".
const int _notSimpleIndex = 7;

// Distinguished states and state-related sizes.

/// State in which the scanner starts.
const int _uriStart = 0;

/// Lowest final state for which the URI is considered non-simple.
///
/// A scan that terminates in this state, or in any higher-numbered state,
/// marks the URI as non-simple.
const int _nonSimpleEndStates = 14;

/// State in which scheme validation starts.
const int _schemeStart = 20;

/// Total number of states.
const int _stateCount = 22;

/// Number of bits used to store a state.
///
/// Satisfies `1 << _stateBits >= _stateCount`.
/// It is also the shift that separates the extra information from the state
/// in a transition table entry.
const int _stateBits = 5;

/// Mask of the low [_stateBits] bits, which extracts the state from a
/// transition table entry.
const int _stateMask = (1 << _stateBits) - 1;

// Layout of a table.
//
// A table has entries only for the characters U+0020 through U+007F.
// Input characters are permuted so that the lookup is cheap.

/// Value that every input character is xor'ed with before the lookup.
///
/// The xor maps the range 0x20-0x7f onto the range 0x00-0x5f, which can be
/// used directly as an index into a table of length 0x60. Whether the
/// original value was in the range 0x20-0x7f can then be checked with a
/// single `<= 0x5f` comparison (the value is a string code unit, so it is
/// known to be non-negative).
const int _charXor = 0x60;

/// Largest xor'ed value that still denotes a character with a table entry.
const int _xorCharLimit = 0x5f;
| |
/// Generates the URI tables and either prints them or updates a file.
///
/// When [args] is empty, or its first element does not start with `-u`, the
/// parser table text and the character set text are printed on stdout.
///
/// Otherwise the marked sections of a file are replaced with the generated
/// text. The file path is taken from the rest of the flag (`-upath`), or
/// else from the next argument (`-u path`), or else defaults to
/// `sdk/lib/core/uri.dart`. The process exits with code 1 if the file does
/// not exist or if a marked section cannot be found in it. The file is only
/// written when its contents actually change.
void main(List<String> args) {
  var parserTableText = _createParserTableText();
  var charsetTableText = _createCharacterSetText();
  if (args.isEmpty || !args.first.startsWith("-u")) {
    print(parserTableText);
    print(charsetTableText);
    return;
  }
  // Default file location, if run from root of SDK.
  var filePath = "sdk/lib/core/uri.dart";
  var arg = args.first;
  if (arg.length > 2) {
    // Path attached to the flag itself.
    filePath = arg.substring(2);
  } else if (args.length > 1) {
    // Path given as the argument after the flag.
    filePath = args[1];
  }
  var file = File(filePath);
  if (!file.existsSync()) {
    stderr.writeln("Cannot find file: $filePath");
    exit(1);
  }
  var contents = file.readAsStringSync();

  // Replaces the text between the `start` and `end` marker lines matched by
  // [markers] in [text] with [replacement], and returns the result.
  //
  // The markers are always searched for in the text that is being modified,
  // so offsets stay valid even after an earlier section changed length.
  String replaceMarkedSection(String text, RegExp markers, String replacement) {
    var matches = markers.allMatches(text).toList();
    if (matches.length != 2) {
      stderr.writeln("Cannot find marked section in file $filePath");
      exit(1);
    }
    // Each match covers a whole marker line, so the section to replace runs
    // from the end of the start marker to the start of the end marker.
    return text.replaceRange(
      matches.first.end,
      matches.last.start,
      replacement,
    );
  }

  // Replace marked range for parser tables.
  var newContents = replaceMarkedSection(
    contents,
    RegExp(
      r"^// --- URI PARSER TABLE --- (start|end) --- [^]*?^",
      multiLine: true,
    ),
    parserTableText,
  );

  // Replace marked range for character sets.
  newContents = replaceMarkedSection(
    newContents,
    RegExp(
      r"^// --- URI CHARSET TABLE --- (start|end) --- [^]*?^",
      multiLine: true,
    ),
    charsetTableText,
  );

  if (newContents != contents) {
    file.writeAsStringSync(newContents);
    print("$filePath updated.");
  } else {
    stderr.writeln("No update needed.");
  }
}
| |
/// Builds the generated text for the parser table section of `uri.dart`.
///
/// The text contains, in order: the generated-code header, the constants
/// used to read a scan result, the documented `_scannerTables` string
/// literal holding every state table from [_createTables] as hex escapes,
/// and the `_scan` function that runs the state machine.
String _createParserTableText() {
  final literal = StringLiteralBuilder("_scannerTables");
  for (final stateTable in _createTables()) {
    literal.writeBytes(stateTable, hexAll: true);
  }
  final scannerTablesDeclaration = literal.close();

  return """
$generatedHeader

// --------------------------------------------------------------------
// Constants used to read the scanner result.
// The indices points into the table filled by [_scan] which contains
// recognized positions in the scanned URI.
// The `0` index is only used internally.

/// Index of the position of that `:` after a scheme.
const int _schemeEndIndex = $_schemeEndIndex;

/// Index of the position of the character just before the host name.
const int _hostStartIndex = $_hostStartIndex;

/// Index of the position of the `:` before a port value.
const int _portStartIndex = $_portStartIndex;

/// Index of the position of the first character of a path.
const int _pathStartIndex = $_pathStartIndex;

/// Index of the position of the `?` before a query.
const int _queryStartIndex = $_queryStartIndex;

/// Index of the position of the `#` before a fragment.
const int _fragmentStartIndex = $_fragmentStartIndex;

/// Index of a position where the URI was determined to be "non-simple".
const int _notSimpleIndex = $_notSimpleIndex;

/// Initial state for scanner.
const int _uriStart = $_uriStart;

/// If scanning of a URI terminates in this state or above,
/// consider the URI non-simple
const int _nonSimpleEndStates = $_nonSimpleEndStates;

/// Initial state for scheme validation.
const int _schemeStart = $_schemeStart;

// --------------------------------------------------------------------
/// Transition tables are used to scan a URI to determine its structure.
///
/// The tables represent a state machine with output.
///
/// To scan the URI, start in the [_uriStart] state, then read each character
/// of the URI in order, from start to end, and for each character perform a
/// transition to a new state while writing the current position
/// into the output buffer at a designated index.
///
/// Each state, represented by an integer which is an index into
/// [_scannerTables], has a set of transitions, one for each character.
/// The transitions are encoded as a 5-bit integer representing the next state
/// and a 3-bit index into the output table.
///
/// For URI scanning, only characters in the range U+0020 through U+007E are
/// interesting; all characters outside that range are treated the same.
/// The tables only contain 96 entries, representing the 95 characters in the
/// interesting range, and one entry for all values outside the range.
/// The character entries are stored in one `String` of 96 characters per state,
/// with the transition for a character at position `character ^ 0x60`,
/// which maps the range U+0020 .. U+007F into positions 0 .. 95.
/// All remaining characters are mapped to position 0x1f (`0x7f ^ 0x60`), which
/// represents the transition for all remaining characters.
$scannerTablesDeclaration
// --------------------------------------------------------------------
/// Scan a string using the [_scannerTables] state machine.
///
/// Scans [uri] from [start] to [end], starting in state [state] and
/// writing output into [indices].
///
/// Returns the final state. If that state is greater than or equal to
/// [_nonSimpleEndStates], the general URI scan should consider the
/// result non-simple, even if no position has been written to
/// [_notSimpleIndex] of [indices].
int _scan(String uri, int start, int end, int state, List<int> indices) {
// Number of characters in table for each state (range 0x20..0x60).
const int stateTableSize = 0x60;
// Value to xor input character with to make valid range start at zero.
const int _charXor = $_charXor;
// Limit on valid values after doing xor.
const int _xorCharLimit = $_xorCharLimit;
// Entry used for invalid input characters (not in the range 0x20-0x7f).
const int _invalidChar = 0x7F ^ _charXor;
// Shift to extract write position from transition table entry.
const int _writeIndexShift = $_stateBits;
// Mask for state part of transition table entry.
const int _stateMask = $_stateMask;

assert(end <= uri.length);
for (int i = start; i < end; i++) {
int char = uri.codeUnitAt(i) ^ _charXor;
if (char > _xorCharLimit) char = _invalidChar;
int transition = _scannerTables.codeUnitAt(state * stateTableSize + char);
state = transition & _stateMask;
indices[transition >> _writeIndexShift] = i;
}
return state;
}
""";
}
| |
/// Builds the generated text for the character set section of `uri.dart`.
///
/// The text contains the generated-code header, one documented
/// `const <name>Mask` declaration per character set, and a `_charTables`
/// string literal with one 16-bit entry for each of the 128 ASCII codes.
///
/// Every distinct character set is assigned its own bit, and the table entry
/// of a character has the bits of all the sets that contain it. A set whose
/// character specification is identical to an earlier one is declared as an
/// alias of the earlier mask and is not assigned a bit.
String _createCharacterSetText() {
  const dash = 0x2D; // `-` character, joins the two ends of a range.
  const backslash = 0x5C; // `\` character, makes the next one verbatim.

  // Each entry is (name, chars, doc).
  // The chars can use `-` for a range of characters, and `\` for
  // the next character being verbatim (to escape `-` and `\`).
  // The characters must be listed in strictly increasing order.
  const charSets = [
    (
      "_unreserved",
      r"\-.0-9A-Z_a-z~",
      r"""
The unreserved characters of RFC 3986.
[A-Za-z0-9\-._~]
""",
    ),
    (
      "_unreserved2396",
      r"!'()*\-.0-9A-Z_a-z~",
      r"""
The unreserved characters of RFC 2396.
[A-Za-z0-9!'()*\-._~]
""",
    ),
    (
      "_encodeFull",
      r"!#$&'()*+,\-./0-9:;=?@A-Z_a-z~",
      r"""
Table of reserved characters specified by ECMAScript 5.
[A-Za-z0-9!#$&'()*+,\-./:;=?_~]
""",
    ),
    (
      "_scheme",
      r"+\-.0-9A-Za-z",
      r"""
Characters allowed in the scheme.
[A-Za-z0-9+\-.]
""",
    ),
    (
      "_userinfo",
      r"!$&'()*+,\-.0-9:;=A-Z_a-z~",
      r"""
Characters allowed in the userinfo as of RFC 3986.
RFC 3986 Appendix A
userinfo = *( unreserved / pct-encoded / sub-delims / ':')
[A-Za-z0-9!$&'()*+,\-.:;=_~] (without '%')
""",
    ),
    (
      "_regName",
      r"!$%&'()*+,\-.0-9;=A-Z_a-z~",
      r"""
Characters allowed in the reg-name as of RFC 3986.
RFC 3986 Appendix A
reg-name = *( unreserved / pct-encoded / sub-delims )
Same as `_userinfoMask` without the `:`.
// [A-Za-z0-9!$%&'()*+,\-.;=_~] (including '%')
""",
    ),
    (
      "_pathChar",
      r"!$&'()*+,\-.0-9:;=@A-Z_a-z~",
      r"""
Characters allowed in the path as of RFC 3986.
RFC 3986 section 3.3.
pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
[A-Za-z0-9!$&'()*+,\-.:;=@_~] (without '%')
""",
    ),
    (
      "_pathCharOrSlash",
      r"!$&'()*+,\-./0-9:;=@A-Z_a-z~",
      r"""
Characters allowed in the path as of RFC 3986.
RFC 3986 section 3.3 *and* slash.
[A-Za-z0-9!$&'()*+,\-./:;=@_~] (without '%')
""",
    ),
    (
      "_queryChar",
      r"!$&'()*+,\-./0-9:;=?@A-Z_a-z~",
      r"""
Characters allowed in the query as of RFC 3986.
RFC 3986 section 3.4.
query = *( pchar / "/" / "?" )
[A-Za-z0-9!$&'()*+,\-./:;=?@_~] (without '%')
""",
    ),
    (
      "_zoneID",
      r"\-.0-9A-Z_a-z~",
      r"""
Characters allowed in the ZoneID as of RFC 6874.
ZoneID = 1*( unreserved / pct-encoded )
[A-Za-z0-9\-._~] + '%'
""",
    ),
    (
      "_tokenChar",
      r"!$&'*+\-.0-9A-Z^_`a-z{|}~",
      r"""
Table of the `token` characters of RFC 2045 in a `data:` URI.

A token is any US-ASCII character except SPACE, control characters and
`tspecial` characters. The `tspecial` category is:
'(', ')', '<', '>', '@', ',', ';', ':', '\', '"', '/', '[, ']', '?', '='.

In a data URI, we also need to escape '%' and '#' characters.
""",
    ),
    (
      "_uric",
      r"!$&'()*+,\-./0-9:;=?@A-Z_a-z~",
      r"""
All non-escape RFC-2396 "uric" characters.

The "uric" character set is defined by:
```
uric = reserved | unreserved | escaped
reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | ","
unreserved = alphanum | mark
mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
```
This is the same characters as in a URI query (which is URI pchar plus '?')
""",
    ),
    (
      "_genDelimiters",
      r"#/:?@[]",
      r"""
General delimiter characters, RFC 3986 section 2.2.
gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
[:/?#[]@]
""",
    ),
    (
      "_ipvFutureAddressChars",
      r"!$&'()*+,\-.0-9:;=A-Z_a-z~",
      r"""
Characters valid in an IPvFuture address, RFC 3986 section 3.2.2.
1*( unreserved / sub-delims / ":" )
[A-Za-z0-9\-._~]|[!$&'()*+,;=]|:
""",
    ),
  ];

  // Bit set per ASCII code: one bit for each distinct character set.
  final masks = Uint16List(128);
  // Maps a character specification to the first name declared for it.
  final firstNameFor = <String, String>{};
  final out = StringBuffer(generatedHeader)..writeln();
  var unusedBit = 1;

  for (final (name, chars, doc) in charSets) {
    // The documentation, one `//` comment line per line of `doc`.
    out.writeln();
    for (final docLine in LineSplitter.split(doc)) {
      out.writeln(docLine.isEmpty ? "//" : "// $docLine");
    }
    out.write("const ${name}Mask = ");

    final earlierName = firstNameFor[chars];
    if (earlierName != null) {
      // Same characters as an earlier set: share its mask.
      out.writeln("${earlierName}Mask;");
      continue;
    }

    firstNameFor[chars] = name;
    final mask = unusedBit;
    unusedBit <<= 1;
    // Most recent character added. Used to check that the specification is
    // ordered, and as the start of a range.
    var last = -1;
    for (var pos = 0; pos < chars.length; pos++) {
      var code = chars.codeUnitAt(pos);
      final endsRange = code == dash;
      if (endsRange) code = chars.codeUnitAt(++pos);
      if (code == backslash) code = chars.codeUnitAt(++pos);
      if (code <= last) throw FormatException("Not sorted", chars, pos);
      // A range starts just after the previous character, which was
      // already added on its own.
      var member = endsRange ? last + 1 : code;
      while (member <= code) {
        masks[member++] |= mask;
      }
      last = code;
    }
    out.writeln("0x${mask.toRadixString(16).padLeft(4, "0")};");
  }

  final tableLiteral = StringLiteralBuilder('_charTables');
  tableLiteral.writeChars(masks, hexAll: true);
  out.writeln();
  out.write(tableLiteral.close());
  return out.toString();
}
| |
/// Comment written at the start of each generated section.
///
/// It names this tool as the way to regenerate the code that follows.
/// The text has two lines and no trailing newline.
const String generatedHeader =
    "// Use tools/generate_uri_parser_tables.dart to generate this code\n"
    "// if necessary.";
| |
/// Creates a literal of the form
/// ```dart
/// const String someName = "ab\x82azx......"
///     "more bytes and escapes \xff "
///     "....";
/// ```
/// while escaping non-printable characters, `"`, `$` and `\`,
/// and trying to fit as many characters on each line as possible.
///
/// The declaration is started by the constructor, extended by [writeBytes]
/// and [writeChars], and finished by [close].
///
/// Not optimized for speed or memory consumption. Assumed to be run
/// rarely and offline.
class StringLiteralBuilder {
  /// Largest number of characters allowed on a line before its closing quote.
  static const int _maxLineLength = 79;

  /// The source text written so far.
  final buffer = StringBuffer();

  /// Whitespace written at the start of each continuation line.
  String indent;

  /// Number of characters written on the current line so far.
  var lineLength = 0;

  /// Starts a declaration of a constant named [name].
  ///
  /// The declaration itself is indented by [indent] spaces, and its
  /// continuation lines by four more.
  StringLiteralBuilder(String name, {int indent = 0})
      : indent = " " * (indent + 4) {
    if (indent > 0) buffer.write(" " * indent);
    buffer
      ..write("const String ")
      ..write(name)
      ..write(" = \"");
    lineLength = buffer.length;
  }

  /// Appends each of [bytes] to the literal as one character.
  ///
  /// If [hexAll] is true, every byte is written as a hex escape. Otherwise
  /// the representation chosen by [charString] is used.
  void writeBytes(Uint8List bytes, {bool hexAll = false}) {
    _writeUnits(bytes, hexAll);
  }

  /// Appends each of [chars] to the literal as one character.
  ///
  /// If [hexAll] is true, every character is written as a hex escape.
  /// Otherwise the representation chosen by [charString] is used.
  void writeChars(Uint16List chars, {bool hexAll = false}) {
    _writeUnits(chars, hexAll);
  }

  /// Appends the source representation of each of [units], starting a new
  /// line of the literal whenever the current one would become too long.
  void _writeUnits(List<int> units, bool hexAll) {
    for (var unit in units) {
      var string = hexAll ? hex(unit) : charString(unit);
      lineLength += string.length;
      if (lineLength > _maxLineLength) {
        // Close the current line and continue in an adjacent string literal.
        buffer
          ..write('"\n')
          ..write(indent)
          ..write('"');
        lineLength = indent.length + 1 + string.length;
      }
      buffer.write(string);
    }
  }

  /// Terminates the string literal and returns the entire declaration.
  ///
  /// The `;` is put on a line of its own if it does not fit on the last line
  /// of the literal.
  ///
  /// Do not use the builder after calling close.
  String close() {
    if (lineLength < _maxLineLength - 1) {
      buffer.write("\";\n");
    } else {
      buffer
        ..write("\"\n")
        ..write(indent)
        ..write(";\n");
    }
    return buffer.toString();
  }

  /// The shortest source representation of [char] inside a `"` string.
  ///
  /// Characters with a short escape, and characters that must be escaped,
  /// use that escape. Control characters and characters above U+00FF use
  /// the escape created by [hex]. All other characters represent themselves.
  static String charString(int char) {
    // Recognized characters that need escaping, or have a short escape.
    switch (char) {
      case 0x08:
        return r"\b";
      case 0x09:
        return r"\t";
      case 0x0a:
        return r"\n";
      case 0x0b:
        return r"\v";
      case 0x0c:
        return r"\f";
      case 0x0d:
        return r"\r";
      case 0x22:
        return r'\"';
      case 0x5c:
        return r"\\";
      case 0x24:
        return r"\$";
    }
    // All control characters, all non-one-byte-string chars.
    if (char > 0xFF || (char & 0x60) == 0 || char == 0x7F) {
      // 0x00 - 0x1F, 0x7F, 0x80 - 0x9F, and everything above 0xFF.
      return hex(char);
    }
    return String.fromCharCode(char);
  }

  /// A hexadecimal escape for [char].
  ///
  /// The escape is `\xHH` for values up to 0xFF, `\uHHHH` for values up to
  /// 0xFFFF, and `\u{H...}` for larger values, which do not fit in the four
  /// digits of a plain `\u` escape.
  static String hex(int char) {
    const digits = "0123456789ABCDEF";
    if (char <= 0xFF) {
      return "\\x${digits[char >> 4]}${digits[char & 0xf]}";
    }
    // Don't try to be clever.
    var hexDigits = char.toRadixString(16);
    if (char <= 0xFFFF) {
      return "\\u${hexDigits.padLeft(4, "0")}";
    }
    return "\\u{$hexDigits}";
  }
}
| |
/// Creates the tables for `_scannerTables` used by [Uri.parse].
///
/// See `_scannerTables` in `sdk/lib/core/uri.dart` for the generated format.
///
/// Returns one table per state, in state order. Each table has one
/// transition byte for each xor'ed character value: the next state in the
/// low [_stateBits] bits, and above them the index of the position slot that
/// the transition writes.
///
/// The concrete tables are chosen as a trade-off between the number of states
/// needed and the precision of the result.
/// This allows definitely recognizing the general structure of the URI
/// (presence and location of scheme, user-info, host, port, path, query and
/// fragment) while at the same time detecting that some components are not
/// in canonical form (anything containing a `%`, a host-name containing a
/// capital letter). Since the scanner doesn't know whether something is a
/// scheme or a path until it sees `:`, or user-info or host until it sees
/// a `@`, a second pass is needed to validate the scheme and any user-info
/// is considered non-canonical by default.
///
/// The states (starting from [_uriStart]) write positions while scanning
/// a string from `start` to `end` as follows:
///
/// - [_schemeEndIndex]: Should be initialized to `start - 1`.
///   If the URI has a scheme, it is set to the position of the `:` after
///   the scheme.
/// - [_hostStartIndex]: Should be initialized to `start - 1`.
///   If the URI has an authority, it is set to the character before the
///   host name - either the second `/` in the `//` leading the authority,
///   or the `@` after a user-info. Comparing this value to the scheme end
///   position can be used to detect that there is a user-info component.
/// - [_portStartIndex]: Should be initialized to `start`.
///   Set to the position of the last `:` in an authority, and unchanged
///   if there is no authority or no `:` in an authority.
///   If this position is after the host start, there is a port, otherwise it
///   is just marking a colon in the user-info component.
/// - [_pathStartIndex]: Should be initialized to `start`.
///   Is set to the first path character unless the path is empty.
///   If the path is empty, the position is either unchanged (`start`) or
///   the first slash of an authority. So, if the path start is before a
///   host start or scheme end, the path is empty.
/// - [_queryStartIndex]: Should be initialized to `end`.
///   The position of the `?` leading a query if the URI contains a query.
/// - [_fragmentStartIndex]: Should be initialized to `end`.
///   The position of the `#` leading a fragment if the URI contains a fragment.
/// - [_notSimpleIndex]: Should be initialized to `start - 1`.
///   Set to another value if the URI is considered "not simple".
///   This is elaborated below.
///
/// # Simple URIs
/// A URI is considered "simple" if it is in a normalized form containing no
/// escapes. This allows us to skip normalization and checking whether escapes
/// are valid, and to extract components without worrying about unescaping.
///
/// The scanner computes a conservative approximation of being "simple".
/// It rejects any URI with an escape, with a user-info component (mainly
/// because they are rare and would increase the number of states in the
/// scanner significantly), with an IPV6 host or with a capital letter in
/// the scheme or host name (the scheme is handled in a second scan using
/// a separate two-state table).
/// Further, paths containing `..` or `.` path segments are considered
/// non-simple except for pure relative paths (no scheme or authority) starting
/// with a sequence of "../" segments.
///
/// The transition tables cannot detect a trailing ".." in the path,
/// followed by a query or fragment, because the segment is not known to be
/// complete until we are past it, and we then need to store the query/fragment
/// start instead. This case is checked manually post-scanning (such a path
/// needs to be normalized to end in "../", so the URI shouldn't be considered
/// simple).
List<Uint8List> _createTables() {
  // States used to scan a URI from scratch.
  assert(_uriStart == 0);
  const int uriStart = _uriStart;
  const int schemeOrPath = uriStart + 1;
  const int authOrPath = schemeOrPath + 1;
  const int authOrPathSlash = authOrPath + 1;
  const int userInfoOrHost0 = authOrPathSlash + 1;
  const int userInfoOrHost = userInfoOrHost0 + 1;
  const int userInfoOrPort0 = userInfoOrHost + 1;
  const int userInfoOrPort = userInfoOrPort0 + 1;
  const int ipv6Host = userInfoOrPort + 1;
  const int relPathSeg = ipv6Host + 1;
  const int pathSeg = relPathSeg + 1;
  const int path = pathSeg + 1;
  const int query = path + 1;
  const int fragment = query + 1;
  const int schemeOrPathDot = fragment + 1; // Path ends in `.`.
  const int schemeOrPathDot2 = schemeOrPathDot + 1; // Path ends in `..`.
  const int relPathSegDot = schemeOrPathDot2 + 1; // Path ends in `.`.
  const int relPathSegDot2 = relPathSegDot + 1; // Path ends in `..`.
  const int pathSegDot = relPathSegDot2 + 1; // Path ends in `.`.
  const int pathSegDot2 = pathSegDot + 1; // Path ends in `..`.
  // The "path ends in a dot" states are the first of the states in which
  // ending the scan makes the URI non-simple.
  assert(_nonSimpleEndStates == schemeOrPathDot);

  // States used to validate a scheme after its end position has been found.
  // A separate state machine in the same table.
  const int scheme0 = pathSegDot2 + 1;
  const int scheme = scheme0 + 1;
  assert(scheme0 == _schemeStart);

  // Total number of states for the scanner.
  const int stateCount = scheme + 1;
  assert(stateCount == _stateCount);
  assert(1 << _stateBits >= stateCount);
  // A transition, write-index and state combined, must fit in one byte.
  assert(((_notSimpleIndex << _stateBits) | _stateMask) <= 0xFF);

  // Number of entries in the table of one state.
  const int tableSize = _xorCharLimit + 1;

  // Constants encoding the write-index for the state transition into the
  // bits above the state bits of a byte.
  const int schemeEnd = _schemeEndIndex << _stateBits;
  const int hostStart = _hostStartIndex << _stateBits;
  const int portStart = _portStartIndex << _stateBits;
  const int pathStart = _pathStartIndex << _stateBits;
  const int queryStart = _queryStartIndex << _stateBits;
  const int fragmentStart = _fragmentStartIndex << _stateBits;
  const int notSimple = _notSimpleIndex << _stateBits;

  /// The `unreserved` characters of RFC 3986.
  const unreserved =
      "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-._~";

  /// The `sub-delim` characters of RFC 3986.
  const subDelimiters = r"!$&'()*+,;=";
  // The `pchar` characters of RFC 3986: characters that may occur in a path,
  // excluding escapes.
  const pchar = "$unreserved$subDelimiters";

  var tables =
      List<Uint8List>.generate(stateCount, (_) => Uint8List(tableSize));

  // Helper function which initialize the table for [state] with a default
  // transition and returns the table.
  Uint8List build(int state, int defaultTransition) =>
      tables[state]..fillRange(0, tableSize, defaultTransition);

  // Helper function which sets the transition for each character in [chars]
  // to [transition] in the [target] table.
  // The [chars] string must contain only characters in the U+0020 .. U+007E
  // range.
  void setChars(Uint8List target, String chars, int transition) {
    for (int i = 0; i < chars.length; i++) {
      var char = chars.codeUnitAt(i);
      target[char ^ _charXor] = transition;
    }
  }

  // Helper function which sets the transition for all characters in the
  // range from `range[0]` to `range[1]` to [transition] in the [target] table.
  //
  // The [range] must be a two-character string where both characters are in
  // the U+0020 .. U+007E range and the former character must have a lower
  // code point than the latter.
  void setRange(Uint8List target, String range, int transition) {
    for (int i = range.codeUnitAt(0), n = range.codeUnitAt(1); i <= n; i++) {
      target[i ^ _charXor] = transition;
    }
  }

  // Create the transitions for each state.
  Uint8List b;

  // Entry point of URI-scanner state machine.
  // Validate as path. If it is a scheme, we recognize that
  // and validate it later.
  b = build(uriStart, schemeOrPath | notSimple);
  setChars(b, pchar, schemeOrPath);
  setChars(b, ".", schemeOrPathDot);
  setChars(b, ":", authOrPath | schemeEnd); // Handle later.
  setChars(b, "/", authOrPathSlash);
  setChars(b, r"\", authOrPathSlash | notSimple);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(schemeOrPathDot, schemeOrPath | notSimple);
  setChars(b, pchar, schemeOrPath);
  setChars(b, ".", schemeOrPathDot2);
  setChars(b, ':', authOrPath | schemeEnd);
  setChars(b, r"/\", pathSeg | notSimple);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(schemeOrPathDot2, schemeOrPath | notSimple);
  setChars(b, pchar, schemeOrPath);
  setChars(b, "%", schemeOrPath | notSimple);
  setChars(b, ':', authOrPath | schemeEnd);
  setChars(b, "/", relPathSeg);
  setChars(b, r"\", relPathSeg | notSimple);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(schemeOrPath, schemeOrPath | notSimple);
  setChars(b, pchar, schemeOrPath);
  setChars(b, ':', authOrPath | schemeEnd);
  setChars(b, "/", pathSeg);
  setChars(b, r"\", pathSeg | notSimple);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(authOrPath, path | notSimple);
  setChars(b, pchar, path | pathStart);
  setChars(b, "/", authOrPathSlash | pathStart);
  setChars(b, r"\", authOrPathSlash | pathStart); // This should be non-simple.
  setChars(b, ".", pathSegDot | pathStart);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(authOrPathSlash, path | notSimple);
  setChars(b, pchar, path);
  setChars(b, "/", userInfoOrHost0 | hostStart);
  setChars(b, r"\", userInfoOrHost0 | hostStart); // This should be non-simple.
  setChars(b, ".", pathSegDot);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(userInfoOrHost0, userInfoOrHost | notSimple);
  setChars(b, pchar, userInfoOrHost);
  setRange(b, "AZ", userInfoOrHost | notSimple);
  setChars(b, ":", userInfoOrPort0 | portStart);
  setChars(b, "@", userInfoOrHost0 | hostStart);
  setChars(b, "[", ipv6Host | notSimple);
  setChars(b, "/", pathSeg | pathStart);
  setChars(b, r"\", pathSeg | pathStart); // This should be non-simple.
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(userInfoOrHost, userInfoOrHost | notSimple);
  setChars(b, pchar, userInfoOrHost);
  setRange(b, "AZ", userInfoOrHost | notSimple);
  setChars(b, ":", userInfoOrPort0 | portStart);
  setChars(b, "@", userInfoOrHost0 | hostStart);
  setChars(b, "/", pathSeg | pathStart);
  setChars(b, r"\", pathSeg | pathStart); // This should be non-simple.
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(userInfoOrPort0, userInfoOrPort | notSimple);
  setRange(b, "19", userInfoOrPort);
  setChars(b, "@", userInfoOrHost0 | hostStart);
  setChars(b, "/", pathSeg | pathStart);
  setChars(b, r"\", pathSeg | pathStart); // This should be non-simple.
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(userInfoOrPort, userInfoOrPort | notSimple);
  setRange(b, "09", userInfoOrPort);
  setChars(b, "@", userInfoOrHost0 | hostStart);
  setChars(b, "/", pathSeg | pathStart);
  setChars(b, r"\", pathSeg | pathStart); // This should be non-simple.
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(ipv6Host, ipv6Host);
  setChars(b, "]", userInfoOrHost);

  b = build(relPathSeg, path | notSimple);
  setChars(b, pchar, path);
  setChars(b, ".", relPathSegDot);
  setChars(b, r"/\", pathSeg | notSimple);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(relPathSegDot, path | notSimple);
  setChars(b, pchar, path);
  setChars(b, ".", relPathSegDot2);
  setChars(b, r"/\", pathSeg | notSimple);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(relPathSegDot2, path | notSimple);
  setChars(b, pchar, path);
  setChars(b, "/", relPathSeg);
  setChars(b, r"\", relPathSeg | notSimple);
  setChars(b, "?", query | queryStart); // This should be non-simple.
  setChars(b, "#", fragment | fragmentStart); // This should be non-simple.

  b = build(pathSeg, path | notSimple);
  setChars(b, pchar, path);
  setChars(b, ".", pathSegDot);
  setChars(b, "/", pathSeg);
  setChars(b, r"\", pathSeg | notSimple);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(pathSegDot, path | notSimple);
  setChars(b, pchar, path);
  setChars(b, ".", pathSegDot2);
  setChars(b, r"/\", pathSeg | notSimple);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(pathSegDot2, path | notSimple);
  setChars(b, pchar, path);
  setChars(b, r"/\", pathSeg | notSimple);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(path, path | notSimple);
  setChars(b, pchar, path);
  setChars(b, "/", pathSeg);
  setChars(b, r"\", pathSeg | notSimple);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(query, query | notSimple);
  setChars(b, pchar, query);
  setChars(b, "?", query);
  setChars(b, "#", fragment | fragmentStart);

  b = build(fragment, fragment | notSimple);
  setChars(b, pchar, fragment);
  setChars(b, "?", fragment);

  // A separate two-state validator for lower-case scheme names.
  // Any non-scheme character or upper-case letter is marked as non-simple.
  b = build(scheme0, scheme | notSimple);
  setRange(b, "az", scheme);

  b = build(scheme, scheme | notSimple);
  setRange(b, "az", scheme);
  setRange(b, "09", scheme);
  setChars(b, "+-.", scheme);

  return tables;
}