Reapply "Avoid computing the URI scanner tables at runtime."
This reverts commit 855e1cd975feae07822187f226568130a0fcfbe9.
The blocking issue in internal test code is assumed fixed.
Change-Id: I74e0be130d149a45f77dc90c354916308b76b741
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/255248
Commit-Queue: Lasse Nielsen <lrn@google.com>
Reviewed-by: Martin Kustermann <kustermann@google.com>
Auto-Submit: Lasse Nielsen <lrn@google.com>
diff --git a/sdk/lib/core/uri.dart b/sdk/lib/core/uri.dart
index bb84862..b465edf 100644
--- a/sdk/lib/core/uri.dart
+++ b/sdk/lib/core/uri.dart
@@ -3977,6 +3977,10 @@
static const _uricTable = _Uri._queryCharTable;
}
+// --- URI PARSER TABLE --- start --- generated code, do not edit ---
+// Use tools/generate_uri_parser_tables.dart to generate this code
+// if necessary.
+
// --------------------------------------------------------------------
// Constants used to read the scanner result.
// The indices points into the table filled by [_scan] which contains
@@ -4005,7 +4009,7 @@
const int _notSimpleIndex = 7;
// Initial state for scanner.
-const int _uriStart = 00;
+const int _uriStart = 0;
// If scanning of a URI terminates in this state or above,
// consider the URI non-simple
@@ -4014,14 +4018,15 @@
// Initial state for scheme validation.
const int _schemeStart = 20;
+// --------------------------------------------------------------------
/// Transition tables are used to scan a URI to determine its structure.
///
/// The tables represent a state machine with output.
///
/// To scan the URI, start in the [_uriStart] state, then read each character
/// of the URI in order, from start to end, and for each character perform a
-/// transition to a new state while writing the current position into the output
-/// buffer at a designated index.
+/// transition to a new state while writing the current position
+/// into the output buffer at a designated index.
///
/// Each state, represented by an integer which is an index into
/// [_scannerTables], has a set of transitions, one for each character.
@@ -4030,320 +4035,133 @@
///
/// For URI scanning, only characters in the range U+0020 through U+007E are
/// interesting; all characters outside that range are treated the same.
-/// The tables only contain 96 entries, representing the characters in the
-/// interesting range, plus one more to represent all values outside the range.
-/// The character entries are stored in one `Uint8List` per state, with the
-/// transition for a character at position `character ^ 0x60`,
+/// The tables only contain 96 entries, representing the 95 characters in the
+/// interesting range, and one entry for all values outside the range.
+/// The character entries are stored in one `String` of 96 characters per state,
+/// with the transition for a character at position `character ^ 0x60`,
/// which maps the range U+0020 .. U+007F into positions 0 .. 95.
/// All remaining characters are mapped to position 31 (`0x7f ^ 0x60`), which
/// represents the transition for all remaining characters.
-final List<Uint8List> _scannerTables = _createTables();
-
-// ----------------------------------------------------------------------
-// Code to create the URI scanner table.
-
-/// Creates the tables for [_scannerTables] used by [Uri.parse].
-///
-/// See [_scannerTables] for the generated format.
-///
-/// The concrete tables are chosen as a trade-off between the number of states
-/// needed and the precision of the result.
-/// This allows definitely recognizing the general structure of the URI
-/// (presence and location of scheme, user-info, host, port, path, query and
-/// fragment) while at the same time detecting that some components are not
-/// in canonical form (anything containing a `%`, a host-name containing a
-/// capital letter). Since the scanner doesn't know whether something is a
-/// scheme or a path until it sees `:`, or user-info or host until it sees
-/// a `@`, a second pass is needed to validate the scheme and any user-info
-/// is considered non-canonical by default.
-///
-/// The states (starting from [_uriStart]) write positions while scanning
-/// a string from `start` to `end` as follows:
-///
-/// - [_schemeEndIndex]: Should be initialized to `start-1`.
-/// If the URI has a scheme, it is set to the position of the `:` after
-/// the scheme.
-/// - [_hostStartIndex]: Should be initialized to `start - 1`.
-/// If the URI has an authority, it is set to the character before the
-/// host name - either the second `/` in the `//` leading the authority,
-/// or the `@` after a user-info. Comparing this value to the scheme end
-/// position can be used to detect that there is a user-info component.
-/// - [_portStartIndex]: Should be initialized to `start`.
-/// Set to the position of the last `:` in an authority, and unchanged
-/// if there is no authority or no `:` in an authority.
-/// If this position is after the host start, there is a port, otherwise it
-/// is just marking a colon in the user-info component.
-/// - [_pathStartIndex]: Should be initialized to `start`.
-/// Is set to the first path character unless the path is empty.
-/// If the path is empty, the position is either unchanged (`start`) or
-/// the first slash of an authority. So, if the path start is before a
-/// host start or scheme end, the path is empty.
-/// - [_queryStartIndex]: Should be initialized to `end`.
-/// The position of the `?` leading a query if the URI contains a query.
-/// - [_fragmentStartIndex]: Should be initialized to `end`.
-/// The position of the `#` leading a fragment if the URI contains a fragment.
-/// - [_notSimpleIndex]: Should be initialized to `start - 1`.
-/// Set to another value if the URI is considered "not simple".
-/// This is elaborated below.
-///
-/// # Simple URIs
-/// A URI is considered "simple" if it is in a normalized form containing no
-/// escapes. This allows us to skip normalization and checking whether escapes
-/// are valid, and to extract components without worrying about unescaping.
-///
-/// The scanner computes a conservative approximation of being "simple".
-/// It rejects any URI with an escape, with a user-info component (mainly
-/// because they are rare and would increase the number of states in the
-/// scanner significantly), with an IPV6 host or with a capital letter in
-/// the scheme or host name (the scheme is handled in a second scan using
-/// a separate two-state table).
-/// Further, paths containing `..` or `.` path segments are considered
-/// non-simple except for pure relative paths (no scheme or authority) starting
-/// with a sequence of "../" segments.
-///
-/// The transition tables cannot detect a trailing ".." in the path,
-/// followed by a query or fragment, because the segment is not known to be
-/// complete until we are past it, and we then need to store the query/fragment
-/// start instead. This cast is checked manually post-scanning (such a path
-/// needs to be normalized to end in "../", so the URI shouldn't be considered
-/// simple).
-List<Uint8List> _createTables() {
- // TODO(lrn): Use a precomputed table.
-
- // Total number of states for the scanner.
- const int stateCount = 22;
-
- // States used to scan a URI from scratch.
- const int schemeOrPath = 01;
- const int authOrPath = 02;
- const int authOrPathSlash = 03;
- const int uinfoOrHost0 = 04;
- const int uinfoOrHost = 05;
- const int uinfoOrPort0 = 06;
- const int uinfoOrPort = 07;
- const int ipv6Host = 08;
- const int relPathSeg = 09;
- const int pathSeg = 10;
- const int path = 11;
- const int query = 12;
- const int fragment = 13;
- const int schemeOrPathDot = 14;
- const int schemeOrPathDot2 = 15;
- const int relPathSegDot = 16;
- const int relPathSegDot2 = 17;
- const int pathSegDot = 18;
- const int pathSegDot2 = 19;
-
- // States used to validate a scheme after its end position has been found.
- const int scheme0 = _schemeStart;
- const int scheme = 21;
-
- // Constants encoding the write-index for the state transition into the top 5
- // bits of a byte.
- const int schemeEnd = _schemeEndIndex << 5;
- const int hostStart = _hostStartIndex << 5;
- const int portStart = _portStartIndex << 5;
- const int pathStart = _pathStartIndex << 5;
- const int queryStart = _queryStartIndex << 5;
- const int fragmentStart = _fragmentStartIndex << 5;
- const int notSimple = _notSimpleIndex << 5;
-
- /// The `unreserved` characters of RFC 3986.
- const unreserved =
- "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-._~";
-
- /// The `sub-delim` characters of RFC 3986.
- const subDelims = r"!$&'()*+,;=";
- // The `pchar` characters of RFC 3986: characters that may occur in a path,
- // excluding escapes.
- const pchar = "$unreserved$subDelims";
-
- var tables = List<Uint8List>.generate(stateCount, (_) => Uint8List(96));
-
- // Helper function which initialize the table for [state] with a default
- // transition and returns the table.
- Uint8List build(state, defaultTransition) =>
- tables[state]..fillRange(0, 96, defaultTransition);
-
- // Helper function which sets the transition for each character in [chars]
- // to [transition] in the [target] table.
- // The [chars] string must contain only characters in the U+0020 .. U+007E
- // range.
- void setChars(Uint8List target, String chars, int transition) {
- for (int i = 0; i < chars.length; i++) {
- var char = chars.codeUnitAt(i);
- target[char ^ 0x60] = transition;
- }
- }
-
- /// Helper function which sets the transition for all characters in the
- /// range from `range[0]` to `range[1]` to [transition] in the [target] table.
- ///
- /// The [range] must be a two-character string where both characters are in
- /// the U+0020 .. U+007E range and the former character must have a lower
- /// code point than the latter.
- void setRange(Uint8List target, String range, int transition) {
- for (int i = range.codeUnitAt(0), n = range.codeUnitAt(1); i <= n; i++) {
- target[i ^ 0x60] = transition;
- }
- }
-
- // Create the transitions for each state.
- var b;
-
- // Validate as path, if it is a scheme, we handle it later.
- b = build(_uriStart, schemeOrPath | notSimple);
- setChars(b, pchar, schemeOrPath);
- setChars(b, ".", schemeOrPathDot);
- setChars(b, ":", authOrPath | schemeEnd); // Handle later.
- setChars(b, "/", authOrPathSlash);
- setChars(b, "?", query | queryStart);
- setChars(b, "#", fragment | fragmentStart);
-
- b = build(schemeOrPathDot, schemeOrPath | notSimple);
- setChars(b, pchar, schemeOrPath);
- setChars(b, ".", schemeOrPathDot2);
- setChars(b, ':', authOrPath | schemeEnd);
- setChars(b, "/", pathSeg | notSimple);
- setChars(b, "?", query | queryStart);
- setChars(b, "#", fragment | fragmentStart);
-
- b = build(schemeOrPathDot2, schemeOrPath | notSimple);
- setChars(b, pchar, schemeOrPath);
- setChars(b, "%", schemeOrPath | notSimple);
- setChars(b, ':', authOrPath | schemeEnd);
- setChars(b, "/", relPathSeg);
- setChars(b, "?", query | queryStart);
- setChars(b, "#", fragment | fragmentStart);
-
- b = build(schemeOrPath, schemeOrPath | notSimple);
- setChars(b, pchar, schemeOrPath);
- setChars(b, ':', authOrPath | schemeEnd);
- setChars(b, "/", pathSeg);
- setChars(b, "?", query | queryStart);
- setChars(b, "#", fragment | fragmentStart);
-
- b = build(authOrPath, path | notSimple);
- setChars(b, pchar, path | pathStart);
- setChars(b, "/", authOrPathSlash | pathStart);
- setChars(b, ".", pathSegDot | pathStart);
- setChars(b, "?", query | queryStart);
- setChars(b, "#", fragment | fragmentStart);
-
- b = build(authOrPathSlash, path | notSimple);
- setChars(b, pchar, path);
- setChars(b, "/", uinfoOrHost0 | hostStart);
- setChars(b, ".", pathSegDot);
- setChars(b, "?", query | queryStart);
- setChars(b, "#", fragment | fragmentStart);
-
- b = build(uinfoOrHost0, uinfoOrHost | notSimple);
- setChars(b, pchar, uinfoOrHost);
- setRange(b, "AZ", uinfoOrHost | notSimple);
- setChars(b, ":", uinfoOrPort0 | portStart);
- setChars(b, "@", uinfoOrHost0 | hostStart);
- setChars(b, "[", ipv6Host | notSimple);
- setChars(b, "/", pathSeg | pathStart);
- setChars(b, "?", query | queryStart);
- setChars(b, "#", fragment | fragmentStart);
-
- b = build(uinfoOrHost, uinfoOrHost | notSimple);
- setChars(b, pchar, uinfoOrHost);
- setRange(b, "AZ", uinfoOrHost | notSimple);
- setChars(b, ":", uinfoOrPort0 | portStart);
- setChars(b, "@", uinfoOrHost0 | hostStart);
- setChars(b, "/", pathSeg | pathStart);
- setChars(b, "?", query | queryStart);
- setChars(b, "#", fragment | fragmentStart);
-
- b = build(uinfoOrPort0, uinfoOrPort | notSimple);
- setRange(b, "19", uinfoOrPort);
- setChars(b, "@", uinfoOrHost0 | hostStart);
- setChars(b, "/", pathSeg | pathStart);
- setChars(b, "?", query | queryStart);
- setChars(b, "#", fragment | fragmentStart);
-
- b = build(uinfoOrPort, uinfoOrPort | notSimple);
- setRange(b, "09", uinfoOrPort);
- setChars(b, "@", uinfoOrHost0 | hostStart);
- setChars(b, "/", pathSeg | pathStart);
- setChars(b, "?", query | queryStart);
- setChars(b, "#", fragment | fragmentStart);
-
- b = build(ipv6Host, ipv6Host);
- setChars(b, "]", uinfoOrHost);
-
- b = build(relPathSeg, path | notSimple);
- setChars(b, pchar, path);
- setChars(b, ".", relPathSegDot);
- setChars(b, "/", pathSeg | notSimple);
- setChars(b, "?", query | queryStart);
- setChars(b, "#", fragment | fragmentStart);
-
- b = build(relPathSegDot, path | notSimple);
- setChars(b, pchar, path);
- setChars(b, ".", relPathSegDot2);
- setChars(b, "/", pathSeg | notSimple);
- setChars(b, "?", query | queryStart);
- setChars(b, "#", fragment | fragmentStart);
-
- b = build(relPathSegDot2, path | notSimple);
- setChars(b, pchar, path);
- setChars(b, "/", relPathSeg);
- setChars(b, "?", query | queryStart); // This should be non-simple.
- setChars(b, "#", fragment | fragmentStart); // This should be non-simple.
-
- b = build(pathSeg, path | notSimple);
- setChars(b, pchar, path);
- setChars(b, ".", pathSegDot);
- setChars(b, "/", pathSeg | notSimple);
- setChars(b, "?", query | queryStart);
- setChars(b, "#", fragment | fragmentStart);
-
- b = build(pathSegDot, path | notSimple);
- setChars(b, pchar, path);
- setChars(b, ".", pathSegDot2);
- setChars(b, "/", pathSeg | notSimple);
- setChars(b, "?", query | queryStart);
- setChars(b, "#", fragment | fragmentStart);
-
- b = build(pathSegDot2, path | notSimple);
- setChars(b, pchar, path);
- setChars(b, "/", pathSeg | notSimple);
- setChars(b, "?", query | queryStart);
- setChars(b, "#", fragment | fragmentStart);
-
- b = build(path, path | notSimple);
- setChars(b, pchar, path);
- setChars(b, "/", pathSeg);
- setChars(b, "?", query | queryStart);
- setChars(b, "#", fragment | fragmentStart);
-
- b = build(query, query | notSimple);
- setChars(b, pchar, query);
- setChars(b, "?", query);
- setChars(b, "#", fragment | fragmentStart);
-
- b = build(fragment, fragment | notSimple);
- setChars(b, pchar, fragment);
- setChars(b, "?", fragment);
-
- // A separate two-state validator for lower-case scheme names.
- // Any non-scheme character or upper-case letter is marked as non-simple.
- b = build(scheme0, scheme | notSimple);
- setRange(b, "az", scheme);
-
- b = build(scheme, scheme | notSimple);
- setRange(b, "az", scheme);
- setRange(b, "09", scheme);
- setChars(b, "+-.", scheme);
-
- return tables;
-}
+const String _scannerTables = "\xE1\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"
+ "\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\xE1\xE1\xE1"
+ "\x01\xE1\xE1\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"
+ "\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\xE1\xE1\xE1\xE1\x01\xE1\x01"
+ "\xE1\xCD\x01\xE1\x01\x01\x01\x01\x01\x01\x01\x01\x0E\x03\x01\x01\x01\x01"
+ "\x01\x01\x01\x01\x01\x01\x22\x01\xE1\x01\xE1\xAC\xE1\x01\x01\x01\x01\x01"
+ "\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"
+ "\x01\x01\x01\xE1\xE1\xE1\x01\xE1\xE1\x01\x01\x01\x01\x01\x01\x01\x01\x01"
+ "\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\xE1"
+ "\xE1\xE1\xE1\x01\xE1\x01\xE1\xCD\x01\xE1\x01\x01\x01\x01\x01\x01\x01\x01"
+ "\x01\x0A\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x22\x01\xE1\x01\xE1\xAC"
+ "\xEB\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B"
+ "\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B\xEB\xEB\xEB\x8B\xEB\xEB\x8B\x8B\x8B"
+ "\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B"
+ "\x8B\x8B\x8B\x8B\x8B\xEB\xEB\xEB\xEB\x8B\xEB\x8B\xEB\xCD\x8B\xEB\x8B\x8B"
+ "\x8B\x8B\x8B\x8B\x8B\x8B\x92\x83\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B\x8B"
+ "\xEB\x8B\xEB\x8B\xEB\xAC\xEB\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\xEB\xEB\xEB"
+ "\x0B\xEB\xEB\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\xEB\xEB\xEB\xEB\x0B\xEB\x0B"
+ "\xEB\xCD\x0B\xEB\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x12\x44\x0B\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\x0B\x0B\x0B\xEB\x0B\xEB\x0B\xEB\xAC\xE5\x05\x05\x05\x05\x05"
+ "\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05"
+ "\x05\x05\x05\xE5\xE5\xE5\x05\xE5\x44\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5"
+ "\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE8"
+ "\xE5\xE5\xE5\x05\xE5\x05\xE5\xCD\x05\xE5\x05\x05\x05\x05\x05\x05\x05\x05"
+ "\x05\x8A\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x66\x05\xE5\x05\xE5\xAC"
+ "\xE5\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05"
+ "\x05\x05\x05\x05\x05\x05\x05\x05\x05\xE5\xE5\xE5\x05\xE5\x44\xE5\xE5\xE5"
+ "\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5"
+ "\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5\xE5\x05\xE5\x05\xE5\xCD\x05\xE5\x05\x05"
+ "\x05\x05\x05\x05\x05\x05\x05\x8A\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05"
+ "\x66\x05\xE5\x05\xE5\xAC\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7"
+ "\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7"
+ "\xE7\xE7\x44\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7"
+ "\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7"
+ "\xE7\xCD\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\x8A\xE7\x07\x07\x07"
+ "\x07\x07\x07\x07\x07\x07\xE7\xE7\xE7\xE7\xE7\xAC\xE7\xE7\xE7\xE7\xE7\xE7"
+ "\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7"
+ "\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\x44\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7"
+ "\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7"
+ "\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xCD\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7\xE7"
+ "\xE7\x8A\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\xE7\xE7\xE7\xE7\xE7\xAC"
+ "\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08"
+ "\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08"
+ "\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08"
+ "\x08\x08\x08\x08\x08\x08\x08\x05\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08"
+ "\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08"
+ "\x08\x08\x08\x08\x08\x08\xEB\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\xEB\xEB\xEB"
+ "\x0B\xEB\xEB\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\xEB\xEB\xEB\xEB\x0B\xEB\x0B"
+ "\xEB\xCD\x0B\xEB\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x10\xEA\x0B\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\x0B\x0B\x0B\xEB\x0B\xEB\x0B\xEB\xAC\xEB\x0B\x0B\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\xEB\xEB\xEB\x0B\xEB\xEB\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\xEB"
+ "\xEB\xEB\xEB\x0B\xEB\x0B\xEB\xCD\x0B\xEB\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B"
+ "\x12\xEA\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\xEB\x0B\xEB\x0B\xEB\xAC"
+ "\xEB\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\xEB\xEB\xEB\x0B\xEB\xEB\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\x0B\x0B\xEB\xEB\xEB\xEB\x0B\xEB\x0B\xEB\xCD\x0B\xEB\x0B\x0B"
+ "\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0A\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B"
+ "\xEB\x0B\xEB\x0B\xEB\xAC\xEC\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C"
+ "\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\xEC\xEC\xEC"
+ "\x0C\xEC\xEC\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C"
+ "\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\xEC\xEC\xEC\xEC\x0C\xEC\x0C"
+ "\xEC\xCD\x0C\xEC\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\x0C\xEC\x0C\x0C\x0C\x0C"
+ "\x0C\x0C\x0C\x0C\x0C\x0C\xEC\x0C\xEC\x0C\xEC\x0C\xED\x0D\x0D\x0D\x0D\x0D"
+ "\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D"
+ "\x0D\x0D\x0D\xED\xED\xED\x0D\xED\xED\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D"
+ "\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\xED"
+ "\xED\xED\xED\x0D\xED\x0D\xED\xED\x0D\xED\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D"
+ "\x0D\xED\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\x0D\xED\x0D\xED\x0D\xED\x0D"
+ "\xE1\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"
+ "\x01\x01\x01\x01\x01\x01\x01\x01\x01\xE1\xE1\xE1\x01\xE1\xE1\x01\x01\x01"
+ "\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"
+ "\x01\x01\x01\x01\x01\xE1\xE1\xE1\xE1\x01\xE1\x01\xE1\xCD\x01\xE1\x01\x01"
+ "\x01\x01\x01\x01\x01\x01\x0F\xEA\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"
+ "\x22\x01\xE1\x01\xE1\xAC\xE1\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"
+ "\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\xE1\xE1\xE1"
+ "\x01\xE1\xE1\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"
+ "\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\xE1\xE1\xE1\xE1\x01\xE1\x01"
+ "\xE1\xCD\x01\xE1\x01\x01\x01\x01\x01\x01\x01\x01\x01\x09\x01\x01\x01\x01"
+ "\x01\x01\x01\x01\x01\x01\x22\x01\xE1\x01\xE1\xAC\xEB\x0B\x0B\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\xEB\xEB\xEB\x0B\xEB\xEB\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\xEB"
+ "\xEB\xEB\xEB\x0B\xEB\x0B\xEB\xCD\x0B\xEB\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B"
+ "\x11\xEA\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\xEB\x0B\xEB\x0B\xEB\xAC"
+ "\xEB\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\xEB\xEB\xEB\x0B\xEB\xEB\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\x0B\x0B\xEB\xEB\xEB\xEB\x0B\xEB\x0B\xEB\xCD\x0B\xEB\x0B\x0B"
+ "\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x09\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B"
+ "\xEB\x0B\xEB\x0B\xEB\xAC\xEB\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\xEB\xEB\xEB"
+ "\x0B\xEB\xEB\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\xEB\xEB\xEB\xEB\x0B\xEB\x0B"
+ "\xEB\xCD\x0B\xEB\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x13\xEA\x0B\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\x0B\x0B\x0B\xEB\x0B\xEB\x0B\xEB\xAC\xEB\x0B\x0B\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\xEB\xEB\xEB\x0B\xEB\xEB\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B"
+ "\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\xEB"
+ "\xEB\xEB\xEB\x0B\xEB\x0B\xEB\xCD\x0B\xEB\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B"
+ "\x0B\xEA\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\x0B\xEB\x0B\xEB\x0B\xEB\xAC"
+ "\xF5\x15\x15\x15\x15\x15\x15\x15\x15\x15\x15\x15\x15\x15\x15\x15\x15\x15"
+ "\x15\x15\x15\x15\x15\x15\x15\x15\x15\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5"
+ "\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5"
+ "\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5"
+ "\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5"
+ "\xF5\xF5\xF5\xF5\xF5\xF5\xF5\x15\x15\x15\x15\x15\x15\x15\x15\x15\x15\x15"
+ "\x15\x15\x15\x15\x15\x15\x15\x15\x15\x15\x15\x15\x15\x15\x15\xF5\xF5\xF5"
+ "\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5"
+ "\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5"
+ "\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\xF5\x15\xF5\x15\x15\xF5\x15\x15\x15\x15"
+ "\x15\x15\x15\x15\x15\x15\xF5\xF5\xF5\xF5\xF5\xF5";
// --------------------------------------------------------------------
-// Code that uses the URI scanner table.
-
/// Scan a string using the [_scannerTables] state machine.
///
/// Scans [uri] from [start] to [end], starting in state [state] and
@@ -4351,20 +4169,20 @@
///
/// Returns the final state.
int _scan(String uri, int start, int end, int state, List<int> indices) {
- var tables = _scannerTables;
+ const int stateTableSize = 96;
assert(end <= uri.length);
for (int i = start; i < end; i++) {
- var table = tables[state];
// Xor with 0x60 to move range 0x20-0x7f into 0x00-0x5f
int char = uri.codeUnitAt(i) ^ 0x60;
// Use 0x1f (nee 0x7f) to represent all unhandled characters.
if (char > 0x5f) char = 0x1f;
- int transition = table[char];
+ int transition = _scannerTables.codeUnitAt(state * stateTableSize + char);
state = transition & 0x1f;
indices[transition >> 5] = i;
}
return state;
}
+// --- URI PARSER TABLE --- end --- generated code, do not edit ---
class _SimpleUri implements Uri {
final String _uri;
diff --git a/tools/generate_uri_parser_tables.dart b/tools/generate_uri_parser_tables.dart
new file mode 100644
index 0000000..419d955
--- /dev/null
+++ b/tools/generate_uri_parser_tables.dart
@@ -0,0 +1,568 @@
+// Copyright (c) 2022, the Dart project authors. Please see the AUTHORS file
+// for details. All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+
+// ----------------------------------------------------------------------
+// Code to create the URI scanner table used by `uri.dart`.
+//
+// This file exists in case someone, some day, will want to change the
+// representation of the tables, maybe if Dart gets `Uint8List` literals.
+// It should not otherwise be necessary to re-generate the tables.
+//
+// The table is stored in the `uri.dart` file as a 1-byte string literal.
+// This script generates the string literal and prints it on stdout.
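+// If the first argument is the `-u` flag, it instead updates a file in place.
+// The file name may follow as a separate argument (`-u filename`) or be
+// attached to the flag (`-ufilename`). Without a file name, the default
+// `sdk/lib/core/uri.dart` is used, which is correct when the script is run
+// from the root of the SDK.
+// The file should be the `sdk/lib/core/uri.dart` file, which contains markers
+// showing where to insert the generated code.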
+
+import "dart:io";
+import "dart:typed_data";
+
+/// Index of the position of the `:` after a scheme.
+const int _schemeEndIndex = 1;
+
+/// Index of the position of the character just before the host name.
+const int _hostStartIndex = 2;
+
+/// Index of the position of the `:` before a port value.
+const int _portStartIndex = 3;
+
+/// Index of the position of the first character of a path.
+const int _pathStartIndex = 4;
+
+/// Index of the position of the `?` before a query.
+const int _queryStartIndex = 5;
+
+/// Index of the position of the `#` before a fragment.
+const int _fragmentStartIndex = 6;
+
+/// Index of a position where the URI was determined to be "non-simple".
+const int _notSimpleIndex = 7;
+
+// Initial state for scanner.
+const int _uriStart = 0;
+
+// If scanning of a URI terminates in this state or above,
+// consider the URI non-simple
+const int _nonSimpleEndStates = 14;
+
+// Initial state for scheme validation.
+const int _schemeStart = 20;
+
+void main(List<String> args) {
+ var tables = _createTables();
+ var literalBuilder = StringLiteralBuilder("_scannerTables");
+ for (var table in tables) {
+ literalBuilder.writeBytes(table, hexAll: true);
+ }
+ var tableString = literalBuilder.close();
+
+ var result = """
+// Use tools/generate_uri_parser_tables.dart to generate this code
+// if necessary.
+
+// --------------------------------------------------------------------
+// Constants used to read the scanner result.
+// The indices points into the table filled by [_scan] which contains
+// recognized positions in the scanned URI.
+// The `0` index is only used internally.
+
+/// Index of the position of that `:` after a scheme.
+const int _schemeEndIndex = $_schemeEndIndex;
+
+/// Index of the position of the character just before the host name.
+const int _hostStartIndex = $_hostStartIndex;
+
+/// Index of the position of the `:` before a port value.
+const int _portStartIndex = $_portStartIndex;
+
+/// Index of the position of the first character of a path.
+const int _pathStartIndex = $_pathStartIndex;
+
+/// Index of the position of the `?` before a query.
+const int _queryStartIndex = $_queryStartIndex;
+
+/// Index of the position of the `#` before a fragment.
+const int _fragmentStartIndex = $_fragmentStartIndex;
+
+/// Index of a position where the URI was determined to be "non-simple".
+const int _notSimpleIndex = $_notSimpleIndex;
+
+// Initial state for scanner.
+const int _uriStart = $_uriStart;
+
+// If scanning of a URI terminates in this state or above,
+// consider the URI non-simple
+const int _nonSimpleEndStates = $_nonSimpleEndStates;
+
+// Initial state for scheme validation.
+const int _schemeStart = $_schemeStart;
+
+// --------------------------------------------------------------------
+/// Transition tables are used to scan a URI to determine its structure.
+///
+/// The tables represent a state machine with output.
+///
+/// To scan the URI, start in the [_uriStart] state, then read each character
+/// of the URI in order, from start to end, and for each character perform a
+/// transition to a new state while writing the current position
+/// into the output buffer at a designated index.
+///
+/// Each state, represented by an integer which is an index into
+/// [_scannerTables], has a set of transitions, one for each character.
+/// The transitions are encoded as a 5-bit integer representing the next state
+/// and a 3-bit index into the output table.
+///
+/// For URI scanning, only characters in the range U+0020 through U+007E are
+/// interesting; all characters outside that range are treated the same.
+/// The tables only contain 96 entries, representing the 95 characters in the
+/// interesting range, and one entry for all values outside the range.
+/// The character entries are stored in one `String` of 96 characters per state,
+/// with the transition for a character at position `character ^ 0x60`,
+/// which maps the range U+0020 .. U+007F into positions 0 .. 95.
+/// All remaining characters are mapped to position 31 (`0x7f ^ 0x60`), which
+/// represents the transition for all remaining characters.
+$tableString
+// --------------------------------------------------------------------
+/// Scan a string using the [_scannerTables] state machine.
+///
+/// Scans [uri] from [start] to [end], starting in state [state] and
+/// writing output into [indices].
+///
+/// Returns the final state.
+int _scan(String uri, int start, int end, int state, List<int> indices) {
+ const int stateTableSize = 96;
+ assert(end <= uri.length);
+ for (int i = start; i < end; i++) {
+ // Xor with 0x60 to move range 0x20-0x7f into 0x00-0x5f
+ int char = uri.codeUnitAt(i) ^ 0x60;
+ // Use 0x1f (nee 0x7f) to represent all unhandled characters.
+ if (char > 0x5f) char = 0x1f;
+ int transition = _scannerTables.codeUnitAt(state * stateTableSize + char);
+ state = transition & 0x1f;
+ indices[transition >> 5] = i;
+ }
+ return state;
+}
+""";
+ if (args.isEmpty || !args.first.startsWith("-u")) {
+ print(result);
+ return;
+ }
+ var arg = args.first;
+ var filePath = "sdk/lib/core/uri.dart";
+ // Default file location, if run from root of SDK.
+ if (arg.length > 2) {
+ filePath = arg.substring(2);
+ } else if (args.length > 1) {
+ filePath = args[1];
+ }
+ var file = File(filePath);
+ if (!file.existsSync()) {
+ stderr.writeln("Cannot find file: $filePath");
+ exit(1);
+ }
+ var contents = file.readAsStringSync();
+ var pattern = RegExp(r"^// --- URI PARSER TABLE --- (start|end) --- [^]*?^",
+ multiLine: true);
+ var matches = pattern.allMatches(contents).toList();
+ if (matches.length != 2) {
+ stderr.writeln("Cannot find marked section in file $filePath");
+ exit(1);
+ }
+ var start = matches.first.end;
+ var end = matches.last.start;
+ var newContents = contents.replaceRange(start, end, result);
+ if (newContents != contents) {
+ file.writeAsStringSync(newContents);
+ print("$filePath updated.");
+ } else {
+ stderr.writeln("No update needed.");
+ return;
+ }
+}
+
+/// Incrementally builds the source text of a `const String` declaration.
+///
+/// The emitted declaration has the shape
+/// ```dart
+/// const String someName = "ab\x82azx......"
+///     "more bytes and escapes \xff   "
+///     "....";
+/// ```
+/// Bytes are appended with [writeBytes]; non-printable characters, `"`, `$`
+/// and `\` are escaped, and each line is packed with as many characters as
+/// fit before the literal continues as an adjacent string on the next line.
+class StringLiteralBuilder {
+  /// Accumulates the source text of the declaration.
+  final buffer = StringBuffer();
+
+  /// Whitespace written in front of every continuation line.
+  String indent;
+
+  /// Number of characters written so far on the current output line.
+  var lineLength = 0;
+
+  /// Short or mandatory escapes, keyed by byte value.
+  static const _shortEscapes = <int, String>{
+    0x08: r"\b",
+    0x09: r"\t",
+    0x0a: r"\n",
+    0x0b: r"\v",
+    0x0c: r"\f",
+    0x0d: r"\r",
+    0x22: r'\"',
+    0x24: r"\$",
+    0x5c: r"\\",
+  };
+
+  /// Starts a declaration of a constant called [name].
+  ///
+  /// The first line is indented by [indent] spaces and continuation lines
+  /// by four more.
+  StringLiteralBuilder(String name, {int indent = 0})
+      : indent = " " * (indent + 4) {
+    // Inside the body, `indent` is the integer parameter, not the field.
+    final leading = indent > 0 ? " " * indent : "";
+    buffer.write("${leading}const String $name = \"");
+    lineLength = buffer.length;
+  }
+
+  /// Appends every byte of [bytes] to the literal.
+  ///
+  /// With [hexAll] set, every byte is written as a `\xHH` escape; otherwise
+  /// [charString] picks the representation.
+  void writeBytes(Uint8List bytes, {bool hexAll = false}) {
+    final encode = hexAll ? hex : charString;
+    for (final byte in bytes) {
+      final piece = encode(byte);
+      if (lineLength + piece.length > 79) {
+        // Close the current literal and open an adjacent one on a new line.
+        buffer.write('"\n$indent"');
+        lineLength = indent.length + 1;
+      }
+      buffer.write(piece);
+      lineLength += piece.length;
+    }
+  }
+
+  /// Terminates the string literal and returns the complete declaration.
+  ///
+  /// Do not use the builder after calling [close].
+  String close() {
+    // Put the `;` on a line of its own when the current line is nearly full.
+    final terminator = lineLength < 78 ? "\";\n" : "\"\n$indent;\n";
+    buffer.write(terminator);
+    return buffer.toString();
+  }
+
+  /// Returns the shortest source representation of [byte] inside a
+  /// double-quoted, non-raw string literal.
+  static String charString(int byte) {
+    final escape = _shortEscapes[byte];
+    if (escape != null) return escape;
+    // Remaining control characters: 0x00 - 0x1F, 0x80 - 0x9F and 0x7F.
+    final isControl = byte & 0x60 == 0 || byte == 0x7F;
+    return isControl ? hex(byte) : String.fromCharCode(byte);
+  }
+
+  /// Returns the `\xHH` escape for [byte], using upper-case hex digits.
+  static String hex(int byte) {
+    const digits = "0123456789ABCDEF";
+    final high = digits[byte >> 4];
+    final low = digits[byte & 0xf];
+    return "\\x$high$low";
+  }
+}
+
+/// Creates the tables for [_scannerTables] used by [Uri.parse].
+///
+/// See [_scannerTables] for the generated format.
+///
+/// The concrete tables are chosen as a trade-off between the number of states
+/// needed and the precision of the result.
+/// This allows definitely recognizing the general structure of the URI
+/// (presence and location of scheme, user-info, host, port, path, query and
+/// fragment) while at the same time detecting that some components are not
+/// in canonical form (anything containing a `%`, a host-name containing a
+/// capital letter). Since the scanner doesn't know whether something is a
+/// scheme or a path until it sees `:`, or user-info or host until it sees
+/// a `@`, a second pass is needed to validate the scheme and any user-info
+/// is considered non-canonical by default.
+///
+/// The states (starting from [_uriStart]) write positions while scanning
+/// a string from `start` to `end` as follows:
+///
+/// - [_schemeEndIndex]: Should be initialized to `start - 1`.
+/// If the URI has a scheme, it is set to the position of the `:` after
+/// the scheme.
+/// - [_hostStartIndex]: Should be initialized to `start - 1`.
+/// If the URI has an authority, it is set to the character before the
+/// host name - either the second `/` in the `//` leading the authority,
+/// or the `@` after a user-info. Comparing this value to the scheme end
+/// position can be used to detect that there is a user-info component.
+/// - [_portStartIndex]: Should be initialized to `start`.
+/// Set to the position of the last `:` in an authority, and unchanged
+/// if there is no authority or no `:` in an authority.
+/// If this position is after the host start, there is a port, otherwise it
+/// is just marking a colon in the user-info component.
+/// - [_pathStartIndex]: Should be initialized to `start`.
+/// Is set to the first path character unless the path is empty.
+/// If the path is empty, the position is either unchanged (`start`) or
+/// the first slash of an authority. So, if the path start is before a
+/// host start or scheme end, the path is empty.
+/// - [_queryStartIndex]: Should be initialized to `end`.
+/// The position of the `?` leading a query if the URI contains a query.
+/// - [_fragmentStartIndex]: Should be initialized to `end`.
+/// The position of the `#` leading a fragment if the URI contains a fragment.
+/// - [_notSimpleIndex]: Should be initialized to `start - 1`.
+/// Set to another value if the URI is considered "not simple".
+/// This is elaborated below.
+///
+/// # Simple URIs
+/// A URI is considered "simple" if it is in a normalized form containing no
+/// escapes. This allows us to skip normalization and checking whether escapes
+/// are valid, and to extract components without worrying about unescaping.
+///
+/// The scanner computes a conservative approximation of being "simple".
+/// It rejects any URI with an escape, with a user-info component (mainly
+/// because they are rare and would increase the number of states in the
+/// scanner significantly), with an IPV6 host or with a capital letter in
+/// the scheme or host name (the scheme is handled in a second scan using
+/// a separate two-state table).
+/// Further, paths containing `..` or `.` path segments are considered
+/// non-simple except for pure relative paths (no scheme or authority) starting
+/// with a sequence of "../" segments.
+///
+/// The transition tables cannot detect a trailing ".." in the path,
+/// followed by a query or fragment, because the segment is not known to be
+/// complete until we are past it, and we then need to store the query/fragment
+/// start instead. This case is checked manually post-scanning (such a path
+/// needs to be normalized to end in "../", so the URI shouldn't be considered
+/// simple).
+///
+/// Returns one 96-entry table per state, indexed by state number.
+List<Uint8List> _createTables() {
+ // Total number of states for the scanner.
+ const int stateCount = 22;
+
+ // States used to scan a URI from scratch.
+ const int schemeOrPath = 01;
+ const int authOrPath = 02;
+ const int authOrPathSlash = 03;
+ const int uinfoOrHost0 = 04;
+ const int uinfoOrHost = 05;
+ const int uinfoOrPort0 = 06;
+ const int uinfoOrPort = 07;
+ const int ipv6Host = 08;
+ const int relPathSeg = 09;
+ const int pathSeg = 10;
+ const int path = 11;
+ const int query = 12;
+ const int fragment = 13;
+ const int schemeOrPathDot = 14;
+ const int schemeOrPathDot2 = 15;
+ const int relPathSegDot = 16;
+ const int relPathSegDot2 = 17;
+ const int pathSegDot = 18;
+ const int pathSegDot2 = 19;
+
+ // States used to validate a scheme after its end position has been found.
+ const int scheme0 = _schemeStart;
+ const int scheme = 21;
+
+ // Constants encoding the write-index for the state transition into the top 3
+ // bits of a byte; the low 5 bits hold the target state.
+ const int schemeEnd = _schemeEndIndex << 5;
+ const int hostStart = _hostStartIndex << 5;
+ const int portStart = _portStartIndex << 5;
+ const int pathStart = _pathStartIndex << 5;
+ const int queryStart = _queryStartIndex << 5;
+ const int fragmentStart = _fragmentStartIndex << 5;
+ const int notSimple = _notSimpleIndex << 5;
+
+ /// The `unreserved` characters of RFC 3986.
+ const unreserved =
+ "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-._~";
+
+ /// The `sub-delim` characters of RFC 3986.
+ const subDelims = r"!$&'()*+,;=";
+ // The `pchar` characters of RFC 3986: characters that may occur in a path,
+ // excluding escapes.
+ const pchar = "$unreserved$subDelims";
+
+ // One table of 96 transitions for each state, all initially zero.
+ var tables = List<Uint8List>.generate(stateCount, (_) => Uint8List(96));
+
+ // Helper function which initializes the table for [state] with a default
+ // transition and returns the table.
+ Uint8List build(state, defaultTransition) =>
+ tables[state]..fillRange(0, 96, defaultTransition);
+
+ // Helper function which sets the transition for each character in [chars]
+ // to [transition] in the [target] table.
+ // The [chars] string must contain only characters in the U+0020 .. U+007E
+ // range.
+ // The entry for a character is stored at index `char ^ 0x60`, and a later
+ // call overwrites any entry written by an earlier call.
+ void setChars(Uint8List target, String chars, int transition) {
+ for (int i = 0; i < chars.length; i++) {
+ var char = chars.codeUnitAt(i);
+ target[char ^ 0x60] = transition;
+ }
+ }
+
+ /// Helper function which sets the transition for all characters in the
+ /// range from `range[0]` to `range[1]` to [transition] in the [target] table.
+ ///
+ /// The [range] must be a two-character string where both characters are in
+ /// the U+0020 .. U+007E range and the former character must have a lower
+ /// code point than the latter.
+ void setRange(Uint8List target, String range, int transition) {
+ for (int i = range.codeUnitAt(0), n = range.codeUnitAt(1); i <= n; i++) {
+ target[i ^ 0x60] = transition;
+ }
+ }
+
+ // Create the transitions for each state.
+ // Each state first gets a default transition from `build`; the calls that
+ // follow override it for specific characters. The order matters: for
+ // example `.` is part of `pchar`, so its transition is set after `pchar`.
+ Uint8List b;
+
+ // Validate as path, if it is a scheme, we handle it later.
+ b = build(_uriStart, schemeOrPath | notSimple);
+ setChars(b, pchar, schemeOrPath);
+ setChars(b, ".", schemeOrPathDot);
+ setChars(b, ":", authOrPath | schemeEnd); // Handle later.
+ setChars(b, "/", authOrPathSlash);
+ setChars(b, "?", query | queryStart);
+ setChars(b, "#", fragment | fragmentStart);
+
+ b = build(schemeOrPathDot, schemeOrPath | notSimple);
+ setChars(b, pchar, schemeOrPath);
+ setChars(b, ".", schemeOrPathDot2);
+ setChars(b, ':', authOrPath | schemeEnd);
+ setChars(b, "/", pathSeg | notSimple);
+ setChars(b, "?", query | queryStart);
+ setChars(b, "#", fragment | fragmentStart);
+
+ b = build(schemeOrPathDot2, schemeOrPath | notSimple);
+ setChars(b, pchar, schemeOrPath);
+ // Same value as the default transition of this state, spelled out for `%`.
+ setChars(b, "%", schemeOrPath | notSimple);
+ setChars(b, ':', authOrPath | schemeEnd);
+ setChars(b, "/", relPathSeg);
+ setChars(b, "?", query | queryStart);
+ setChars(b, "#", fragment | fragmentStart);
+
+ b = build(schemeOrPath, schemeOrPath | notSimple);
+ setChars(b, pchar, schemeOrPath);
+ setChars(b, ':', authOrPath | schemeEnd);
+ setChars(b, "/", pathSeg);
+ setChars(b, "?", query | queryStart);
+ setChars(b, "#", fragment | fragmentStart);
+
+ b = build(authOrPath, path | notSimple);
+ setChars(b, pchar, path | pathStart);
+ setChars(b, "/", authOrPathSlash | pathStart);
+ setChars(b, ".", pathSegDot | pathStart);
+ setChars(b, "?", query | queryStart);
+ setChars(b, "#", fragment | fragmentStart);
+
+ b = build(authOrPathSlash, path | notSimple);
+ setChars(b, pchar, path);
+ setChars(b, "/", uinfoOrHost0 | hostStart);
+ setChars(b, ".", pathSegDot);
+ setChars(b, "?", query | queryStart);
+ setChars(b, "#", fragment | fragmentStart);
+
+ b = build(uinfoOrHost0, uinfoOrHost | notSimple);
+ setChars(b, pchar, uinfoOrHost);
+ // Upper-case letters are part of `pchar`; mark them non-simple again.
+ setRange(b, "AZ", uinfoOrHost | notSimple);
+ setChars(b, ":", uinfoOrPort0 | portStart);
+ setChars(b, "@", uinfoOrHost0 | hostStart);
+ setChars(b, "[", ipv6Host | notSimple);
+ setChars(b, "/", pathSeg | pathStart);
+ setChars(b, "?", query | queryStart);
+ setChars(b, "#", fragment | fragmentStart);
+
+ b = build(uinfoOrHost, uinfoOrHost | notSimple);
+ setChars(b, pchar, uinfoOrHost);
+ setRange(b, "AZ", uinfoOrHost | notSimple);
+ setChars(b, ":", uinfoOrPort0 | portStart);
+ setChars(b, "@", uinfoOrHost0 | hostStart);
+ setChars(b, "/", pathSeg | pathStart);
+ setChars(b, "?", query | queryStart);
+ setChars(b, "#", fragment | fragmentStart);
+
+ // Only `1`-`9` are accepted as the first character after the `:`; a leading
+ // `0` takes the default, non-simple transition.
+ b = build(uinfoOrPort0, uinfoOrPort | notSimple);
+ setRange(b, "19", uinfoOrPort);
+ setChars(b, "@", uinfoOrHost0 | hostStart);
+ setChars(b, "/", pathSeg | pathStart);
+ setChars(b, "?", query | queryStart);
+ setChars(b, "#", fragment | fragmentStart);
+
+ b = build(uinfoOrPort, uinfoOrPort | notSimple);
+ setRange(b, "09", uinfoOrPort);
+ setChars(b, "@", uinfoOrHost0 | hostStart);
+ setChars(b, "/", pathSeg | pathStart);
+ setChars(b, "?", query | queryStart);
+ setChars(b, "#", fragment | fragmentStart);
+
+ // Stay in this state until `]`. The non-simple mark is written by the `[`
+ // transition that enters the state, so no transition here writes it again.
+ b = build(ipv6Host, ipv6Host);
+ setChars(b, "]", uinfoOrHost);
+
+ b = build(relPathSeg, path | notSimple);
+ setChars(b, pchar, path);
+ setChars(b, ".", relPathSegDot);
+ setChars(b, "/", pathSeg | notSimple);
+ setChars(b, "?", query | queryStart);
+ setChars(b, "#", fragment | fragmentStart);
+
+ b = build(relPathSegDot, path | notSimple);
+ setChars(b, pchar, path);
+ setChars(b, ".", relPathSegDot2);
+ setChars(b, "/", pathSeg | notSimple);
+ setChars(b, "?", query | queryStart);
+ setChars(b, "#", fragment | fragmentStart);
+
+ b = build(relPathSegDot2, path | notSimple);
+ setChars(b, pchar, path);
+ setChars(b, "/", relPathSeg);
+ setChars(b, "?", query | queryStart); // This should be non-simple.
+ setChars(b, "#", fragment | fragmentStart); // This should be non-simple.
+
+ b = build(pathSeg, path | notSimple);
+ setChars(b, pchar, path);
+ setChars(b, ".", pathSegDot);
+ setChars(b, "/", pathSeg | notSimple);
+ setChars(b, "?", query | queryStart);
+ setChars(b, "#", fragment | fragmentStart);
+
+ b = build(pathSegDot, path | notSimple);
+ setChars(b, pchar, path);
+ setChars(b, ".", pathSegDot2);
+ setChars(b, "/", pathSeg | notSimple);
+ setChars(b, "?", query | queryStart);
+ setChars(b, "#", fragment | fragmentStart);
+
+ b = build(pathSegDot2, path | notSimple);
+ setChars(b, pchar, path);
+ setChars(b, "/", pathSeg | notSimple);
+ setChars(b, "?", query | queryStart);
+ setChars(b, "#", fragment | fragmentStart);
+
+ b = build(path, path | notSimple);
+ setChars(b, pchar, path);
+ setChars(b, "/", pathSeg);
+ setChars(b, "?", query | queryStart);
+ setChars(b, "#", fragment | fragmentStart);
+
+ b = build(query, query | notSimple);
+ setChars(b, pchar, query);
+ setChars(b, "?", query);
+ setChars(b, "#", fragment | fragmentStart);
+
+ b = build(fragment, fragment | notSimple);
+ setChars(b, pchar, fragment);
+ setChars(b, "?", fragment);
+
+ // A separate two-state validator for lower-case scheme names.
+ // Any non-scheme character or upper-case letter is marked as non-simple.
+ b = build(scheme0, scheme | notSimple);
+ setRange(b, "az", scheme);
+
+ b = build(scheme, scheme | notSimple);
+ setRange(b, "az", scheme);
+ setRange(b, "09", scheme);
+ setChars(b, "+-.", scheme);
+
+ return tables;
+}