tools/generate_uri_parser_tables.dart - sdk.git - Git at Google

 // Copyright (c) 2022, the Dart project authors.  Please see the AUTHORS file
 // for details. All rights reserved. Use of this source code is governed by a
 // BSD-style license that can be found in the LICENSE file.

 // ----------------------------------------------------------------------
 // Code to create the URI scanner table used by `uri.dart`.
 //
 // This file exists in case someone, some day, will want to change the
 // representation of the tables, maybe if Dart gets `Uint8List` literals.
 // It should not otherwise be necessary to re-generate the tables.
 //
 // The table is stored in the `uri.dart` file as a 1-byte string literal.
 // This script generates the string literal and prints it on stdout.
 // If passed the `-u filename` flag, it instead updates the file directly.
 // The file should be the `sdk/lib/core/uri.dart` file, which contains markers
 // showing where to insert the generated code.

 import "dart:io";
 import "dart:typed_data";

 /// Index of the position of that `:` after a scheme.
 const int _schemeEndIndex = 1;

 /// Index of the position of the character just before the host name.
 const int _hostStartIndex = 2;

 /// Index of the position of the `:` before a port value.
 const int _portStartIndex = 3;

 /// Index of the position of the first character of a path.
 const int _pathStartIndex = 4;

 /// Index of the position of the `?` before a query.
 const int _queryStartIndex = 5;

 /// Index of the position of the `#` before a fragment.
 const int _fragmentStartIndex = 6;

 /// Index of a position where the URI was determined to be "non-simple".
 const int _notSimpleIndex = 7;

 // Initial state for scanner.
 const int _uriStart = 0;

 // If scanning of a URI terminates in this state or above,
 // consider the URI non-simple
 const int _nonSimpleEndStates = 14;

 // Initial state for scheme validation.
 const int _schemeStart = 20;

 void main(List<String> args) {
   var tables = _createTables();
   var literalBuilder = StringLiteralBuilder("_scannerTables");
   for (var table in tables) {
     literalBuilder.writeBytes(table, hexAll: true);
   }
   var tableString = literalBuilder.close();

   var result = """
 // Use tools/generate_uri_parser_tables.dart to generate this code
 // if necessary.

 // --------------------------------------------------------------------
 // Constants used to read the scanner result.
 // The indices points into the table filled by [_scan] which contains
 // recognized positions in the scanned URI.
 // The `0` index is only used internally.

 /// Index of the position of that `:` after a scheme.
 const int _schemeEndIndex = $_schemeEndIndex;

 /// Index of the position of the character just before the host name.
 const int _hostStartIndex = $_hostStartIndex;

 /// Index of the position of the `:` before a port value.
 const int _portStartIndex = $_portStartIndex;

 /// Index of the position of the first character of a path.
 const int _pathStartIndex = $_pathStartIndex;

 /// Index of the position of the `?` before a query.
 const int _queryStartIndex = $_queryStartIndex;

 /// Index of the position of the `#` before a fragment.
 const int _fragmentStartIndex = $_fragmentStartIndex;

 /// Index of a position where the URI was determined to be "non-simple".
 const int _notSimpleIndex = $_notSimpleIndex;

 // Initial state for scanner.
 const int _uriStart = $_uriStart;

 // If scanning of a URI terminates in this state or above,
 // consider the URI non-simple
 const int _nonSimpleEndStates = $_nonSimpleEndStates;

 // Initial state for scheme validation.
 const int _schemeStart = $_schemeStart;

 // --------------------------------------------------------------------
 /// Transition tables are used to scan a URI to determine its structure.
 ///
 /// The tables represent a state machine with output.
 ///
 /// To scan the URI, start in the [_uriStart] state, then read each character
 /// of the URI in order, from start to end, and for each character perform a
 /// transition to a new state while writing the current position
 /// into the output buffer at a designated index.
 ///
 /// Each state, represented by an integer which is an index into
 /// [_scannerTables], has a set of transitions, one for each character.
 /// The transitions are encoded as a 5-bit integer representing the next state
 /// and a 3-bit index into the output table.
 ///
 /// For URI scanning, only characters in the range U+0020 through U+007E are
 /// interesting; all characters outside that range are treated the same.
 /// The tables only contain 96 entries, representing the 95 characters in the
 /// interesting range, and one entry for all values outside the range.
 /// The character entries are stored in one `String` of 96 characters per state,
 /// with the transition for a character at position `character ^ 0x60`,
 /// which maps the range U+0020 .. U+007F into positions 0 .. 95.
 /// All remaining characters are mapped to position 31 (`0x7f ^ 0x60`), which
 /// represents the transition for all remaining characters.
 $tableString
 // --------------------------------------------------------------------
 /// Scan a string using the [_scannerTables] state machine.
 ///
 /// Scans [uri] from [start] to [end], starting in state [state] and
 /// writing output into [indices].
 ///
 /// Returns the final state.
 int _scan(String uri, int start, int end, int state, List<int> indices) {
   const int stateTableSize = 96;
   assert(end <= uri.length);
   for (int i = start; i < end; i++) {
     // Xor with 0x60 to move range 0x20-0x7f into 0x00-0x5f
     int char = uri.codeUnitAt(i) ^ 0x60;
     // Use 0x1f (nee 0x7f) to represent all unhandled characters.
     if (char > 0x5f) char = 0x1f;
     int transition = _scannerTables.codeUnitAt(state * stateTableSize + char);
     state = transition & 0x1f;
     indices[transition >> 5] = i;
   }
   return state;
 }
 """;
   if (args.isEmpty || !args.first.startsWith("-u")) {
     print(result);
     return;
   }
   var arg = args.first;
   var filePath = "sdk/lib/core/uri.dart";
   // Default file location, if run from root of SDK.
   if (arg.length > 2) {
     filePath = arg.substring(2);
   } else if (args.length > 1) {
     filePath = args[1];
   }
   var file = File(filePath);
   if (!file.existsSync()) {
     stderr.writeln("Cannot find file: $filePath");
     exit(1);
   }
   var contents = file.readAsStringSync();
   var pattern = RegExp(r"^// --- URI PARSER TABLE --- (start|end) --- [^]*?^",
       multiLine: true);
   var matches = pattern.allMatches(contents).toList();
   if (matches.length != 2) {
     stderr.writeln("Cannot find marked section in file $filePath");
     exit(1);
   }
   var start = matches.first.end;
   var end = matches.last.start;
   var newContents = contents.replaceRange(start, end, result);
   if (newContents != contents) {
     file.writeAsStringSync(newContents);
     print("$filePath updated.");
   } else {
     stderr.writeln("No update needed.");
     return;
   }
 }

 /// Creates a literal of the form
 /// ```dart
 /// const String someName = "ab\x82azx......"
 ///     "more bytes and escapes \xff        "
 ///     "....";
 /// ```
 /// while escaping non-printable charactes, `"`, `$` and `\`,
 /// and trying to fit as many characters on each line as possible.
 class StringLiteralBuilder {
   final buffer = StringBuffer();
   String indent;
   var lineLength = 0;
   StringLiteralBuilder(String name, {int indent = 0})
       : indent = " " * (indent + 4) {
     if (indent > 0) buffer.write(" " * indent);
     buffer
       ..write("const String ")
       ..write(name)
       ..write(" = \"");
     lineLength = buffer.length;
   }

   void writeBytes(Uint8List bytes, {bool hexAll = false}) {
     for (var byte in bytes) {
       var string = hexAll ? hex(byte) : charString(byte);
       lineLength += string.length;
       if (lineLength > 79) {
         buffer
           ..write('"\n')
           ..write(indent)
           ..write('"');
         lineLength = indent.length + 1 + string.length;
       }
       buffer.write(string);
     }
   }

   /// Terminates the string literal.
   ///
   /// Do not call use builder after calling close.
   String close() {
     if (lineLength < 78) {
       buffer.write("\";\n");
     } else {
       buffer
         ..write("\"\n")
         ..write(indent)
         ..write(";\n");
     }
     return buffer.toString();
   }

   static String charString(int byte) {
     // Recognized characters that need escaping, or has a short escape.
     switch (byte) {
       case 0x08:
         return r"\b";
       case 0x09:
         return r"\t";
       case 0x0a:
         return r"\n";
       case 0x0b:
         return r"\v";
       case 0x0c:
         return r"\f";
       case 0x0d:
         return r"\r";
       case 0x22:
         return r'\"';
       case 0x5c:
         return r"\\";
       case 0x24:
         return r"\$";
     }
     // All control characters.
     if (byte & 0x60 == 0 || byte == 0x7F) {
       // 0x00 - 0x1F, 0x80 - 0xBF, 0x7F
       return hex(byte);
     }
     return String.fromCharCode(byte);
   }

   static String hex(int byte) {
     const digits = "0123456789ABCDEF";
     return "\\x${digits[byte >> 4]}${digits[byte & 0xf]}";
   }
 }

 /// Creates the tables for [_scannerTables] used by [Uri.parse].
 ///
 /// See [_scannerTables] for the generated format.
 ///
 /// The concrete tables are chosen as a trade-off between the number of states
 /// needed and the precision of the result.
 /// This allows definitely recognizing the general structure of the URI
 /// (presence and location of scheme, user-info, host, port, path, query and
 /// fragment) while at the same time detecting that some components are not
 /// in canonical form (anything containing a `%`, a host-name containing a
 /// capital letter). Since the scanner doesn't know whether something is a
 /// scheme or a path until it sees `:`, or user-info or host until it sees
 /// a `@`, a second pass is needed to validate the scheme and any user-info
 /// is considered non-canonical by default.
 ///
 /// The states (starting from [_uriStart]) write positions while scanning
 /// a string from `start` to `end` as follows:
 ///
 /// - [_schemeEndIndex]: Should be initialized to `start-1`.
 ///   If the URI has a scheme, it is set to the position of the `:` after
 ///   the scheme.
 /// - [_hostStartIndex]: Should be initialized to `start - 1`.
 ///   If the URI has an authority, it is set to the character before the
 ///   host name - either the second `/` in the `//` leading the authority,
 ///   or the `@` after a user-info. Comparing this value to the scheme end
 ///   position can be used to detect that there is a user-info component.
 /// - [_portStartIndex]: Should be initialized to `start`.
 ///   Set to the position of the last `:` in an authority, and unchanged
 ///   if there is no authority or no `:` in an authority.
 ///   If this position is after the host start, there is a port, otherwise it
 ///   is just marking a colon in the user-info component.
 /// - [_pathStartIndex]: Should be initialized to `start`.
 ///   Is set to the first path character unless the path is empty.
 ///   If the path is empty, the position is either unchanged (`start`) or
 ///   the first slash of an authority. So, if the path start is before a
 ///   host start or scheme end, the path is empty.
 /// - [_queryStartIndex]: Should be initialized to `end`.
 ///   The position of the `?` leading a query if the URI contains a query.
 /// - [_fragmentStartIndex]: Should be initialized to `end`.
 ///   The position of the `#` leading a fragment if the URI contains a fragment.
 /// - [_notSimpleIndex]: Should be initialized to `start - 1`.
 ///   Set to another value if the URI is considered "not simple".
 ///   This is elaborated below.
 ///
 /// # Simple URIs
 /// A URI is considered "simple" if it is in a normalized form containing no
 /// escapes. This allows us to skip normalization and checking whether escapes
 /// are valid, and to extract components without worrying about unescaping.
 ///
 /// The scanner computes a conservative approximation of being "simple".
 /// It rejects any URI with an escape, with a user-info component (mainly
 /// because they are rare and would increase the number of states in the
 /// scanner significantly), with an IPV6 host or with a capital letter in
 /// the scheme or host name (the scheme is handled in a second scan using
 /// a separate two-state table).
 /// Further, paths containing `..` or `.` path segments are considered
 /// non-simple except for pure relative paths (no scheme or authority) starting
 /// with a sequence of "../" segments.
 ///
 /// The transition tables cannot detect a trailing ".." in the path,
 /// followed by a query or fragment, because the segment is not known to be
 /// complete until we are past it, and we then need to store the query/fragment
 /// start instead. This cast is checked manually post-scanning (such a path
 /// needs to be normalized to end in "../", so the URI shouldn't be considered
 /// simple).
 List<Uint8List> _createTables() {
   // Total number of states for the scanner.
   const int stateCount = 22;

   // States used to scan a URI from scratch.
   const int schemeOrPath = 01;
   const int authOrPath = 02;
   const int authOrPathSlash = 03;
   const int uinfoOrHost0 = 04;
   const int uinfoOrHost = 05;
   const int uinfoOrPort0 = 06;
   const int uinfoOrPort = 07;
   const int ipv6Host = 08;
   const int relPathSeg = 09;
   const int pathSeg = 10;
   const int path = 11;
   const int query = 12;
   const int fragment = 13;
   const int schemeOrPathDot = 14;
   const int schemeOrPathDot2 = 15;
   const int relPathSegDot = 16;
   const int relPathSegDot2 = 17;
   const int pathSegDot = 18;
   const int pathSegDot2 = 19;

   // States used to validate a scheme after its end position has been found.
   const int scheme0 = _schemeStart;
   const int scheme = 21;

   // Constants encoding the write-index for the state transition into the top 5
   // bits of a byte.
   const int schemeEnd = _schemeEndIndex << 5;
   const int hostStart = _hostStartIndex << 5;
   const int portStart = _portStartIndex << 5;
   const int pathStart = _pathStartIndex << 5;
   const int queryStart = _queryStartIndex << 5;
   const int fragmentStart = _fragmentStartIndex << 5;
   const int notSimple = _notSimpleIndex << 5;

   /// The `unreserved` characters of RFC 3986.
   const unreserved =
       "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-._~";

   /// The `sub-delim` characters of RFC 3986.
   const subDelims = r"!$&'()*+,;=";
   // The `pchar` characters of RFC 3986: characters that may occur in a path,
   // excluding escapes.
   const pchar = "$unreserved$subDelims";

   var tables = List<Uint8List>.generate(stateCount, (_) => Uint8List(96));

   // Helper function which initialize the table for [state] with a default
   // transition and returns the table.
   Uint8List build(state, defaultTransition) =>
       tables[state]..fillRange(0, 96, defaultTransition);

   // Helper function which sets the transition for each character in [chars]
   // to [transition] in the [target] table.
   // The [chars] string must contain only characters in the U+0020 .. U+007E
   // range.
   void setChars(Uint8List target, String chars, int transition) {
     for (int i = 0; i < chars.length; i++) {
       var char = chars.codeUnitAt(i);
       target[char ^ 0x60] = transition;
     }
   }

   /// Helper function which sets the transition for all characters in the
   /// range from `range[0]` to `range[1]` to [transition] in the [target] table.
   ///
   /// The [range] must be a two-character string where both characters are in
   /// the U+0020 .. U+007E range and the former character must have a lower
   /// code point than the latter.
   void setRange(Uint8List target, String range, int transition) {
     for (int i = range.codeUnitAt(0), n = range.codeUnitAt(1); i <= n; i++) {
       target[i ^ 0x60] = transition;
     }
   }

   // Create the transitions for each state.
   Uint8List b;

   // Validate as path, if it is a scheme, we handle it later.
   b = build(_uriStart, schemeOrPath | notSimple);
   setChars(b, pchar, schemeOrPath);
   setChars(b, ".", schemeOrPathDot);
   setChars(b, ":", authOrPath | schemeEnd); // Handle later.
   setChars(b, "/", authOrPathSlash);
   setChars(b, "?", query | queryStart);
   setChars(b, "#", fragment | fragmentStart);

   b = build(schemeOrPathDot, schemeOrPath | notSimple);
   setChars(b, pchar, schemeOrPath);
   setChars(b, ".", schemeOrPathDot2);
   setChars(b, ':', authOrPath | schemeEnd);
   setChars(b, "/", pathSeg | notSimple);
   setChars(b, "?", query | queryStart);
   setChars(b, "#", fragment | fragmentStart);

   b = build(schemeOrPathDot2, schemeOrPath | notSimple);
   setChars(b, pchar, schemeOrPath);
   setChars(b, "%", schemeOrPath | notSimple);
   setChars(b, ':', authOrPath | schemeEnd);
   setChars(b, "/", relPathSeg);
   setChars(b, "?", query | queryStart);
   setChars(b, "#", fragment | fragmentStart);

   b = build(schemeOrPath, schemeOrPath | notSimple);
   setChars(b, pchar, schemeOrPath);
   setChars(b, ':', authOrPath | schemeEnd);
   setChars(b, "/", pathSeg);
   setChars(b, "?", query | queryStart);
   setChars(b, "#", fragment | fragmentStart);

   b = build(authOrPath, path | notSimple);
   setChars(b, pchar, path | pathStart);
   setChars(b, "/", authOrPathSlash | pathStart);
   setChars(b, ".", pathSegDot | pathStart);
   setChars(b, "?", query | queryStart);
   setChars(b, "#", fragment | fragmentStart);

   b = build(authOrPathSlash, path | notSimple);
   setChars(b, pchar, path);
   setChars(b, "/", uinfoOrHost0 | hostStart);
   setChars(b, ".", pathSegDot);
   setChars(b, "?", query | queryStart);
   setChars(b, "#", fragment | fragmentStart);

   b = build(uinfoOrHost0, uinfoOrHost | notSimple);
   setChars(b, pchar, uinfoOrHost);
   setRange(b, "AZ", uinfoOrHost | notSimple);
   setChars(b, ":", uinfoOrPort0 | portStart);
   setChars(b, "@", uinfoOrHost0 | hostStart);
   setChars(b, "[", ipv6Host | notSimple);
   setChars(b, "/", pathSeg | pathStart);
   setChars(b, "?", query | queryStart);
   setChars(b, "#", fragment | fragmentStart);

   b = build(uinfoOrHost, uinfoOrHost | notSimple);
   setChars(b, pchar, uinfoOrHost);
   setRange(b, "AZ", uinfoOrHost | notSimple);
   setChars(b, ":", uinfoOrPort0 | portStart);
   setChars(b, "@", uinfoOrHost0 | hostStart);
   setChars(b, "/", pathSeg | pathStart);
   setChars(b, "?", query | queryStart);
   setChars(b, "#", fragment | fragmentStart);

   b = build(uinfoOrPort0, uinfoOrPort | notSimple);
   setRange(b, "19", uinfoOrPort);
   setChars(b, "@", uinfoOrHost0 | hostStart);
   setChars(b, "/", pathSeg | pathStart);
   setChars(b, "?", query | queryStart);
   setChars(b, "#", fragment | fragmentStart);

   b = build(uinfoOrPort, uinfoOrPort | notSimple);
   setRange(b, "09", uinfoOrPort);
   setChars(b, "@", uinfoOrHost0 | hostStart);
   setChars(b, "/", pathSeg | pathStart);
   setChars(b, "?", query | queryStart);
   setChars(b, "#", fragment | fragmentStart);

   b = build(ipv6Host, ipv6Host);
   setChars(b, "]", uinfoOrHost);

   b = build(relPathSeg, path | notSimple);
   setChars(b, pchar, path);
   setChars(b, ".", relPathSegDot);
   setChars(b, "/", pathSeg | notSimple);
   setChars(b, "?", query | queryStart);
   setChars(b, "#", fragment | fragmentStart);

   b = build(relPathSegDot, path | notSimple);
   setChars(b, pchar, path);
   setChars(b, ".", relPathSegDot2);
   setChars(b, "/", pathSeg | notSimple);
   setChars(b, "?", query | queryStart);
   setChars(b, "#", fragment | fragmentStart);

   b = build(relPathSegDot2, path | notSimple);
   setChars(b, pchar, path);
   setChars(b, "/", relPathSeg);
   setChars(b, "?", query | queryStart); // This should be non-simple.
   setChars(b, "#", fragment | fragmentStart); // This should be non-simple.

   b = build(pathSeg, path | notSimple);
   setChars(b, pchar, path);
   setChars(b, ".", pathSegDot);
   setChars(b, "/", pathSeg | notSimple);
   setChars(b, "?", query | queryStart);
   setChars(b, "#", fragment | fragmentStart);

   b = build(pathSegDot, path | notSimple);
   setChars(b, pchar, path);
   setChars(b, ".", pathSegDot2);
   setChars(b, "/", pathSeg | notSimple);
   setChars(b, "?", query | queryStart);
   setChars(b, "#", fragment | fragmentStart);

   b = build(pathSegDot2, path | notSimple);
   setChars(b, pchar, path);
   setChars(b, "/", pathSeg | notSimple);
   setChars(b, "?", query | queryStart);
   setChars(b, "#", fragment | fragmentStart);

   b = build(path, path | notSimple);
   setChars(b, pchar, path);
   setChars(b, "/", pathSeg);
   setChars(b, "?", query | queryStart);
   setChars(b, "#", fragment | fragmentStart);

   b = build(query, query | notSimple);
   setChars(b, pchar, query);
   setChars(b, "?", query);
   setChars(b, "#", fragment | fragmentStart);

   b = build(fragment, fragment | notSimple);
   setChars(b, pchar, fragment);
   setChars(b, "?", fragment);

   // A separate two-state validator for lower-case scheme names.
   // Any non-scheme character or upper-case letter is marked as non-simple.
   b = build(scheme0, scheme | notSimple);
   setRange(b, "az", scheme);

   b = build(scheme, scheme | notSimple);
   setRange(b, "az", scheme);
   setRange(b, "09", scheme);
   setChars(b, "+-.", scheme);

   return tables;
 }
	// Copyright (c) 2022, the Dart project authors. Please see the AUTHORS file
	// for details. All rights reserved. Use of this source code is governed by a
	// BSD-style license that can be found in the LICENSE file.

	// ----------------------------------------------------------------------
	// Code to create the URI scanner table used by `uri.dart`.
	//
	// This file exists in case someone, some day, will want to change the
	// representation of the tables, maybe if Dart gets `Uint8List` literals.
	// It should not otherwise be necessary to re-generate the tables.
	//
	// The table is stored in the `uri.dart` file as a 1-byte string literal.
	// This script generates the string literal and prints it on stdout.
	// If passed the `-u filename` flag, it instead updates the file directly.
	// The file should be the `sdk/lib/core/uri.dart` file, which contains markers
	// showing where to insert the generated code.

	import "dart:io";
	import "dart:typed_data";

	/// Index of the position of that `:` after a scheme.
	const int _schemeEndIndex = 1;

	/// Index of the position of the character just before the host name.
	const int _hostStartIndex = 2;

	/// Index of the position of the `:` before a port value.
	const int _portStartIndex = 3;

	/// Index of the position of the first character of a path.
	const int _pathStartIndex = 4;

	/// Index of the position of the `?` before a query.
	const int _queryStartIndex = 5;

	/// Index of the position of the `#` before a fragment.
	const int _fragmentStartIndex = 6;

	/// Index of a position where the URI was determined to be "non-simple".
	const int _notSimpleIndex = 7;

	// Initial state for scanner.
	const int _uriStart = 0;

	// If scanning of a URI terminates in this state or above,
	// consider the URI non-simple
	const int _nonSimpleEndStates = 14;

	// Initial state for scheme validation.
	const int _schemeStart = 20;

	void main(List<String> args) {
	var tables = _createTables();
	var literalBuilder = StringLiteralBuilder("_scannerTables");
	for (var table in tables) {
	literalBuilder.writeBytes(table, hexAll: true);
	}
	var tableString = literalBuilder.close();

	var result = """
	// Use tools/generate_uri_parser_tables.dart to generate this code
	// if necessary.

	// --------------------------------------------------------------------
	// Constants used to read the scanner result.
	// The indices points into the table filled by [_scan] which contains
	// recognized positions in the scanned URI.
	// The `0` index is only used internally.

	/// Index of the position of that `:` after a scheme.
	const int _schemeEndIndex = $_schemeEndIndex;

	/// Index of the position of the character just before the host name.
	const int _hostStartIndex = $_hostStartIndex;

	/// Index of the position of the `:` before a port value.
	const int _portStartIndex = $_portStartIndex;

	/// Index of the position of the first character of a path.
	const int _pathStartIndex = $_pathStartIndex;

	/// Index of the position of the `?` before a query.
	const int _queryStartIndex = $_queryStartIndex;

	/// Index of the position of the `#` before a fragment.
	const int _fragmentStartIndex = $_fragmentStartIndex;

	/// Index of a position where the URI was determined to be "non-simple".
	const int _notSimpleIndex = $_notSimpleIndex;

	// Initial state for scanner.
	const int _uriStart = $_uriStart;

	// If scanning of a URI terminates in this state or above,
	// consider the URI non-simple
	const int _nonSimpleEndStates = $_nonSimpleEndStates;

	// Initial state for scheme validation.
	const int _schemeStart = $_schemeStart;

	// --------------------------------------------------------------------
	/// Transition tables are used to scan a URI to determine its structure.
	///
	/// The tables represent a state machine with output.
	///
	/// To scan the URI, start in the [_uriStart] state, then read each character
	/// of the URI in order, from start to end, and for each character perform a
	/// transition to a new state while writing the current position
	/// into the output buffer at a designated index.
	///
	/// Each state, represented by an integer which is an index into
	/// [_scannerTables], has a set of transitions, one for each character.
	/// The transitions are encoded as a 5-bit integer representing the next state
	/// and a 3-bit index into the output table.
	///
	/// For URI scanning, only characters in the range U+0020 through U+007E are
	/// interesting; all characters outside that range are treated the same.
	/// The tables only contain 96 entries, representing the 95 characters in the
	/// interesting range, and one entry for all values outside the range.
	/// The character entries are stored in one `String` of 96 characters per state,
	/// with the transition for a character at position `character ^ 0x60`,
	/// which maps the range U+0020 .. U+007F into positions 0 .. 95.
	/// All remaining characters are mapped to position 31 (`0x7f ^ 0x60`), which
	/// represents the transition for all remaining characters.
	$tableString
	// --------------------------------------------------------------------
	/// Scan a string using the [_scannerTables] state machine.
	///
	/// Scans [uri] from [start] to [end], starting in state [state] and
	/// writing output into [indices].
	///
	/// Returns the final state.
	int _scan(String uri, int start, int end, int state, List<int> indices) {
	const int stateTableSize = 96;
	assert(end <= uri.length);
	for (int i = start; i < end; i++) {
	// Xor with 0x60 to move range 0x20-0x7f into 0x00-0x5f
	int char = uri.codeUnitAt(i) ^ 0x60;
	// Use 0x1f (nee 0x7f) to represent all unhandled characters.
	if (char > 0x5f) char = 0x1f;
	int transition = _scannerTables.codeUnitAt(state * stateTableSize + char);
	state = transition & 0x1f;
	indices[transition >> 5] = i;
	}
	return state;
	}
	""";
	if (args.isEmpty \|\| !args.first.startsWith("-u")) {
	print(result);
	return;
	}
	var arg = args.first;
	var filePath = "sdk/lib/core/uri.dart";
	// Default file location, if run from root of SDK.
	if (arg.length > 2) {
	filePath = arg.substring(2);
	} else if (args.length > 1) {
	filePath = args[1];
	}
	var file = File(filePath);
	if (!file.existsSync()) {
	stderr.writeln("Cannot find file: $filePath");
	exit(1);
	}
	var contents = file.readAsStringSync();
	var pattern = RegExp(r"^// --- URI PARSER TABLE --- (start\|end) --- [^]*?^",
	multiLine: true);
	var matches = pattern.allMatches(contents).toList();
	if (matches.length != 2) {
	stderr.writeln("Cannot find marked section in file $filePath");
	exit(1);
	}
	var start = matches.first.end;
	var end = matches.last.start;
	var newContents = contents.replaceRange(start, end, result);
	if (newContents != contents) {
	file.writeAsStringSync(newContents);
	print("$filePath updated.");
	} else {
	stderr.writeln("No update needed.");
	return;
	}
	}

	/// Creates a literal of the form
	/// ```dart
	/// const String someName = "ab\x82azx......"
	/// "more bytes and escapes \xff "
	/// "....";
	/// ```
	/// while escaping non-printable charactes, `"`, `$` and `\`,
	/// and trying to fit as many characters on each line as possible.
	class StringLiteralBuilder {
	final buffer = StringBuffer();
	String indent;
	var lineLength = 0;
	StringLiteralBuilder(String name, {int indent = 0})
	: indent = " " * (indent + 4) {
	if (indent > 0) buffer.write(" " * indent);
	buffer
	..write("const String ")
	..write(name)
	..write(" = \"");
	lineLength = buffer.length;
	}

	void writeBytes(Uint8List bytes, {bool hexAll = false}) {
	for (var byte in bytes) {
	var string = hexAll ? hex(byte) : charString(byte);
	lineLength += string.length;
	if (lineLength > 79) {
	buffer
	..write('"\n')
	..write(indent)
	..write('"');
	lineLength = indent.length + 1 + string.length;
	}
	buffer.write(string);
	}
	}

	/// Terminates the string literal.
	///
	/// Do not call use builder after calling close.
	String close() {
	if (lineLength < 78) {
	buffer.write("\";\n");
	} else {
	buffer
	..write("\"\n")
	..write(indent)
	..write(";\n");
	}
	return buffer.toString();
	}

	static String charString(int byte) {
	// Recognized characters that need escaping, or has a short escape.
	switch (byte) {
	case 0x08:
	return r"\b";
	case 0x09:
	return r"\t";
	case 0x0a:
	return r"\n";
	case 0x0b:
	return r"\v";
	case 0x0c:
	return r"\f";
	case 0x0d:
	return r"\r";
	case 0x22:
	return r'\"';
	case 0x5c:
	return r"\\";
	case 0x24:
	return r"\$";
	}
	// All control characters.
	if (byte & 0x60 == 0 \|\| byte == 0x7F) {
	// 0x00 - 0x1F, 0x80 - 0xBF, 0x7F
	return hex(byte);
	}
	return String.fromCharCode(byte);
	}

	static String hex(int byte) {
	const digits = "0123456789ABCDEF";
	return "\\x${digits[byte >> 4]}${digits[byte & 0xf]}";
	}
	}

	/// Creates the tables for [_scannerTables] used by [Uri.parse].
	///
	/// See [_scannerTables] for the generated format.
	///
	/// The concrete tables are chosen as a trade-off between the number of states
	/// needed and the precision of the result.
	/// This allows definitely recognizing the general structure of the URI
	/// (presence and location of scheme, user-info, host, port, path, query and
	/// fragment) while at the same time detecting that some components are not
	/// in canonical form (anything containing a `%`, a host-name containing a
	/// capital letter). Since the scanner doesn't know whether something is a
	/// scheme or a path until it sees `:`, or user-info or host until it sees
	/// a `@`, a second pass is needed to validate the scheme and any user-info
	/// is considered non-canonical by default.
	///
	/// The states (starting from [_uriStart]) write positions while scanning
	/// a string from `start` to `end` as follows:
	///
	/// - [_schemeEndIndex]: Should be initialized to `start-1`.
	/// If the URI has a scheme, it is set to the position of the `:` after
	/// the scheme.
	/// - [_hostStartIndex]: Should be initialized to `start - 1`.
	/// If the URI has an authority, it is set to the character before the
	/// host name - either the second `/` in the `//` leading the authority,
	/// or the `@` after a user-info. Comparing this value to the scheme end
	/// position can be used to detect that there is a user-info component.
	/// - [_portStartIndex]: Should be initialized to `start`.
	/// Set to the position of the last `:` in an authority, and unchanged
	/// if there is no authority or no `:` in an authority.
	/// If this position is after the host start, there is a port, otherwise it
	/// is just marking a colon in the user-info component.
	/// - [_pathStartIndex]: Should be initialized to `start`.
	/// Is set to the first path character unless the path is empty.
	/// If the path is empty, the position is either unchanged (`start`) or
	/// the first slash of an authority. So, if the path start is before a
	/// host start or scheme end, the path is empty.
	/// - [_queryStartIndex]: Should be initialized to `end`.
	/// The position of the `?` leading a query if the URI contains a query.
	/// - [_fragmentStartIndex]: Should be initialized to `end`.
	/// The position of the `#` leading a fragment if the URI contains a fragment.
	/// - [_notSimpleIndex]: Should be initialized to `start - 1`.
	/// Set to another value if the URI is considered "not simple".
	/// This is elaborated below.
	///
	/// # Simple URIs
	/// A URI is considered "simple" if it is in a normalized form containing no
	/// escapes. This allows us to skip normalization and checking whether escapes
	/// are valid, and to extract components without worrying about unescaping.
	///
	/// The scanner computes a conservative approximation of being "simple".
	/// It rejects any URI with an escape, with a user-info component (mainly
	/// because they are rare and would increase the number of states in the
	/// scanner significantly), with an IPV6 host or with a capital letter in
	/// the scheme or host name (the scheme is handled in a second scan using
	/// a separate two-state table).
	/// Further, paths containing `..` or `.` path segments are considered
	/// non-simple except for pure relative paths (no scheme or authority) starting
	/// with a sequence of "../" segments.
	///
	/// The transition tables cannot detect a trailing ".." in the path,
	/// followed by a query or fragment, because the segment is not known to be
	/// complete until we are past it, and we then need to store the query/fragment
	/// start instead. This cast is checked manually post-scanning (such a path
	/// needs to be normalized to end in "../", so the URI shouldn't be considered
	/// simple).
	List<Uint8List> _createTables() {
	// Total number of states for the scanner.
	const int stateCount = 22;

	// States used to scan a URI from scratch.
	const int schemeOrPath = 01;
	const int authOrPath = 02;
	const int authOrPathSlash = 03;
	const int uinfoOrHost0 = 04;
	const int uinfoOrHost = 05;
	const int uinfoOrPort0 = 06;
	const int uinfoOrPort = 07;
	const int ipv6Host = 08;
	const int relPathSeg = 09;
	const int pathSeg = 10;
	const int path = 11;
	const int query = 12;
	const int fragment = 13;
	const int schemeOrPathDot = 14;
	const int schemeOrPathDot2 = 15;
	const int relPathSegDot = 16;
	const int relPathSegDot2 = 17;
	const int pathSegDot = 18;
	const int pathSegDot2 = 19;

	// States used to validate a scheme after its end position has been found.
	const int scheme0 = _schemeStart;
	const int scheme = 21;

	// Constants encoding the write-index for the state transition into the top 5
	// bits of a byte.
	const int schemeEnd = _schemeEndIndex << 5;
	const int hostStart = _hostStartIndex << 5;
	const int portStart = _portStartIndex << 5;
	const int pathStart = _pathStartIndex << 5;
	const int queryStart = _queryStartIndex << 5;
	const int fragmentStart = _fragmentStartIndex << 5;
	const int notSimple = _notSimpleIndex << 5;

	/// The `unreserved` characters of RFC 3986.
	const unreserved =
	"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-._~";

	/// The `sub-delim` characters of RFC 3986.
	const subDelims = r"!$&'()*+,;=";
	// The `pchar` characters of RFC 3986: characters that may occur in a path,
	// excluding escapes.
	const pchar = "$unreserved$subDelims";

	var tables = List<Uint8List>.generate(stateCount, (_) => Uint8List(96));

	// Helper function which initialize the table for [state] with a default
	// transition and returns the table.
	Uint8List build(state, defaultTransition) =>
	tables[state]..fillRange(0, 96, defaultTransition);

	// Helper function which sets the transition for each character in [chars]
	// to [transition] in the [target] table.
	// The [chars] string must contain only characters in the U+0020 .. U+007E
	// range.
	void setChars(Uint8List target, String chars, int transition) {
	for (int i = 0; i < chars.length; i++) {
	var char = chars.codeUnitAt(i);
	target[char ^ 0x60] = transition;
	}
	}

	/// Helper function which sets the transition for all characters in the
	/// range from `range[0]` to `range[1]` to [transition] in the [target] table.
	///
	/// The [range] must be a two-character string where both characters are in
	/// the U+0020 .. U+007E range and the former character must have a lower
	/// code point than the latter.
	void setRange(Uint8List target, String range, int transition) {
	for (int i = range.codeUnitAt(0), n = range.codeUnitAt(1); i <= n; i++) {
	target[i ^ 0x60] = transition;
	}
	}

	// Create the transitions for each state.
	Uint8List b;

	// Validate as path, if it is a scheme, we handle it later.
	b = build(_uriStart, schemeOrPath \| notSimple);
	setChars(b, pchar, schemeOrPath);
	setChars(b, ".", schemeOrPathDot);
	setChars(b, ":", authOrPath \| schemeEnd); // Handle later.
	setChars(b, "/", authOrPathSlash);
	setChars(b, "?", query \| queryStart);
	setChars(b, "#", fragment \| fragmentStart);

	b = build(schemeOrPathDot, schemeOrPath \| notSimple);
	setChars(b, pchar, schemeOrPath);
	setChars(b, ".", schemeOrPathDot2);
	setChars(b, ':', authOrPath \| schemeEnd);
	setChars(b, "/", pathSeg \| notSimple);
	setChars(b, "?", query \| queryStart);
	setChars(b, "#", fragment \| fragmentStart);

	b = build(schemeOrPathDot2, schemeOrPath \| notSimple);
	setChars(b, pchar, schemeOrPath);
	setChars(b, "%", schemeOrPath \| notSimple);
	setChars(b, ':', authOrPath \| schemeEnd);
	setChars(b, "/", relPathSeg);
	setChars(b, "?", query \| queryStart);
	setChars(b, "#", fragment \| fragmentStart);

	b = build(schemeOrPath, schemeOrPath \| notSimple);
	setChars(b, pchar, schemeOrPath);
	setChars(b, ':', authOrPath \| schemeEnd);
	setChars(b, "/", pathSeg);
	setChars(b, "?", query \| queryStart);
	setChars(b, "#", fragment \| fragmentStart);

	b = build(authOrPath, path \| notSimple);
	setChars(b, pchar, path \| pathStart);
	setChars(b, "/", authOrPathSlash \| pathStart);
	setChars(b, ".", pathSegDot \| pathStart);
	setChars(b, "?", query \| queryStart);
	setChars(b, "#", fragment \| fragmentStart);

	b = build(authOrPathSlash, path \| notSimple);
	setChars(b, pchar, path);
	setChars(b, "/", uinfoOrHost0 \| hostStart);
	setChars(b, ".", pathSegDot);
	setChars(b, "?", query \| queryStart);
	setChars(b, "#", fragment \| fragmentStart);

	b = build(uinfoOrHost0, uinfoOrHost \| notSimple);
	setChars(b, pchar, uinfoOrHost);
	setRange(b, "AZ", uinfoOrHost \| notSimple);
	setChars(b, ":", uinfoOrPort0 \| portStart);
	setChars(b, "@", uinfoOrHost0 \| hostStart);
	setChars(b, "[", ipv6Host \| notSimple);
	setChars(b, "/", pathSeg \| pathStart);
	setChars(b, "?", query \| queryStart);
	setChars(b, "#", fragment \| fragmentStart);

	b = build(uinfoOrHost, uinfoOrHost \| notSimple);
	setChars(b, pchar, uinfoOrHost);
	setRange(b, "AZ", uinfoOrHost \| notSimple);
	setChars(b, ":", uinfoOrPort0 \| portStart);
	setChars(b, "@", uinfoOrHost0 \| hostStart);
	setChars(b, "/", pathSeg \| pathStart);
	setChars(b, "?", query \| queryStart);
	setChars(b, "#", fragment \| fragmentStart);

	b = build(uinfoOrPort0, uinfoOrPort \| notSimple);
	setRange(b, "19", uinfoOrPort);
	setChars(b, "@", uinfoOrHost0 \| hostStart);
	setChars(b, "/", pathSeg \| pathStart);
	setChars(b, "?", query \| queryStart);
	setChars(b, "#", fragment \| fragmentStart);

	b = build(uinfoOrPort, uinfoOrPort \| notSimple);
	setRange(b, "09", uinfoOrPort);
	setChars(b, "@", uinfoOrHost0 \| hostStart);
	setChars(b, "/", pathSeg \| pathStart);
	setChars(b, "?", query \| queryStart);
	setChars(b, "#", fragment \| fragmentStart);

	b = build(ipv6Host, ipv6Host);
	setChars(b, "]", uinfoOrHost);

	b = build(relPathSeg, path \| notSimple);
	setChars(b, pchar, path);
	setChars(b, ".", relPathSegDot);
	setChars(b, "/", pathSeg \| notSimple);
	setChars(b, "?", query \| queryStart);
	setChars(b, "#", fragment \| fragmentStart);

	b = build(relPathSegDot, path \| notSimple);
	setChars(b, pchar, path);
	setChars(b, ".", relPathSegDot2);
	setChars(b, "/", pathSeg \| notSimple);
	setChars(b, "?", query \| queryStart);
	setChars(b, "#", fragment \| fragmentStart);

	b = build(relPathSegDot2, path \| notSimple);
	setChars(b, pchar, path);
	setChars(b, "/", relPathSeg);
	setChars(b, "?", query \| queryStart); // This should be non-simple.
	setChars(b, "#", fragment \| fragmentStart); // This should be non-simple.

	b = build(pathSeg, path \| notSimple);
	setChars(b, pchar, path);
	setChars(b, ".", pathSegDot);
	setChars(b, "/", pathSeg \| notSimple);
	setChars(b, "?", query \| queryStart);
	setChars(b, "#", fragment \| fragmentStart);

	b = build(pathSegDot, path \| notSimple);
	setChars(b, pchar, path);
	setChars(b, ".", pathSegDot2);
	setChars(b, "/", pathSeg \| notSimple);
	setChars(b, "?", query \| queryStart);
	setChars(b, "#", fragment \| fragmentStart);

	b = build(pathSegDot2, path \| notSimple);
	setChars(b, pchar, path);
	setChars(b, "/", pathSeg \| notSimple);
	setChars(b, "?", query \| queryStart);
	setChars(b, "#", fragment \| fragmentStart);

	b = build(path, path \| notSimple);
	setChars(b, pchar, path);
	setChars(b, "/", pathSeg);
	setChars(b, "?", query \| queryStart);
	setChars(b, "#", fragment \| fragmentStart);

	b = build(query, query \| notSimple);
	setChars(b, pchar, query);
	setChars(b, "?", query);
	setChars(b, "#", fragment \| fragmentStart);

	b = build(fragment, fragment \| notSimple);
	setChars(b, pchar, fragment);
	setChars(b, "?", fragment);

	// A separate two-state validator for lower-case scheme names.
	// Any non-scheme character or upper-case letter is marked as non-simple.
	b = build(scheme0, scheme \| notSimple);
	setRange(b, "az", scheme);

	b = build(scheme, scheme \| notSimple);
	setRange(b, "az", scheme);
	setRange(b, "09", scheme);
	setChars(b, "+-.", scheme);

	return tables;
	}