[CFE] Improve [StringCanonicalizer] implementation

The current [StringCanonicalizer] implementation has some issues:

  * It hangs on to large [Uint8List]/[String] objects in its cache
    => This requires users (such as analyzer) to clear the cache
       frequently

  * Has an API that works on dynamic input (which is assumed to be String
    or List<int> / Uint8List)
    => Call sites have typing information we lose when doing the call

  * It uses dynamic [] calls to compare input bytes with cached bytes /
    input strings with cached strings
    => Dynamic calls come with overhead
    => Will cause substring generation for every character comparison
    => Will compare bytes with strings (which doesn't make sense)

To address these issues we

  * Use the canonicalized [String] to compare against instead of the
    (much larger) source strings, thereby no longer hanging on to large
    strings in the canonicalizer cache (it's still an issue with
    [Uint8List]s though)

  * Make separate API for canonicalization of strings, sub-strings or
    sub-utf8-bytes and use it from the token implementation.

  * For canonicalization of strings use String.== (instead of
    char-by-char comparison)

  * For canonicalization of sub-strings use String.codeUnitAt instead of
    [] (which creates substrings)

  * Separate out cache node entries into two classes and reduce memory
    consumption of the nodes that represent strings by 16 bytes (it
    does an additional `is` check on lookups in the cache, but that is
    better than paying for dynamic calls on the payload - which
    causes the compiler to do implicit checks)

=> This CL reduces RAM consumption and makes CFE scan/scan_bytes benchmarks a little faster.

TEST=ci

Change-Id: I157c298d26d25ac5da82c32eedfa270a590156f0
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/255121
Commit-Queue: Martin Kustermann <kustermann@google.com>
Reviewed-by: Jens Johansen <jensj@google.com>
diff --git a/pkg/_fe_analyzer_shared/lib/src/scanner/string_canonicalizer.dart b/pkg/_fe_analyzer_shared/lib/src/scanner/string_canonicalizer.dart
index 966e811..68506c0 100644
--- a/pkg/_fe_analyzer_shared/lib/src/scanner/string_canonicalizer.dart
+++ b/pkg/_fe_analyzer_shared/lib/src/scanner/string_canonicalizer.dart
@@ -6,13 +6,30 @@
 
 import 'dart:convert';
 
-class Node {
-  dynamic /* String | List<int> */ data;
-  int start;
-  int end;
-  String payload;
+abstract class Node {
+  final String payload;
   Node? next;
-  Node(this.data, this.start, this.end, this.payload, this.next);
+
+  Node(this.payload, this.next);
+
+  int get hash;
+}
+
+class StringNode extends Node {
+  StringNode(super.payload, super.next);
+
+  int get hash => StringCanonicalizer.hashString(payload, 0, payload.length);
+}
+
+class Utf8Node extends Node {
+  final List<int> data;
+  final int start;
+  final int end;
+
+  Utf8Node(this.data, this.start, this.end, String payload, Node? next)
+      : super(payload, next);
+
+  int get hash => StringCanonicalizer.hashBytes(data, start, end);
 }
 
 /// A hash table for triples:
@@ -41,7 +58,7 @@
     if (asciiOnly) {
       s = new String.fromCharCodes(data, start, end);
     } else {
-      s = new Utf8Decoder(allowMalformed: true).convert(data, start, end);
+      s = const Utf8Decoder(allowMalformed: true).convert(data, start, end);
     }
     return s;
   }
@@ -69,9 +86,7 @@
       Node? t = _nodes[i];
       while (t != null) {
         Node? n = t.next;
-        int newIndex = t.data is String
-            ? hashString(t.data, t.start, t.end) & (newSize - 1)
-            : hashBytes(t.data, t.start, t.end) & (newSize - 1);
+        int newIndex = t.hash & (newSize - 1);
         Node? s = newNodes[newIndex];
         t.next = s;
         newNodes[newIndex] = t;
@@ -83,38 +98,103 @@
   }
 
   String canonicalize(data, int start, int end, bool asciiOnly) {
+    if (data is String) {
+      // [end] is exclusive throughout this file (cf. substring(start, end)
+      // and the `i < end` comparison loops), so a whole-string request has
+      // end == data.length; `data.length - 1` would wrongly return the full
+      // string for a substring request ending one code unit early.
+      if (start == 0 && end == data.length) {
+        return canonicalizeString(data);
+      }
+      return canonicalizeSubString(data, start, end);
+    }
+    return canonicalizeBytes(data as List<int>, start, end, asciiOnly);
+  }
+
+  String canonicalizeBytes(List<int> data, int start, int end, bool asciiOnly) {
     if (_count > _size) rehash();
-    int index = data is String
-        ? hashString(data, start, end)
-        : hashBytes(data, start, end);
-    index = index & (_size - 1);
+    final int index = hashBytes(data, start, end) & (_size - 1);
     Node? s = _nodes[index];
     Node? t = s;
     int len = end - start;
     while (t != null) {
-      if (t.end - t.start == len) {
-        int i = start, j = t.start;
-        while (i < end && data[i] == t.data[j]) {
-          i++;
-          j++;
-        }
-        if (i == end) {
-          return t.payload;
+      if (t is Utf8Node) {
+        final List<int> tData = t.data;
+        if (t.end - t.start == len) {
+          int i = start, j = t.start;
+          while (i < end && data[i] == tData[j]) {
+            i++;
+            j++;
+          }
+          if (i == end) {
+            return t.payload;
+          }
         }
       }
       t = t.next;
     }
-    String payload;
-    if (data is String) {
-      payload = data.substring(start, end);
-    } else {
-      payload = decode(data, start, end, asciiOnly);
-    }
-    _nodes[index] = new Node(data, start, end, payload, s);
+    String payload = decode(data, start, end, asciiOnly);
+    _nodes[index] = new Utf8Node(data, start, end, payload, s);
     _count++;
     return payload;
   }
 
+  String canonicalizeSubString(String data, int start, int end) {
+    if (_count > _size) rehash();
+    final int index = hashString(data, start, end) & (_size - 1);
+    Node? s = _nodes[index];
+    Node? t = s;
+    int len = end - start;
+    while (t != null) {
+      if (t is StringNode) {
+        final String tData = t.payload;
+        if (identical(data, tData)) return tData;
+        if (tData.length == len) {
+          int i = start, j = 0;
+          while (i < end && data.codeUnitAt(i) == tData.codeUnitAt(j)) {
+            i++;
+            j++;
+          }
+          if (i == end) {
+            return tData;
+          }
+        }
+      }
+      t = t.next;
+    }
+    return insertStringNode(index, s, data.substring(start, end));
+  }
+
+  String canonicalizeString(String data) {
+    if (_count > _size) rehash();
+    final int index = hashString(data, 0, data.length) & (_size - 1);
+    Node? s = _nodes[index];
+    Node? t = s;
+    while (t != null) {
+      if (t is StringNode) {
+        final String tData = t.payload;
+        if (identical(data, tData)) return tData;
+        if (data == tData) return tData;
+      }
+      t = t.next;
+    }
+    return insertStringNode(index, s, data);
+  }
+
+  String insertStringNode(int index, Node? next, String value) {
+    final StringNode newNode = new StringNode(value, next);
+    _nodes[index] = newNode;
+    _count++;
+    return value;
+  }
+
+  String insertUtf8Node(int index, Node? next, List<int> buffer, int start,
+      int end, String value) {
+    final Utf8Node newNode = new Utf8Node(buffer, start, end, value, next);
+    _nodes[index] = newNode;
+    _count++;
+    return value;
+  }
+
   clear() {
     _size = INITIAL_SIZE;
     _nodes = new List<Node?>.filled(_size, /* fill = */ null);
diff --git a/pkg/_fe_analyzer_shared/lib/src/scanner/token_impl.dart b/pkg/_fe_analyzer_shared/lib/src/scanner/token_impl.dart
index 83f8484..4c869ba 100644
--- a/pkg/_fe_analyzer_shared/lib/src/scanner/token_impl.dart
+++ b/pkg/_fe_analyzer_shared/lib/src/scanner/token_impl.dart
@@ -41,8 +41,8 @@
    */
   StringTokenImpl.fromString(TokenType type, String value, int charOffset,
       {bool canonicalize: false, CommentToken? precedingComments})
-      : valueOrLazySubstring = canonicalizedString(
-            value, /* start = */ 0, value.length, canonicalize),
+      : valueOrLazySubstring =
+            canonicalize ? canonicalizedString(value) : value,
         super(type, charOffset, precedingComments);
 
   /**
@@ -56,7 +56,7 @@
     int length = end - start;
     if (length <= LAZY_THRESHOLD) {
       valueOrLazySubstring =
-          canonicalizedString(data, start, end, canonicalize);
+          canonicalizedSubString(data, start, end, canonicalize);
     } else {
       valueOrLazySubstring =
           new _LazySubstring(data, start, length, canonicalize);
@@ -89,7 +89,7 @@
       int start = valueOrLazySubstring.start;
       int end = start + (valueOrLazySubstring as _LazySubstring).length;
       if (data is String) {
-        valueOrLazySubstring = canonicalizedString(
+        valueOrLazySubstring = canonicalizedSubString(
             data, start, end, valueOrLazySubstring.boolValue);
       } else {
         valueOrLazySubstring =
@@ -107,14 +107,18 @@
 
   static final StringCanonicalizer canonicalizer = new StringCanonicalizer();
 
-  static String canonicalizedString(
+  static String canonicalizedString(String s) {
+    return canonicalizer.canonicalizeString(s);
+  }
+
+  static String canonicalizedSubString(
       String s, int start, int end, bool canonicalize) {
-    if (!canonicalize) return s;
-    return canonicalizer.canonicalize(s, start, end, /* asciiOnly = */ false);
+    if (!canonicalize) return s.substring(start, end);
+    return canonicalizer.canonicalizeSubString(s, start, end);
   }
 
   static String decodeUtf8(List<int> data, int start, int end, bool asciiOnly) {
-    return canonicalizer.canonicalize(data, start, end, asciiOnly);
+    return canonicalizer.canonicalizeBytes(data, start, end, asciiOnly);
   }
 
   @override
diff --git a/pkg/front_end/test/static_types/cfe_allowed.json b/pkg/front_end/test/static_types/cfe_allowed.json
index adc8898..8be10b2 100644
--- a/pkg/front_end/test/static_types/cfe_allowed.json
+++ b/pkg/front_end/test/static_types/cfe_allowed.json
@@ -20,8 +20,5 @@
   },
   "pkg/front_end/lib/src/fasta/dill/dill_member_builder.dart": {
     "Dynamic access of 'fileUri'.": 1
-  },
-  "pkg/_fe_analyzer_shared/lib/src/scanner/string_canonicalizer.dart": {
-    "Dynamic invocation of '[]'.": 2
   }
-}
+}
\ No newline at end of file