[CFE] Improve [StringCanonicalizer] implementation
The current [StringCanonicalizer] implementation has some issues:
* It hangs on to large [Uint8List]/[String] objects in its cache
=> This requires users (such as analyzer) to clear the cache
frequently
* Has an API that works on dynamic input (which is assumed to be String
or List<int> / Uint8List)
=> Call sites have typing information we lose when doing the call
* It uses dynamic [] calls to compare input bytes with cached bytes /
input strings with cached strings
=> Dynamic calls come with overhead
=> Will cause substring generation for every character comparison
=> Will compare bytes with strings (which doesn't make sense)
To address these issues we
* Use the canonicalized [String] to compare against instead of the
(much larger) source strings, thereby no longer hanging on to large
strings in the canonicalizer cache (it's still an issue with
[Uint8List]s though)
* Make separate API for canonicalization of strings, sub-strings or
sub-utf8-bytes and use it from the token implementation.
* For canonicalization of strings use String.== (instead of
char-by-char comparison)
* For canonicalization of sub-strings use String.codeUnitAt instead of
[] (which creates substrings)
* Separate out cache node entries into two classes and reduce memory
consumption of the nodes that represent strings by 16 bytes (it
does an additional `is` check on lookups in the cache, but that is
better than paying for dynamic calls on the payload - which
causes the compiler to do implicit checks)
=> This CL reduces RAM consumption and makes CFE scan/scan_bytes benchmarks a little faster.
TEST=ci
Change-Id: I157c298d26d25ac5da82c32eedfa270a590156f0
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/255121
Commit-Queue: Martin Kustermann <kustermann@google.com>
Reviewed-by: Jens Johansen <jensj@google.com>
diff --git a/pkg/_fe_analyzer_shared/lib/src/scanner/string_canonicalizer.dart b/pkg/_fe_analyzer_shared/lib/src/scanner/string_canonicalizer.dart
index 966e811..68506c0 100644
--- a/pkg/_fe_analyzer_shared/lib/src/scanner/string_canonicalizer.dart
+++ b/pkg/_fe_analyzer_shared/lib/src/scanner/string_canonicalizer.dart
@@ -6,13 +6,30 @@
import 'dart:convert';
-class Node {
- dynamic /* String | List<int> */ data;
- int start;
- int end;
- String payload;
+abstract class Node {
+ final String payload;
Node? next;
- Node(this.data, this.start, this.end, this.payload, this.next);
+
+ Node(this.payload, this.next);
+
+ int get hash;
+}
+
+class StringNode extends Node {
+ StringNode(super.payload, super.next);
+
+ int get hash => StringCanonicalizer.hashString(payload, 0, payload.length);
+}
+
+class Utf8Node extends Node {
+ final List<int> data;
+ final int start;
+ final int end;
+
+ Utf8Node(this.data, this.start, this.end, String payload, Node? next)
+ : super(payload, next);
+
+ int get hash => StringCanonicalizer.hashBytes(data, start, end);
}
/// A hash table for triples:
@@ -41,7 +58,7 @@
if (asciiOnly) {
s = new String.fromCharCodes(data, start, end);
} else {
- s = new Utf8Decoder(allowMalformed: true).convert(data, start, end);
+ s = const Utf8Decoder(allowMalformed: true).convert(data, start, end);
}
return s;
}
@@ -69,9 +86,7 @@
Node? t = _nodes[i];
while (t != null) {
Node? n = t.next;
- int newIndex = t.data is String
- ? hashString(t.data, t.start, t.end) & (newSize - 1)
- : hashBytes(t.data, t.start, t.end) & (newSize - 1);
+ int newIndex = t.hash & (newSize - 1);
Node? s = newNodes[newIndex];
t.next = s;
newNodes[newIndex] = t;
@@ -83,38 +98,100 @@
}
String canonicalize(data, int start, int end, bool asciiOnly) {
+ if (data is String) {
+    if (start == 0 && (end == data.length)) {
+ return canonicalizeString(data);
+ }
+ return canonicalizeSubString(data, start, end);
+ }
+ return canonicalizeBytes(data as List<int>, start, end, asciiOnly);
+ }
+
+ String canonicalizeBytes(List<int> data, int start, int end, bool asciiOnly) {
if (_count > _size) rehash();
- int index = data is String
- ? hashString(data, start, end)
- : hashBytes(data, start, end);
- index = index & (_size - 1);
+ final int index = hashBytes(data, start, end) & (_size - 1);
Node? s = _nodes[index];
Node? t = s;
int len = end - start;
while (t != null) {
- if (t.end - t.start == len) {
- int i = start, j = t.start;
- while (i < end && data[i] == t.data[j]) {
- i++;
- j++;
- }
- if (i == end) {
- return t.payload;
+ if (t is Utf8Node) {
+ final List<int> tData = t.data;
+ if (t.end - t.start == len) {
+ int i = start, j = t.start;
+ while (i < end && data[i] == tData[j]) {
+ i++;
+ j++;
+ }
+ if (i == end) {
+ return t.payload;
+ }
}
}
t = t.next;
}
- String payload;
- if (data is String) {
- payload = data.substring(start, end);
- } else {
- payload = decode(data, start, end, asciiOnly);
- }
- _nodes[index] = new Node(data, start, end, payload, s);
+ String payload = decode(data, start, end, asciiOnly);
+ _nodes[index] = new Utf8Node(data, start, end, payload, s);
_count++;
return payload;
}
+ String canonicalizeSubString(String data, int start, int end) {
+ if (_count > _size) rehash();
+ final int index = hashString(data, start, end) & (_size - 1);
+ Node? s = _nodes[index];
+ Node? t = s;
+ int len = end - start;
+ while (t != null) {
+ if (t is StringNode) {
+ final String tData = t.payload;
+ if (identical(data, tData)) return tData;
+ if (tData.length == len) {
+ int i = start, j = 0;
+ while (i < end && data.codeUnitAt(i) == tData.codeUnitAt(j)) {
+ i++;
+ j++;
+ }
+ if (i == end) {
+ return tData;
+ }
+ }
+ }
+ t = t.next;
+ }
+ return insertStringNode(index, s, data.substring(start, end));
+ }
+
+ String canonicalizeString(String data) {
+ if (_count > _size) rehash();
+ final int index = hashString(data, 0, data.length) & (_size - 1);
+ Node? s = _nodes[index];
+ Node? t = s;
+ while (t != null) {
+ if (t is StringNode) {
+ final String tData = t.payload;
+ if (identical(data, tData)) return tData;
+ if (data == tData) return tData;
+ }
+ t = t.next;
+ }
+ return insertStringNode(index, s, data);
+ }
+
+ String insertStringNode(int index, Node? next, String value) {
+ final StringNode newNode = new StringNode(value, next);
+ _nodes[index] = newNode;
+ _count++;
+ return value;
+ }
+
+ String insertUtf8Node(int index, Node? next, List<int> buffer, int start,
+ int end, String value) {
+ final Utf8Node newNode = new Utf8Node(buffer, start, end, value, next);
+ _nodes[index] = newNode;
+ _count++;
+ return value;
+ }
+
clear() {
_size = INITIAL_SIZE;
_nodes = new List<Node?>.filled(_size, /* fill = */ null);
diff --git a/pkg/_fe_analyzer_shared/lib/src/scanner/token_impl.dart b/pkg/_fe_analyzer_shared/lib/src/scanner/token_impl.dart
index 83f8484..4c869ba 100644
--- a/pkg/_fe_analyzer_shared/lib/src/scanner/token_impl.dart
+++ b/pkg/_fe_analyzer_shared/lib/src/scanner/token_impl.dart
@@ -41,8 +41,8 @@
*/
StringTokenImpl.fromString(TokenType type, String value, int charOffset,
{bool canonicalize: false, CommentToken? precedingComments})
- : valueOrLazySubstring = canonicalizedString(
- value, /* start = */ 0, value.length, canonicalize),
+ : valueOrLazySubstring =
+ canonicalize ? canonicalizedString(value) : value,
super(type, charOffset, precedingComments);
/**
@@ -56,7 +56,7 @@
int length = end - start;
if (length <= LAZY_THRESHOLD) {
valueOrLazySubstring =
- canonicalizedString(data, start, end, canonicalize);
+ canonicalizedSubString(data, start, end, canonicalize);
} else {
valueOrLazySubstring =
new _LazySubstring(data, start, length, canonicalize);
@@ -89,7 +89,7 @@
int start = valueOrLazySubstring.start;
int end = start + (valueOrLazySubstring as _LazySubstring).length;
if (data is String) {
- valueOrLazySubstring = canonicalizedString(
+ valueOrLazySubstring = canonicalizedSubString(
data, start, end, valueOrLazySubstring.boolValue);
} else {
valueOrLazySubstring =
@@ -107,14 +107,18 @@
static final StringCanonicalizer canonicalizer = new StringCanonicalizer();
- static String canonicalizedString(
+ static String canonicalizedString(String s) {
+ return canonicalizer.canonicalizeString(s);
+ }
+
+ static String canonicalizedSubString(
String s, int start, int end, bool canonicalize) {
- if (!canonicalize) return s;
- return canonicalizer.canonicalize(s, start, end, /* asciiOnly = */ false);
+ if (!canonicalize) return s.substring(start, end);
+ return canonicalizer.canonicalizeSubString(s, start, end);
}
static String decodeUtf8(List<int> data, int start, int end, bool asciiOnly) {
- return canonicalizer.canonicalize(data, start, end, asciiOnly);
+ return canonicalizer.canonicalizeBytes(data, start, end, asciiOnly);
}
@override
diff --git a/pkg/front_end/test/static_types/cfe_allowed.json b/pkg/front_end/test/static_types/cfe_allowed.json
index adc8898..8be10b2 100644
--- a/pkg/front_end/test/static_types/cfe_allowed.json
+++ b/pkg/front_end/test/static_types/cfe_allowed.json
@@ -20,8 +20,5 @@
},
"pkg/front_end/lib/src/fasta/dill/dill_member_builder.dart": {
"Dynamic access of 'fileUri'.": 1
- },
- "pkg/_fe_analyzer_shared/lib/src/scanner/string_canonicalizer.dart": {
- "Dynamic invocation of '[]'.": 2
}
-}
+}
\ No newline at end of file