Do UTF8 encoding slightly differently when serializing kernel
Instead of creating two small-ish utf8lists for every string we encode,
do fewer bigger allocations and convert to utf8 via a (new) method that
will hopefully eventually be moved into String.
Golem says:
ia32
AstToBinaryP90 (Intel Core i5) 3.995% (0.2 noise)
AstToBinary (Intel Core i5) 3.992% (0.2 noise)
AstToBinaryP50 (Intel Core i5) 4.015% (0.2 noise)
AstToBinary (Intel Xeon) 3.793% (0.3 noise)
AstToBinaryP90 (Intel Xeon) 3.748% (0.3 noise)
AstToBinaryP50 (Intel Xeon) 3.769% (0.3 noise)
x64
AstToBinaryP90 (Intel Core i5) 4.220% (0.1 noise)
AstToBinaryP50 (Intel Xeon) 4.543% (0.3 noise)
AstToBinary (Intel Xeon) 4.614% (0.3 noise)
AstToBinaryP90 (Intel Xeon) 4.891% (0.3 noise)
AstToBinaryP50 (Intel Core i5) 5.448% (0.5 noise)
AstToBinary (Intel Core i5) 5.155% (0.5 noise)
Change-Id: I0532778d69205a7a187342d3bdfe503c660b1004
Reviewed-on: https://dart-review.googlesource.com/c/84413
Commit-Queue: Jens Johansen <jensj@google.com>
Reviewed-by: Peter von der Ahé <ahe@google.com>
diff --git a/pkg/kernel/lib/binary/ast_to_binary.dart b/pkg/kernel/lib/binary/ast_to_binary.dart
index 9143687..ad4b090 100644
--- a/pkg/kernel/lib/binary/ast_to_binary.dart
+++ b/pkg/kernel/lib/binary/ast_to_binary.dart
@@ -4,10 +4,10 @@
library kernel.ast_to_binary;
import 'dart:core' hide MapEntry;
+import 'dart:convert' show utf8;
import '../ast.dart';
import 'tag.dart';
-import 'dart:convert';
import 'dart:io' show BytesBuilder;
import 'dart:typed_data';
@@ -107,24 +107,58 @@
void writeStringTable(StringIndexer indexer) {
_binaryOffsetForStringTable = getBufferOffset();
+ // Containers for the utf8 encoded strings.
+ final List<Uint8List> data = new List<Uint8List>();
+ int totalLength = 0;
+ const int minLength = 1 << 16;
+ Uint8List buffer;
+ int index = 0;
+
// Write the end offsets.
writeUInt30(indexer.index.length);
- int endOffset = 0;
- List<List<int>> data =
- new List<List<int>>.filled(indexer.index.length, null);
- int i = 0;
- Utf8Encoder utf8Encoder = const Utf8Encoder();
for (String key in indexer.index.keys) {
- List<int> utf8Bytes = utf8Encoder.convert(key);
- data[i] = utf8Bytes;
- endOffset += utf8Bytes.length;
- writeUInt30(endOffset);
- i++;
+ if (key.isNotEmpty) {
+ int requiredMinLength = key.length;
+ int allocateMinLength = requiredMinLength * 3;
+ int newIndex;
+ while (true) {
+ if (buffer == null || index + requiredMinLength >= buffer.length) {
+ int newLength = minLength;
+ if (allocateMinLength > newLength) newLength = allocateMinLength;
+ if (buffer != null && index > 0) {
+ data.add(new Uint8List.view(buffer.buffer, 0, index));
+ }
+ index = 0;
+ buffer = new Uint8List(newLength);
+ }
+ newIndex = NotQuiteString.writeUtf8(buffer, index, key);
+ if (newIndex != -1) break;
+ requiredMinLength = allocateMinLength;
+ }
+ if (newIndex < 0) {
+ // Utf8 encoding failed.
+ if (buffer != null && index > 0) {
+ data.add(new Uint8List.view(buffer.buffer, 0, index));
+ buffer = null;
+ index = 0;
+ }
+ List<int> converted = utf8.encoder.convert(key);
+ data.add(converted);
+ totalLength += converted.length;
+ } else {
+ totalLength += newIndex - index;
+ index = newIndex;
+ }
+ }
+ writeUInt30(totalLength);
+ }
+ if (buffer != null && index > 0) {
+ data.add(Uint8List.view(buffer.buffer, 0, index));
}
// Write the UTF-8 encoded strings.
- for (var entry in data) {
- writeBytes(entry);
+ for (int i = 0; i < data.length; ++i) {
+ writeBytes(data[i]);
}
}
@@ -499,7 +533,7 @@
// Write data.
int i = 0;
- Utf8Encoder utf8Encoder = const Utf8Encoder();
+ Uint8List buffer = new Uint8List(1 << 16);
for (Uri uri in _sourceUriIndexer.index.keys) {
index[i] = getBufferOffset();
Source source = ((includeSources &&
@@ -509,7 +543,23 @@
: null) ??
new Source(<int>[], const <int>[]);
- writeByteList(utf8Encoder.convert(uri == null ? "" : "$uri"));
+ String uriAsString = uri == null ? "" : "$uri";
+ if (uriAsString.length * 3 < buffer.length) {
+ int length = NotQuiteString.writeUtf8(buffer, 0, uriAsString);
+ if (length < 0) {
+ // Utf8 encoding failed.
+ writeByteList(utf8.encoder.convert(uriAsString));
+ } else {
+ writeUInt30(length);
+ for (int j = 0; j < length; j++) {
+ writeByte(buffer[j]);
+ }
+ }
+ } else {
+ // Uncommon case with very long url.
+ writeByteList(utf8.encoder.convert(uriAsString));
+ }
+
writeByteList(source.source);
List<int> lineStarts = source.lineStarts;
writeUInt30(lineStarts.length);
@@ -2200,3 +2250,64 @@
// Nothing to do.
}
}
+
+class NotQuiteString {
+ /**
+ * Write [source] string into [target] starting at index [index].
+ *
+ * Optionally only write part of the input [source] starting at [start] and
+ * ending at [end].
+ *
+ * The output space needed is at most [source.length] * 3.
+ *
+ * Returns
+ * * Non-negative on success (the new index in [target]).
+ * * -1 when [target] doesn't have enough space. Note that [target] can be
+ * poluted starting at [index].
+ * * -2 on input error, i.e. an unpaired lead or tail surrogate.
+ */
+ static int writeUtf8(List<int> target, int index, String source,
+ [int start = 0, int end]) {
+ RangeError.checkValidIndex(index, target, null, target.length);
+ end = RangeError.checkValidRange(start, end, source.length);
+ if (start == end) return index;
+ var i = start;
+ var length = target.length;
+ do {
+ int codeUnit = source.codeUnitAt(i++);
+ while (codeUnit < 128) {
+ if (index >= length) return -1;
+ target[index++] = codeUnit;
+ if (i >= end) return index;
+ codeUnit = source.codeUnitAt(i++);
+ }
+ if (codeUnit < 0x800) {
+ index += 2;
+ if (index > length) return -1;
+ target[index - 2] = 0xC0 | (codeUnit >> 6);
+ target[index - 1] = 0x80 | (codeUnit & 0x3f);
+ } else if (codeUnit & 0xF800 != 0xD800) {
+ // Not a surrogate.
+ index += 3;
+ if (index > length) return -1;
+ target[index - 3] = 0xE0 | (codeUnit >> 12);
+ target[index - 2] = 0x80 | ((codeUnit >> 6) & 0x3f);
+ target[index - 1] = 0x80 | (codeUnit & 0x3f);
+ } else {
+ if (codeUnit >= 0xDC00) return -2; // Unpaired tail surrogate.
+ if (i >= end) return -2; // Unpaired lead surrogate.
+ int nextChar = source.codeUnitAt(i++);
+ if (nextChar & 0xFC00 != 0xDC00) return -2; // Unpaired lead surrogate.
+ index += 4;
+ if (index > length) return -1;
+ codeUnit = (codeUnit & 0x3FF) + 0x40;
+ target[index - 4] = 0xF0 | (codeUnit >> 8);
+ target[index - 3] = 0x80 | ((codeUnit >> 2) & 0x3F);
+ target[index - 2] =
+ 0x80 | (((codeUnit & 3) << 4) | ((nextChar & 0x3FF) >> 6));
+ target[index - 1] = 0x80 | (nextChar & 0x3f);
+ }
+ } while (i < end);
+ return index;
+ }
+}