| // Copyright (c) 2025, the Dart project authors. Please see the AUTHORS file |
| // for details. All rights reserved. Use of this source code is governed by a |
| // BSD-style license that can be found in the LICENSE file. |
| |
| import 'dart:typed_data'; |
| |
| import 'package:_fe_analyzer_shared/src/scanner/string_canonicalizer.dart'; |
| import 'package:analyzer/src/binary/binary_writer.dart'; |
| |
/// A read-only table of strings that are decoded lazily from a byte buffer.
///
/// Each string is decoded the first time it is requested through
/// [operator []] and then cached, so repeated lookups of the same index return
/// the cached instance.
class StringTable {
  /// The buffer holding both the encoded strings and the table header.
  final Uint8List _bytes;

  /// The position in [_bytes] of the next byte returned by [_readByte].
  int _byteOffset;

  /// The start offset in [_bytes] of every string, followed by one trailing
  /// element that holds the end offset of the last string.
  late final Uint32List _offsets;

  /// The decoded strings, `null` for entries that were not requested yet.
  late final List<String?> _strings;

  /// Reads the table header located at [startOffset] in [bytes].
  ///
  /// The structure of the table:
  /// - `<bytes with encoded strings>`
  /// - `<the length of the bytes>  <-- [startOffset]`
  /// - `<the number of strings>`
  /// - `<the array of lengths of individual strings>`
  ///
  /// Only the header is read here; the strings themselves are decoded on
  /// demand.
  StringTable({required Uint8List bytes, required int startOffset})
    : _bytes = bytes,
      _byteOffset = startOffset {
    var lengthOfBytes = _readUint30();
    var count = _readUint30();

    // The encoded strings end where the header begins, so the first string
    // starts `lengthOfBytes` before `startOffset`.
    var position = startOffset - lengthOfBytes;
    var offsets = Uint32List(count + 1);
    for (var i = 0; i < count; i++) {
      offsets[i] = position;
      position += _readUint30();
    }
    // Sentinel: lets `operator []` compute the length of the last string.
    offsets[count] = position;

    _offsets = offsets;
    _strings = List<String?>.filled(count, null);
  }

  /// Returns the string stored at [index], decoding it on first access.
  String operator [](int index) {
    var cached = _strings[index];
    if (cached != null) {
      return cached;
    }

    var start = _offsets[index];
    var byteLength = _offsets[index + 1] - start;
    var decoded = considerCanonicalizeString(
      _readStringEntry(start, byteLength),
    );
    _strings[index] = decoded;
    return decoded;
  }

  /// Returns the byte at [_byteOffset] and advances past it.
  int _readByte() => _bytes[_byteOffset++];

  /// Decodes the [numBytes] bytes of [_bytes] that begin at [start].
  ///
  /// Pure ASCII content is converted directly; as soon as any byte above 127
  /// is present the whole range is handed to [_decodeWtf8].
  String _readStringEntry(int start, int numBytes) {
    var end = start + numBytes;

    var asciiOnly = true;
    for (var i = start; asciiOnly && i < end; i++) {
      asciiOnly = _bytes[i] <= 127;
    }

    return asciiOnly
        ? String.fromCharCodes(_bytes, start, end)
        : _decodeWtf8(_bytes, start, end);
  }

  /// Reads a big-endian variable-length unsigned integer at [_byteOffset].
  ///
  /// The two top bits of the first byte select the width:
  /// - `0xxxxxxx`: one byte, 7-bit value;
  /// - `10xxxxxx`: two bytes, 14-bit value;
  /// - `11xxxxxx`: four bytes, 30-bit value.
  int _readUint30() {
    var first = _readByte();
    if ((first & 0x80) == 0) {
      // 0xxxxxxx
      return first;
    }

    var highBits = first & 0x3F;
    if ((first & 0x40) == 0) {
      // 10xxxxxx
      return (highBits << 8) | _readByte();
    }

    // 11xxxxxx
    var second = _readByte();
    var third = _readByte();
    var fourth = _readByte();
    return (highBits << 24) | (second << 16) | (third << 8) | fourth;
  }

  /// Decodes the WTF-8 data in [bytes] from [start] (inclusive) to [end]
  /// (exclusive) into a string.
  ///
  /// This decoder trusts its input: the result is only meaningful when the
  /// range holds valid and complete WTF-8. Rather than masking the control
  /// bits off every byte, the bytes of a sequence are xor'ed together at
  /// their bit positions and all control bits are xor'ed out in one step.
  static String _decodeWtf8(Uint8List bytes, int start, int end) {
    // No sequence produces more UTF-16 code units than it has bytes, so the
    // byte count is a sufficient capacity.
    var codeUnits = Uint16List(end - start);
    var written = 0;
    var read = start;

    while (read < end) {
      var lead = bytes[read++];

      if (lead < 0x80) {
        // ASCII.
        codeUnits[written++] = lead;
        continue;
      }

      if (lead < 0xE0) {
        // Two-byte sequence (11-bit unicode value).
        var value = (lead << 6) ^ bytes[read++] ^ 0x3080;
        assert(value >= 0x80 && value < 0x800);
        codeUnits[written++] = value;
        continue;
      }

      if (lead < 0xF0) {
        // Three-byte sequence (16-bit unicode value).
        var value =
            (lead << 12) ^ (bytes[read++] << 6) ^ bytes[read++] ^ 0xE2080;
        assert(value >= 0x800 && value < 0x10000);
        codeUnits[written++] = value;
        continue;
      }

      // Four-byte sequence (non-BMP unicode value), stored as a surrogate
      // pair.
      var value =
          (lead << 18) ^
          (bytes[read++] << 12) ^
          (bytes[read++] << 6) ^
          bytes[read++] ^
          0x3C82080;
      assert(value >= 0x10000 && value < 0x110000);
      // 0xD7C0 is 0xD800 - (0x10000 >> 10): the bias folds the subtraction
      // of 0x10000 into the high surrogate base.
      codeUnits[written++] = 0xD7C0 + (value >> 10);
      codeUnits[written++] = 0xDC00 + (value & 0x3FF);
    }

    assert(read == end);
    return String.fromCharCodes(codeUnits, 0, written);
  }
}
| |
/// Assigns indexes to distinct strings and writes them out as a table.
///
/// The written layout is the encoded strings, then the total length of those
/// bytes, then the list of per-string byte lengths.
class StringTableBuilder {
  /// The index assigned to each string, in order of first request.
  final Map<String, int> _index = {};

  /// Returns the index of [string], assigning the next free index when the
  /// string has not been seen before.
  int operator [](String string) => _index[string] ??= _index.length;

  /// Writes all collected strings and the table header into [writer].
  ///
  /// Returns the offset of the header, which is the position directly after
  /// the encoded string bytes.
  int write(BinaryWriter writer) {
    var bytesStart = writer.offset;

    // Keys are visited in index order, so `lengths[i]` describes the string
    // with index `i`.
    var lengths = Uint32List(_index.length);
    var next = 0;
    for (var string in _index.keys) {
      var before = writer.offset;
      _writeWtf8(writer, string);
      lengths[next++] = writer.offset - before;
    }

    var headerOffset = writer.offset;
    writer.writeUint30(headerOffset - bytesStart);
    writer.writeUint30List(lengths);
    return headerOffset;
  }

  /// Writes the WTF-8 encoding of [source] into [writer].
  ///
  /// Nothing is written for an empty string. A well-formed surrogate pair
  /// becomes one four-byte sequence; a surrogate without its partner is
  /// written as a three-byte sequence.
  static void _writeWtf8(BinaryWriter writer, String source) {
    var length = source.length;
    var i = 0;

    while (i < length) {
      var unit = source.codeUnitAt(i++);

      if (unit < 128) {
        // ASCII.
        writer.writeByte(unit);
        continue;
      }

      if (unit < 0x800) {
        // Two-byte sequence (11-bit unicode value).
        writer.writeByte(0xC0 | (unit >> 6));
        writer.writeByte(0x80 | (unit & 0x3f));
        continue;
      }

      if ((unit & 0xFC00) == 0xD800 && i < length) {
        var low = source.codeUnitAt(i);
        if ((low & 0xFC00) == 0xDC00) {
          // Surrogate pair -> four-byte sequence (non-BMP unicode value).
          i++;
          var unicode = 0x10000 + ((unit & 0x3FF) << 10) + (low & 0x3FF);
          writer.writeByte(0xF0 | (unicode >> 18));
          writer.writeByte(0x80 | ((unicode >> 12) & 0x3F));
          writer.writeByte(0x80 | ((unicode >> 6) & 0x3F));
          writer.writeByte(0x80 | (unicode & 0x3F));
          continue;
        }
      }

      // Three-byte sequence (16-bit unicode value), including lone
      // surrogates.
      writer.writeByte(0xE0 | (unit >> 12));
      writer.writeByte(0x80 | ((unit >> 6) & 0x3f));
      writer.writeByte(0x80 | (unit & 0x3f));
    }
  }
}