blob: f348276cf99d23eae77691cfe23789e8b788bcd0 [file] [log] [blame]
// Copyright (c) 2018, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
import 'dart:io';
import 'dart:typed_data';
import '../src/args.dart';
import '../src/automaton_builder.dart';
import '../src/data_files.dart';
import '../src/grapheme_category_loader.dart';
import '../src/indirect_table.dart';
import '../src/shared.dart';
import '../src/string_literal_writer.dart';
import '../src/table_builder.dart';
// Generates tables used by the grapheme cluster breaking algorithm
// and a state machine used to implement the algorithm.
// The task of this tool is to take the complete table of
// grapheme cluster categories for every code point (U+0000 ... U+10FFFF)
// and build a smaller table with the same information.
//
// The approach taken is to split the large table into chunks,
// smaller chunks for data in the BMP (U+0000 ... U+FFFF)
// which have higher variation than later data,
// and larger chunks for non-BMP ("astral" planes) code points.
// This also corresponds to the split between one-UTF-16 code unit
// character and surrogate pairs, which gives us a natural
// branch in the string parsing code.
//
// Then a single table is built containing all these chunks, but where
// the chunks are allowed to overlap.
// An indirection table points to the start of each chunk
// in the larger table.
//
// Having many small chunks increases the size of the indirection table,
// and large chunks reduces the chance of chunks being completely
// equivalent.
//
// The state machines are based on the extended Grapheme Cluster breaking
// algorithm. The forward-scanning state machine is entirely regular.
// The backwards-scanning state machine needs to call out to do look-ahead
// in some cases (for example, how to combine a regional identifier
// depends on whether there is an odd or even number of
// previous regional identifiers.)
/// Print more information to stderr while generating.
///
/// Set while developing if you don't want to pass `-v` on the command line
/// every time.
/// Only affects this file when it's run directly.
const defaultVerbose = false;
/// Default location for table file.
const tableFile = 'lib/src/grapheme_clusters/table.dart';
// Best values found for current tables.
// Update if better value found when updating data files.
// (May consider benchmark performance as well as size.)
// TODO: Write out best sizes to a file after an update, and read them back
// next time, instead of hardcoding in the source file.
// Chunk sizes must be powers of 2.
const int defaultLowChunkSize = 32;
// Currently found best size.
const int defaultHighChunkSize = 256;
void main(List<String> args) async {
var flags = parseArgs(args, 'generate_tables', allowOptimize: true);
var output = flags.dryrun
? null
: flags.targetFile ?? File(path(packageRoot, tableFile));
if (output != null && !output.existsSync()) {
try {
output.createSync(recursive: true);
} catch (e) {
stderr.writeln('Cannot find or create file: ${output.path}');
stderr.writeln('Writing to stdout');
output = null;
}
}
var categories = await loadCategories(
update: flags.update,
verbose: flags.verbose,
);
generateTables(
output,
categories,
dryrun: flags.dryrun,
verbose: flags.verbose,
optimize: flags.optimize,
);
}
void generateTables(
File? output,
Uint8List table, {
bool dryrun = false,
bool optimize = false,
bool verbose = defaultVerbose,
bool acceptLicenseChange = false,
}) async {
var lowChunkSize = defaultLowChunkSize;
var highChunkSize = defaultHighChunkSize;
int optimizeTable(
IndirectTable chunkTable,
int lowChunkSize,
int highChunkSize,
) {
var index = 0;
do {
chunkTable.entries.add(TableEntry(0, index, lowChunkSize));
index += lowChunkSize;
} while (index < 0x10000);
var lowChunkCount = chunkTable.entries.length;
do {
chunkTable.entries.add(TableEntry(0, index, highChunkSize));
index += highChunkSize;
} while (index < 0x110000);
var highChunkCount = chunkTable.entries.length - lowChunkCount;
assert(
lowChunkCount * lowChunkSize + highChunkCount * highChunkSize == 0x110000,
);
assert(chunkTable.chunks.length == 1);
assert(
_validate(table, chunkTable, lowChunkSize, highChunkSize, verbose: false),
);
chunkifyTable(chunkTable);
assert(chunkTable.entries.length == lowChunkCount + highChunkCount);
assert(
_validate(table, chunkTable, lowChunkSize, highChunkSize, verbose: false),
);
combineChunkedTable(chunkTable);
assert(chunkTable.entries.length == lowChunkCount + highChunkCount);
assert(chunkTable.chunks.length == 1);
assert(
_validate(table, chunkTable, lowChunkSize, highChunkSize, verbose: false),
);
var size = chunkTable.chunks[0].length + chunkTable.entries.length * 2;
return size;
}
var chunkTable = IndirectTable([table.sublist(0, table.length)], []);
var size = optimizeTable(chunkTable, lowChunkSize, highChunkSize);
if (verbose) {
stderr.writeln('Default chunk size: $lowChunkSize/$highChunkSize: $size');
}
if (optimize) {
// Chunk sizes must be powers of 2.
// Smaller chunk sizes gives more smaller chunks,
// with more chance of overlap,
// but each chunks adds an entry to the index table.
for (var low in [64, 128, 32, 256]) {
for (var high in [512, 1024, 256, 2048]) {
if (low == lowChunkSize && high == highChunkSize) continue;
var newChunk = IndirectTable([table.sublist(0, table.length)], []);
var newSize = optimizeTable(newChunk, low, high);
if (verbose) {
var delta = newSize - size;
stderr.writeln(
"${size < newSize ? "Worse" : "Better"}"
' chunk size: $low/$high: $newSize '
"(${delta > 0 ? "+$delta" : delta})",
);
}
if (newSize < size) {
lowChunkSize = low;
highChunkSize = high;
chunkTable = newChunk;
size = newSize;
}
}
}
if (verbose) {
stderr.writeln('Best low chunk size: $lowChunkSize');
stderr.writeln('Best high chunk size: $highChunkSize');
stderr.writeln('Best table size: $size');
}
}
var buffer = StringBuffer();
writeHeader(buffer, [
graphemeTestData,
emojiTestData,
graphemeBreakPropertyData,
]);
buffer.writeln();
writeTables(
buffer,
chunkTable,
lowChunkSize,
highChunkSize,
verbose: verbose,
);
writeForwardAutomaton(buffer, verbose: verbose);
buffer.writeln();
writeBackwardAutomaton(buffer, verbose: verbose);
if (output == null) {
stdout.write(buffer);
} else if (!dryrun) {
output.writeAsStringSync(buffer.toString());
}
}
// -----------------------------------------------------------------------------
// Combined table writing.
void writeTables(
StringSink out,
IndirectTable table,
int lowChunkSize,
int highChunkSize, {
required bool verbose,
}) {
assert(table.chunks.length == 1);
_writeStringLiteral(out, '_data', table.chunks[0], verbose: verbose);
_writeStringLiteral(
out,
'_start',
table.entries.map((e) => e.start).toList(),
verbose: verbose,
);
_writeLookupFunction(out, '_data', '_start', lowChunkSize);
out.writeln();
_writeSurrogateLookupFunction(
out,
'_data',
'_start',
65536 ~/ lowChunkSize,
highChunkSize,
);
out.writeln();
}
void _writeStringLiteral(
StringSink out,
String name,
List<int> data, {
required bool verbose,
}) {
var prefix = 'const String $name = ';
out.write(prefix);
var writer = StringLiteralWriter(out, padding: 4, escape: _needsEscape);
writer.start(prefix.length);
var bytes = 0;
for (var i = 0; i < data.length; i++) {
var char = data[i];
writer.add(char);
bytes += char <= 0xFF ? 1 : 2;
}
writer.end();
out.write(';\n');
if (verbose) {
stderr.writeln('Writing $bytes bytes');
}
}
bool _needsEscape(int codeUnit) =>
codeUnit > 0xff || codeUnit == 0x7f || codeUnit & 0x60 == 0;
void _writeLookupFunction(
StringSink out,
String dataName,
String startName,
int chunkSize,
) {
out.write(_lookupMethod('low', dataName, startName, chunkSize));
}
void _writeSurrogateLookupFunction(
StringSink out,
String dataName,
String startName,
int startOffset,
int chunkSize,
) {
out.write(
_lookupSurrogatesMethod(
'high',
dataName,
startName,
startOffset,
chunkSize,
),
);
}
String _lookupMethod(
String name,
String dataName,
String startName,
int chunkSize,
) =>
'''
$preferInline
int $name(int codeUnit) {
var chunkStart = $startName.codeUnitAt(codeUnit >> ${chunkSize.bitLength - 1});
var index = chunkStart + (codeUnit & ${chunkSize - 1});
return $dataName.codeUnitAt(index);
}
''';
String _lookupSurrogatesMethod(
String name,
String dataName,
String startName,
int startOffset,
int chunkSize,
) {
if (chunkSize == 1024) {
return '''
$preferInline
int $name(int lead, int tail) {
var chunkStart = $startName.codeUnitAt($startOffset + (0x3ff & lead));
var index = chunkStart + (0x3ff & tail);
return $dataName.codeUnitAt(index);
}
''';
}
var shift = chunkSize.bitLength - 1;
var indexVar = chunkSize < 1024 ? 'tail' : 'offset';
return '''
$preferInline
int $name(int lead, int tail) {
var offset = (((0x3ff & lead) << 10) + (0x3ff & tail)) + ($startOffset << $shift);
var chunkStart = $startName.codeUnitAt(offset >> $shift);
var index = chunkStart + ($indexVar & ${chunkSize - 1});
return $dataName.codeUnitAt(index);
}
''';
}
// -----------------------------------------------------------------------------
bool _validate(
Uint8List table,
IndirectTable indirectTable,
int lowChunkSize,
int highChunkSize, {
required bool verbose,
}) {
var lowChunkCount = 65536 ~/ lowChunkSize;
var lowChunkShift = lowChunkSize.bitLength - 1;
var lowChunkMask = lowChunkSize - 1;
for (var i = 0; i < 65536; i++) {
var value = table[i];
var entryIndex = i >> lowChunkShift;
var entry = indirectTable.entries[entryIndex];
var indirectValue = indirectTable.chunks[entry.chunkNumber]
[entry.start + (i & lowChunkMask)];
if (value != indirectValue) {
stderr.writeln('$entryIndex: $entry');
stderr.writeln(
'Error: ${i.toRadixString(16)} -> Expected $value,'
' was $indirectValue',
);
printIndirectTable(indirectTable);
return false;
}
}
var highChunkShift = highChunkSize.bitLength - 1;
var highChunkMask = highChunkSize - 1;
for (var i = 0x10000; i < 0x110000; i++) {
var j = i - 0x10000;
var value = table[i];
var entryIndex = lowChunkCount + (j >> highChunkShift);
var entry = indirectTable.entries[entryIndex];
var indirectValue = indirectTable.chunks[entry.chunkNumber]
[entry.start + (j & highChunkMask)];
if (value != indirectValue) {
stderr.writeln('$entryIndex: $entry');
stderr.writeln(
'Error: ${i.toRadixString(16)} -> Expected $value,'
' was $indirectValue',
);
printIndirectTable(indirectTable);
return false;
}
}
if (verbose) {
stderr.writeln('Table validation success');
}
return true;
}
void printIndirectTable(IndirectTable table) {
stderr.writeln(
"IT(chunks: ${table.chunks.map((x) => "#${x.length}")},"
' entries: ${table.entries}',
);
}