pkgs/characters/tool/bin/generate_tables.dart - core.git - Git at Google

 // Copyright (c) 2018, the Dart project authors.  Please see the AUTHORS file
 // for details. All rights reserved. Use of this source code is governed by a
 // BSD-style license that can be found in the LICENSE file.

 import 'dart:io';
 import 'dart:typed_data';

 import '../src/args.dart';
 import '../src/automaton_builder.dart';
 import '../src/data_files.dart';
 import '../src/grapheme_category_loader.dart';
 import '../src/indirect_table.dart';
 import '../src/shared.dart';
 import '../src/string_literal_writer.dart';
 import '../src/table_builder.dart';

 // Generates tables used by the grapheme cluster breaking algorithm
 // and a state machine used to implement the algorithm.

 // The task of this tool is to take the complete table of
 // grapheme cluster categories for every code point (U+0000 ... U+10FFFF)
 // and build a smaller table with the same information.
 //
 // The approach taken is to split the large table into chunks,
 // smaller chunks for data in the BMP (U+0000 ... U+FFFF)
 // which have higher variation than later data,
 // and larger chunks for non-BMP ("astral" planes) code points.
 // This also corresponds to the split between one-UTF-16 code unit
 // character and surrogate pairs, which gives us a natural
 // branch in the string parsing code.
 //
 // Then a single table is built containing all these chunks, but where
 // the chunks are allowed to overlap.
 // An indirection table points to the start of each chunk
 // in the larger table.
 //
 // Having many small chunks increases the size of the indirection table,
 // and large chunks reduces the chance of chunks being completely
 // equivalent.
 //
 // The state machines are based on the extended Grapheme Cluster breaking
 // algorithm. The forward-scanning state machine is entirely regular.
 // The backwards-scanning state machine needs to call out to do look-ahead
 // in some cases (for example, how to combine a regional identifier
 // depends on whether there is an odd or even number of
 // previous regional identifiers.)

 /// Print more information to stderr while generating.
 ///
 /// Set while developing if you don't want to pass `-v` on the command line
 /// every time.
 /// Only affects this file when it's run directly.
 const defaultVerbose = false;

 /// Default location for table file.
 const tableFile = 'lib/src/grapheme_clusters/table.dart';

 // Best values found for current tables.
 // Update if better value found when updating data files.
 // (May consider benchmark performance as well as size.)

 // TODO: Write out best sizes to a file after an update, and read them back
 // next time, instead of hardcoding in the source file.

 // Chunk sizes must be powers of 2.
 const int defaultLowChunkSize = 32;

 // Currently found best size.
 const int defaultHighChunkSize = 256;

 void main(List<String> args) async {
   var flags = parseArgs(args, 'generate_tables', allowOptimize: true);
   var output = flags.dryrun
       ? null
       : flags.targetFile ?? File(path(packageRoot, tableFile));

   if (output != null && !output.existsSync()) {
     try {
       output.createSync(recursive: true);
     } catch (e) {
       stderr.writeln('Cannot find or create file: ${output.path}');
       stderr.writeln('Writing to stdout');
       output = null;
     }
   }

   var categories = await loadCategories(
     update: flags.update,
     verbose: flags.verbose,
   );

   generateTables(
     output,
     categories,
     dryrun: flags.dryrun,
     verbose: flags.verbose,
     optimize: flags.optimize,
   );
 }

 void generateTables(
   File? output,
   Uint8List table, {
   bool dryrun = false,
   bool optimize = false,
   bool verbose = defaultVerbose,
   bool acceptLicenseChange = false,
 }) async {
   var lowChunkSize = defaultLowChunkSize;
   var highChunkSize = defaultHighChunkSize;

   int optimizeTable(
     IndirectTable chunkTable,
     int lowChunkSize,
     int highChunkSize,
   ) {
     var index = 0;
     do {
       chunkTable.entries.add(TableEntry(0, index, lowChunkSize));
       index += lowChunkSize;
     } while (index < 0x10000);
     var lowChunkCount = chunkTable.entries.length;
     do {
       chunkTable.entries.add(TableEntry(0, index, highChunkSize));
       index += highChunkSize;
     } while (index < 0x110000);
     var highChunkCount = chunkTable.entries.length - lowChunkCount;
     assert(
       lowChunkCount * lowChunkSize + highChunkCount * highChunkSize == 0x110000,
     );
     assert(chunkTable.chunks.length == 1);
     assert(
       _validate(table, chunkTable, lowChunkSize, highChunkSize, verbose: false),
     );

     chunkifyTable(chunkTable);
     assert(chunkTable.entries.length == lowChunkCount + highChunkCount);
     assert(
       _validate(table, chunkTable, lowChunkSize, highChunkSize, verbose: false),
     );

     combineChunkedTable(chunkTable);
     assert(chunkTable.entries.length == lowChunkCount + highChunkCount);
     assert(chunkTable.chunks.length == 1);
     assert(
       _validate(table, chunkTable, lowChunkSize, highChunkSize, verbose: false),
     );

     var size = chunkTable.chunks[0].length + chunkTable.entries.length * 2;
     return size;
   }

   var chunkTable = IndirectTable([table.sublist(0, table.length)], []);
   var size = optimizeTable(chunkTable, lowChunkSize, highChunkSize);
   if (verbose) {
     stderr.writeln('Default chunk size: $lowChunkSize/$highChunkSize: $size');
   }
   if (optimize) {
     // Chunk sizes must be powers of 2.
     // Smaller chunk sizes gives more smaller chunks,
     // with more chance of overlap,
     // but each chunks adds an entry to the index table.
     for (var low in [64, 128, 32, 256]) {
       for (var high in [512, 1024, 256, 2048]) {
         if (low == lowChunkSize && high == highChunkSize) continue;
         var newChunk = IndirectTable([table.sublist(0, table.length)], []);
         var newSize = optimizeTable(newChunk, low, high);
         if (verbose) {
           var delta = newSize - size;
           stderr.writeln(
             "${size < newSize ? "Worse" : "Better"}"
             ' chunk size: $low/$high: $newSize '
             "(${delta > 0 ? "+$delta" : delta})",
           );
         }
         if (newSize < size) {
           lowChunkSize = low;
           highChunkSize = high;
           chunkTable = newChunk;
           size = newSize;
         }
       }
     }
     if (verbose) {
       stderr.writeln('Best low chunk size: $lowChunkSize');
       stderr.writeln('Best high chunk size: $highChunkSize');
       stderr.writeln('Best table size: $size');
     }
   }

   var buffer = StringBuffer();
   writeHeader(buffer, [
     graphemeTestData,
     emojiTestData,
     graphemeBreakPropertyData,
   ]);
   buffer.writeln();

   writeTables(
     buffer,
     chunkTable,
     lowChunkSize,
     highChunkSize,
     verbose: verbose,
   );

   writeForwardAutomaton(buffer, verbose: verbose);
   buffer.writeln();

   writeBackwardAutomaton(buffer, verbose: verbose);

   if (output == null) {
     stdout.write(buffer);
   } else if (!dryrun) {
     output.writeAsStringSync(buffer.toString());
   }
 }

 // -----------------------------------------------------------------------------
 // Combined table writing.
 void writeTables(
   StringSink out,
   IndirectTable table,
   int lowChunkSize,
   int highChunkSize, {
   required bool verbose,
 }) {
   assert(table.chunks.length == 1);
   _writeStringLiteral(out, '_data', table.chunks[0], verbose: verbose);
   _writeStringLiteral(
     out,
     '_start',
     table.entries.map((e) => e.start).toList(),
     verbose: verbose,
   );
   _writeLookupFunction(out, '_data', '_start', lowChunkSize);
   out.writeln();
   _writeSurrogateLookupFunction(
     out,
     '_data',
     '_start',
     65536 ~/ lowChunkSize,
     highChunkSize,
   );
   out.writeln();
 }

 void _writeStringLiteral(
   StringSink out,
   String name,
   List<int> data, {
   required bool verbose,
 }) {
   var prefix = 'const String $name = ';
   out.write(prefix);
   var writer = StringLiteralWriter(out, padding: 4, escape: _needsEscape);
   writer.start(prefix.length);
   var bytes = 0;
   for (var i = 0; i < data.length; i++) {
     var char = data[i];
     writer.add(char);
     bytes += char <= 0xFF ? 1 : 2;
   }
   writer.end();
   out.write(';\n');
   if (verbose) {
     stderr.writeln('Writing $bytes bytes');
   }
 }

 bool _needsEscape(int codeUnit) =>
     codeUnit > 0xff || codeUnit == 0x7f || codeUnit & 0x60 == 0;

 void _writeLookupFunction(
   StringSink out,
   String dataName,
   String startName,
   int chunkSize,
 ) {
   out.write(_lookupMethod('low', dataName, startName, chunkSize));
 }

 void _writeSurrogateLookupFunction(
   StringSink out,
   String dataName,
   String startName,
   int startOffset,
   int chunkSize,
 ) {
   out.write(
     _lookupSurrogatesMethod(
       'high',
       dataName,
       startName,
       startOffset,
       chunkSize,
     ),
   );
 }

 String _lookupMethod(
   String name,
   String dataName,
   String startName,
   int chunkSize,
 ) =>
     '''
 $preferInline
 int $name(int codeUnit) {
   var chunkStart = $startName.codeUnitAt(codeUnit >> ${chunkSize.bitLength - 1});
   var index = chunkStart + (codeUnit & ${chunkSize - 1});
   return $dataName.codeUnitAt(index);
 }
 ''';

 String _lookupSurrogatesMethod(
   String name,
   String dataName,
   String startName,
   int startOffset,
   int chunkSize,
 ) {
   if (chunkSize == 1024) {
     return '''
 $preferInline
 int $name(int lead, int tail) {
   var chunkStart = $startName.codeUnitAt($startOffset + (0x3ff & lead));
   var index = chunkStart + (0x3ff & tail);
   return $dataName.codeUnitAt(index);
 }
 ''';
   }
   var shift = chunkSize.bitLength - 1;
   var indexVar = chunkSize < 1024 ? 'tail' : 'offset';
   return '''
 $preferInline
 int $name(int lead, int tail) {
   var offset = (((0x3ff & lead) << 10) + (0x3ff & tail)) + ($startOffset << $shift);
   var chunkStart = $startName.codeUnitAt(offset >> $shift);
   var index = chunkStart + ($indexVar & ${chunkSize - 1});
   return $dataName.codeUnitAt(index);
 }
 ''';
 }

 // -----------------------------------------------------------------------------
 bool _validate(
   Uint8List table,
   IndirectTable indirectTable,
   int lowChunkSize,
   int highChunkSize, {
   required bool verbose,
 }) {
   var lowChunkCount = 65536 ~/ lowChunkSize;
   var lowChunkShift = lowChunkSize.bitLength - 1;
   var lowChunkMask = lowChunkSize - 1;
   for (var i = 0; i < 65536; i++) {
     var value = table[i];
     var entryIndex = i >> lowChunkShift;
     var entry = indirectTable.entries[entryIndex];
     var indirectValue = indirectTable.chunks[entry.chunkNumber]
         [entry.start + (i & lowChunkMask)];
     if (value != indirectValue) {
       stderr.writeln('$entryIndex: $entry');
       stderr.writeln(
         'Error: ${i.toRadixString(16)} -> Expected $value,'
         ' was $indirectValue',
       );
       printIndirectTable(indirectTable);
       return false;
     }
   }
   var highChunkShift = highChunkSize.bitLength - 1;
   var highChunkMask = highChunkSize - 1;
   for (var i = 0x10000; i < 0x110000; i++) {
     var j = i - 0x10000;
     var value = table[i];
     var entryIndex = lowChunkCount + (j >> highChunkShift);
     var entry = indirectTable.entries[entryIndex];
     var indirectValue = indirectTable.chunks[entry.chunkNumber]
         [entry.start + (j & highChunkMask)];
     if (value != indirectValue) {
       stderr.writeln('$entryIndex: $entry');
       stderr.writeln(
         'Error: ${i.toRadixString(16)} -> Expected $value,'
         ' was $indirectValue',
       );
       printIndirectTable(indirectTable);
       return false;
     }
   }
   if (verbose) {
     stderr.writeln('Table validation success');
   }
   return true;
 }

 void printIndirectTable(IndirectTable table) {
   stderr.writeln(
     "IT(chunks: ${table.chunks.map((x) => "#${x.length}")},"
     ' entries: ${table.entries}',
   );
 }
	// Copyright (c) 2018, the Dart project authors. Please see the AUTHORS file
	// for details. All rights reserved. Use of this source code is governed by a
	// BSD-style license that can be found in the LICENSE file.

	import 'dart:io';
	import 'dart:typed_data';

	import '../src/args.dart';
	import '../src/automaton_builder.dart';
	import '../src/data_files.dart';
	import '../src/grapheme_category_loader.dart';
	import '../src/indirect_table.dart';
	import '../src/shared.dart';
	import '../src/string_literal_writer.dart';
	import '../src/table_builder.dart';

	// Generates tables used by the grapheme cluster breaking algorithm
	// and a state machine used to implement the algorithm.

	// The task of this tool is to take the complete table of
	// grapheme cluster categories for every code point (U+0000 ... U+10FFFF)
	// and build a smaller table with the same information.
	//
	// The approach taken is to split the large table into chunks,
	// smaller chunks for data in the BMP (U+0000 ... U+FFFF)
	// which have higher variation than later data,
	// and larger chunks for non-BMP ("astral" planes) code points.
	// This also corresponds to the split between one-UTF-16 code unit
	// character and surrogate pairs, which gives us a natural
	// branch in the string parsing code.
	//
	// Then a single table is built containing all these chunks, but where
	// the chunks are allowed to overlap.
	// An indirection table points to the start of each chunk
	// in the larger table.
	//
	// Having many small chunks increases the size of the indirection table,
	// and large chunks reduces the chance of chunks being completely
	// equivalent.
	//
	// The state machines are based on the extended Grapheme Cluster breaking
	// algorithm. The forward-scanning state machine is entirely regular.
	// The backwards-scanning state machine needs to call out to do look-ahead
	// in some cases (for example, how to combine a regional identifier
	// depends on whether there is an odd or even number of
	// previous regional identifiers.)

	/// Print more information to stderr while generating.
	///
	/// Set while developing if you don't want to pass `-v` on the command line
	/// every time.
	/// Only affects this file when it's run directly.
	const defaultVerbose = false;

	/// Default location for table file.
	const tableFile = 'lib/src/grapheme_clusters/table.dart';

	// Best values found for current tables.
	// Update if better value found when updating data files.
	// (May consider benchmark performance as well as size.)

	// TODO: Write out best sizes to a file after an update, and read them back
	// next time, instead of hardcoding in the source file.

	// Chunk sizes must be powers of 2.
	const int defaultLowChunkSize = 32;

	// Currently found best size.
	const int defaultHighChunkSize = 256;

	void main(List<String> args) async {
	var flags = parseArgs(args, 'generate_tables', allowOptimize: true);
	var output = flags.dryrun
	? null
	: flags.targetFile ?? File(path(packageRoot, tableFile));

	if (output != null && !output.existsSync()) {
	try {
	output.createSync(recursive: true);
	} catch (e) {
	stderr.writeln('Cannot find or create file: ${output.path}');
	stderr.writeln('Writing to stdout');
	output = null;
	}
	}

	var categories = await loadCategories(
	update: flags.update,
	verbose: flags.verbose,
	);

	generateTables(
	output,
	categories,
	dryrun: flags.dryrun,
	verbose: flags.verbose,
	optimize: flags.optimize,
	);
	}

	void generateTables(
	File? output,
	Uint8List table, {
	bool dryrun = false,
	bool optimize = false,
	bool verbose = defaultVerbose,
	bool acceptLicenseChange = false,
	}) async {
	var lowChunkSize = defaultLowChunkSize;
	var highChunkSize = defaultHighChunkSize;

	int optimizeTable(
	IndirectTable chunkTable,
	int lowChunkSize,
	int highChunkSize,
	) {
	var index = 0;
	do {
	chunkTable.entries.add(TableEntry(0, index, lowChunkSize));
	index += lowChunkSize;
	} while (index < 0x10000);
	var lowChunkCount = chunkTable.entries.length;
	do {
	chunkTable.entries.add(TableEntry(0, index, highChunkSize));
	index += highChunkSize;
	} while (index < 0x110000);
	var highChunkCount = chunkTable.entries.length - lowChunkCount;
	assert(
	lowChunkCount * lowChunkSize + highChunkCount * highChunkSize == 0x110000,
	);
	assert(chunkTable.chunks.length == 1);
	assert(
	_validate(table, chunkTable, lowChunkSize, highChunkSize, verbose: false),
	);

	chunkifyTable(chunkTable);
	assert(chunkTable.entries.length == lowChunkCount + highChunkCount);
	assert(
	_validate(table, chunkTable, lowChunkSize, highChunkSize, verbose: false),
	);

	combineChunkedTable(chunkTable);
	assert(chunkTable.entries.length == lowChunkCount + highChunkCount);
	assert(chunkTable.chunks.length == 1);
	assert(
	_validate(table, chunkTable, lowChunkSize, highChunkSize, verbose: false),
	);

	var size = chunkTable.chunks[0].length + chunkTable.entries.length * 2;
	return size;
	}

	var chunkTable = IndirectTable([table.sublist(0, table.length)], []);
	var size = optimizeTable(chunkTable, lowChunkSize, highChunkSize);
	if (verbose) {
	stderr.writeln('Default chunk size: $lowChunkSize/$highChunkSize: $size');
	}
	if (optimize) {
	// Chunk sizes must be powers of 2.
	// Smaller chunk sizes gives more smaller chunks,
	// with more chance of overlap,
	// but each chunks adds an entry to the index table.
	for (var low in [64, 128, 32, 256]) {
	for (var high in [512, 1024, 256, 2048]) {
	if (low == lowChunkSize && high == highChunkSize) continue;
	var newChunk = IndirectTable([table.sublist(0, table.length)], []);
	var newSize = optimizeTable(newChunk, low, high);
	if (verbose) {
	var delta = newSize - size;
	stderr.writeln(
	"${size < newSize ? "Worse" : "Better"}"
	' chunk size: $low/$high: $newSize '
	"(${delta > 0 ? "+$delta" : delta})",
	);
	}
	if (newSize < size) {
	lowChunkSize = low;
	highChunkSize = high;
	chunkTable = newChunk;
	size = newSize;
	}
	}
	}
	if (verbose) {
	stderr.writeln('Best low chunk size: $lowChunkSize');
	stderr.writeln('Best high chunk size: $highChunkSize');
	stderr.writeln('Best table size: $size');
	}
	}

	var buffer = StringBuffer();
	writeHeader(buffer, [
	graphemeTestData,
	emojiTestData,
	graphemeBreakPropertyData,
	]);
	buffer.writeln();

	writeTables(
	buffer,
	chunkTable,
	lowChunkSize,
	highChunkSize,
	verbose: verbose,
	);

	writeForwardAutomaton(buffer, verbose: verbose);
	buffer.writeln();

	writeBackwardAutomaton(buffer, verbose: verbose);

	if (output == null) {
	stdout.write(buffer);
	} else if (!dryrun) {
	output.writeAsStringSync(buffer.toString());
	}
	}

	// -----------------------------------------------------------------------------
	// Combined table writing.
	void writeTables(
	StringSink out,
	IndirectTable table,
	int lowChunkSize,
	int highChunkSize, {
	required bool verbose,
	}) {
	assert(table.chunks.length == 1);
	_writeStringLiteral(out, '_data', table.chunks[0], verbose: verbose);
	_writeStringLiteral(
	out,
	'_start',
	table.entries.map((e) => e.start).toList(),
	verbose: verbose,
	);
	_writeLookupFunction(out, '_data', '_start', lowChunkSize);
	out.writeln();
	_writeSurrogateLookupFunction(
	out,
	'_data',
	'_start',
	65536 ~/ lowChunkSize,
	highChunkSize,
	);
	out.writeln();
	}

	void _writeStringLiteral(
	StringSink out,
	String name,
	List<int> data, {
	required bool verbose,
	}) {
	var prefix = 'const String $name = ';
	out.write(prefix);
	var writer = StringLiteralWriter(out, padding: 4, escape: _needsEscape);
	writer.start(prefix.length);
	var bytes = 0;
	for (var i = 0; i < data.length; i++) {
	var char = data[i];
	writer.add(char);
	bytes += char <= 0xFF ? 1 : 2;
	}
	writer.end();
	out.write(';\n');
	if (verbose) {
	stderr.writeln('Writing $bytes bytes');
	}
	}

	bool _needsEscape(int codeUnit) =>
	codeUnit > 0xff \|\| codeUnit == 0x7f \|\| codeUnit & 0x60 == 0;

	void _writeLookupFunction(
	StringSink out,
	String dataName,
	String startName,
	int chunkSize,
	) {
	out.write(_lookupMethod('low', dataName, startName, chunkSize));
	}

	void _writeSurrogateLookupFunction(
	StringSink out,
	String dataName,
	String startName,
	int startOffset,
	int chunkSize,
	) {
	out.write(
	_lookupSurrogatesMethod(
	'high',
	dataName,
	startName,
	startOffset,
	chunkSize,
	),
	);
	}

	String _lookupMethod(
	String name,
	String dataName,
	String startName,
	int chunkSize,
	) =>
	'''
	$preferInline
	int $name(int codeUnit) {
	var chunkStart = $startName.codeUnitAt(codeUnit >> ${chunkSize.bitLength - 1});
	var index = chunkStart + (codeUnit & ${chunkSize - 1});
	return $dataName.codeUnitAt(index);
	}
	''';

	String _lookupSurrogatesMethod(
	String name,
	String dataName,
	String startName,
	int startOffset,
	int chunkSize,
	) {
	if (chunkSize == 1024) {
	return '''
	$preferInline
	int $name(int lead, int tail) {
	var chunkStart = $startName.codeUnitAt($startOffset + (0x3ff & lead));
	var index = chunkStart + (0x3ff & tail);
	return $dataName.codeUnitAt(index);
	}
	''';
	}
	var shift = chunkSize.bitLength - 1;
	var indexVar = chunkSize < 1024 ? 'tail' : 'offset';
	return '''
	$preferInline
	int $name(int lead, int tail) {
	var offset = (((0x3ff & lead) << 10) + (0x3ff & tail)) + ($startOffset << $shift);
	var chunkStart = $startName.codeUnitAt(offset >> $shift);
	var index = chunkStart + ($indexVar & ${chunkSize - 1});
	return $dataName.codeUnitAt(index);
	}
	''';
	}

	// -----------------------------------------------------------------------------
	bool _validate(
	Uint8List table,
	IndirectTable indirectTable,
	int lowChunkSize,
	int highChunkSize, {
	required bool verbose,
	}) {
	var lowChunkCount = 65536 ~/ lowChunkSize;
	var lowChunkShift = lowChunkSize.bitLength - 1;
	var lowChunkMask = lowChunkSize - 1;
	for (var i = 0; i < 65536; i++) {
	var value = table[i];
	var entryIndex = i >> lowChunkShift;
	var entry = indirectTable.entries[entryIndex];
	var indirectValue = indirectTable.chunks[entry.chunkNumber]
	[entry.start + (i & lowChunkMask)];
	if (value != indirectValue) {
	stderr.writeln('$entryIndex: $entry');
	stderr.writeln(
	'Error: ${i.toRadixString(16)} -> Expected $value,'
	' was $indirectValue',
	);
	printIndirectTable(indirectTable);
	return false;
	}
	}
	var highChunkShift = highChunkSize.bitLength - 1;
	var highChunkMask = highChunkSize - 1;
	for (var i = 0x10000; i < 0x110000; i++) {
	var j = i - 0x10000;
	var value = table[i];
	var entryIndex = lowChunkCount + (j >> highChunkShift);
	var entry = indirectTable.entries[entryIndex];
	var indirectValue = indirectTable.chunks[entry.chunkNumber]
	[entry.start + (j & highChunkMask)];
	if (value != indirectValue) {
	stderr.writeln('$entryIndex: $entry');
	stderr.writeln(
	'Error: ${i.toRadixString(16)} -> Expected $value,'
	' was $indirectValue',
	);
	printIndirectTable(indirectTable);
	return false;
	}
	}
	if (verbose) {
	stderr.writeln('Table validation success');
	}
	return true;
	}

	void printIndirectTable(IndirectTable table) {
	stderr.writeln(
	"IT(chunks: ${table.chunks.map((x) => "#${x.length}")},"
	' entries: ${table.entries}',
	);
	}