blob: 7f42d922db99ebd48e54eaa011c9fd61cc4b72b5 [file] [log] [blame]
// Copyright (c) 2018, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
// Character categories of the Unicode Grapheme Breaking Algorithm.
//
// The numeric order has no bearing on correctness. It is picked to keep the
// generated table strings small, by avoiding many bytes that need escapes.
/// Category `CR`.
const int categoryCR = 0;
/// Category `Control`.
const int categoryControl = 1;
/// Any character not in any other category.
const int categoryOther = 2;
/// Category `Extend`.
const int categoryExtend = 3;
/// Category `SpacingMark`.
const int categorySpacingMark = 4;
/// Category `RegionalIndicator`.
const int categoryRegionalIndicator = 5;
/// Category `Pictographic`.
const int categoryPictographic = 6;
/// Category `LF`.
const int categoryLF = 7;
/// Category `Prepend`.
const int categoryPrepend = 8;
/// Category `L`.
const int categoryL = 9;
/// Category `V`.
const int categoryV = 10;
/// Category `T`.
const int categoryT = 11;
/// Category `LV`.
const int categoryLV = 12;
/// Category `LVT`.
const int categoryLVT = 13;
/// Other + InCB=Consonant.
const int categoryOtherIndicConsonant = 14;
/// Category `ZWJ`. Is also InCB=Extend.
const int categoryZWJ = 15;
/// Extend + InCB=Extend.
const int categoryExtendIndicExtend = 16;
/// Extend + InCB=Linked.
const int categoryExtendIndicLinked = 17;
/// End of Text (synthetic input).
const int categoryEoT = 18;
/// Number of categories, the synthetic [categoryEoT] included.
const int categoryCount = categoryEoT + 1;
/// Number of categories below the synthetic [categoryEoT].
const int inputCategoryCount = categoryCount - 1;
/// Code point of regional indicator `A`.
const int regionalIndicatorStart = 0x1F1E6;
/// Code point of regional indicator `Z`, 25 positions after `A`.
const int regionalIndicatorEnd = regionalIndicatorStart + 25;
// Automaton states for forwards automaton.
/// Value or'ed to the automaton output if there should not be a break
/// before the most recent input character (the break bit stays clear).
const int flagNoBreak = 0;
/// Break bit: the counterpart of [flagNoBreak], with the low bit set.
const int flagBreak = 1 << 0;
/// Mask selecting the break bit of an automaton output.
const int maskBreak = flagBreak;
/// Extra bit used to trigger or modify the effect of lookahead/lookbehind.
///
/// Requires [automatonRowLength] to be a multiple of 4.
/// It is currently 20.
const int flagLookahead = 1 << 1;
/// Mask selecting the lookahead bit of an automaton output.
const int maskLookahead = flagLookahead;
/// Mask of all the low flag bits of an entry in the automatons.
const int maskFlags = maskBreak | maskLookahead;
/// Mask of an entry in the automatons without the low flag bits.
const int maskState = ~maskFlags;
// For complex lookahead (Indic Ext/Lnk+Con, ZWJ+PIC), where to put the
// breaks and cursor afterwards.
/// Neither flag bit set.
const int flagLookaheadBreakNone = flagNoBreak;
/// Only the break bit set.
const int flagLookaheadBreakEarly = flagBreak;
/// Only the lookahead bit set. Not used.
const int flagLookaheadBreakLate = flagNoBreak | flagLookahead;
/// Both the break bit and the lookahead bit set.
const int flagLookaheadBreakBoth = flagBreak | flagLookahead;
/// Automaton row length: [categoryCount] rounded up to a multiple of
/// `maskFlags + 1`, so that the state value has room for flags in the
/// low bits.
/// (Rather than having to right-shift the state to find the
/// table entry.)
/// All state integers are multiples of this value.
///
/// The rounding adds [maskFlags] and then clears the flag bits with
/// [maskState]; this relies on `maskFlags + 1` being a power of two.
const int automatonRowLength = (categoryCount + maskFlags) & maskState;
// Let states be the position of their entries in the automaton data.
// States of forwards automaton ---------------------------------------
// Every state constant is a multiple of `automatonRowLength`: it is the
// value of that state as it occurs in the automaton tables (and it is
// also an index into the automaton tables).
/// Always break before next.
const int stateBreak = 0x00 * automatonRowLength;
/// Break unless next is LF.
const int stateCR = 0x01 * automatonRowLength;
/// Break unless next is Extend, ZWJ, SpacingMark.
const int stateOther = 0x02 * automatonRowLength;
/// Break only if next is Control/CR/LF/eot.
const int statePrepend = 0x03 * automatonRowLength;
/// As Other unless next is L, V, LV, LVT.
///
/// Seen `L+`.
const int stateL = 0x04 * automatonRowLength;
/// As Other unless next is V, T.
///
/// Seen `L* (LV|V) V*`.
const int stateV = 0x05 * automatonRowLength;
/// As Other unless next is T.
///
/// Seen `L*(LV?V*T|LVT)T*`.
const int stateT = 0x06 * automatonRowLength;
/// As Other unless followed by Ext* ZWJ Pic.
const int statePictographic = 0x07 * automatonRowLength;
/// As Other unless followed by Pic.
const int statePictographicZWJ = 0x08 * automatonRowLength;
/// As Other unless followed by RI.
///
/// Unknown whether there is an even or odd number of prior RIs.
const int stateRegionalSingle = 0x09 * automatonRowLength;
/// As Other unless next is InCB=Extend|Linked.
///
/// Has seen `{InCB=Consonant} {InCB=Extend}*`.
const int stateInC = 0x0A * automatonRowLength;
/// As Other unless next is InCB=Extend|Linked|Consonant.
///
/// Seen `{InCB=Consonant} {InCB=Extend}* {InCB=Linked} {InCB=Extend|Linked}*`.
/// Don't break before a following `{InCB=Consonant}`.
/// (Not used in backwards automaton).
const int stateInCL = 0x0B * automatonRowLength;
/// As SoT, but never cause break before next character.
///
/// Not reachable in automaton, only used as start state.
/// Used internally at start of inputs, which is automatically considered a
/// break anyway.
const int stateSoTNoBreak = 0x0C * automatonRowLength;
/// Start of text (or known start of grapheme).
///
/// Not reachable in automaton, only used as start state.
const int stateSoT = 0x0D * automatonRowLength;
// Context-unaware states in forward automaton.
// States that do not know what is behind the current Ext{InCB=?}+ZWJ
// sequence, and which may need to trigger a look-behind in some cases.
/// Start of context-unaware lookahead, no characters seen.
const int stateCAny = 0x0E * automatonRowLength;
/// Seen ZWJ only, as the first (prior) character.
const int stateCZWJ = 0x0F * automatonRowLength;
/// Seen Extend{InCB=Extend}+ only.
const int stateCIE = 0x10 * automatonRowLength;
/// Seen Extend{InCB=Extend|Linked}+, with at least one Linked.
const int stateCIL = 0x11 * automatonRowLength;
/// Seen Extend{InCB=Extend}+ + ZWJ.
const int stateCIEZ = 0x12 * automatonRowLength;
/// Seen Extend{InCB=Extend|Linked}+ + ZWJ with at least one Linked.
const int stateCILZ = 0x13 * automatonRowLength;
/// Seen (Extend{InCB=Extend}|ZWJ)+ with at least one non-trailing ZWJ.
const int stateCZIE = 0x14 * automatonRowLength;
/// Seen (Extend{InCB=Extend|Linked}|ZWJ)+
/// with at least one non-trailing ZWJ and at least one Linked.
const int stateCZIL = 0x15 * automatonRowLength;
/// Seen Extend{InCB=?}+ with at least one Extend{InCB=None}.
const int stateCExt = 0x16 * automatonRowLength;
/// Seen Extend{InCB=?}+ + ZWJ with at least one Extend{InCB=None}.
const int stateCExZ = 0x17 * automatonRowLength;
/// Seen single RegionalIndicator only.
const int stateCReg = 0x18 * automatonRowLength;
// --------------------------------------------------------------------
/// First state which might trigger look-behind.
///
/// Same value as [stateCAny], the lowest context-unaware state.
const int stateMinContextUnaware = stateCAny;
/// Limit on the state values of the forward automaton.
///
/// One row past [stateCReg], the highest forward state; every forward state
/// value is below this. Since states are multiples of `automatonRowLength`,
/// this is the number of forward states times the row length.
const int stateLimit = stateCReg + automatonRowLength;
// ---------------------------------------------------------------------
// Backwards Automaton extra/alternative states and categories.
//
// Reuses state positions that are not used in backwards search,
// possibly because they are replaced by look-behind.
// Each constant below is an alias: it has the same value as the forward
// constant it is assigned from.
/// Start of Text (synthetic input). Same value as [categoryEoT].
const int categorySoT = categoryEoT;
/// Start of text (or grapheme). Same value as [stateSoT].
const int stateEoT = stateSoT;
/// Break unless prev is CR. Same value as [stateCR].
const int stateLF = stateCR;
/// Only break if prev is Control/CR/LF/sot. Same value as [statePrepend].
const int stateExtend = statePrepend;
/// As EoT but never cause break before. Same value as [stateSoTNoBreak].
const int stateEoTNoBreak = stateSoTNoBreak;
/// There is an even number of RIs before.
///
/// Reuses the position of [stateInCL], which is not used in the backwards
/// automaton.
const int stateRegionalEven = stateInCL;
/// There is an odd (non-zero!) number of RIs before.
///
/// Reuses the position of [statePictographicZWJ].
const int stateRegionalOdd = statePictographicZWJ;
// Backwards automaton sometimes needs to perform lookahead.
// The rules for grapheme cluster breaking can depend on knowing
// the categories of multiple *prior* code points. When getting to such a point
// during backwards movement, the automaton breaks out and runs specialized
// code that looks back on prior characters to decide whether the current
// position should break.
// (TODO: Also allow updating the position if it's known where the next break
// is in the scanned characters.)
// It triggers that by entering a synthetic state.
// After doing the lookahead, that synthetic state is replaced by a
// conventional state that allows it to proceed.
// The extra states are not part of the state machine.
/// Minimum state requesting a look-ahead.
///
/// Refers to [stateLookaheadZWJPictographic], which is declared just below
/// and has the lowest value of the look-ahead states.
const int stateLookaheadMin = stateLookaheadZWJPictographic;
/// State requesting a look-ahead for Pic Ext*.
const int stateLookaheadZWJPictographic = 0x0E * automatonRowLength;
/// State requesting a look-ahead for InCB consonant + InCB (Extend + Linked)+
/// with at least one Linked.
const int stateLookaheadInC = 0x0F * automatonRowLength;
/// State requesting a look-ahead for InCB consonant + InCB (Extend + Linked)+
/// ending with a Linked.
const int stateLookaheadInCL = 0x10 * automatonRowLength;
/// Look-ahead state for regional indicators, having seen an even number.
const int stateLookaheadRegionalEven = 0x11 * automatonRowLength;
/// Look-ahead state for regional indicators, having seen an odd number.
const int stateLookaheadRegionalOdd = 0x12 * automatonRowLength;
/// Limit on the entries of states in backwards automaton.
///
/// One row past [stateLookaheadRegionalOdd], the highest state declared here.
const int backStateLimit = stateLookaheadRegionalOdd + automatonRowLength;