| // BSD-style license that can be found in the LICENSE file. |
| |
| /// API for accessing the generated state and category tables. |
| /// |
| /// Use only [Breaks], [BackBreaks] and [nextBreak] from files other than this. |
| /// Do not use any function from `table.dart` directly. |
| /// (In both cases this package's own testing is exempted.) |
| |
| library; |
| |
| import 'constants.dart'; |
| import 'table.dart'; |
| |
| /// Iterates grapheme cluster breaks of a string. |
| /// |
| /// Iterates the grapheme cluster breaks of the substring of |
| /// [base] from [cursor] to [end]. |
| /// |
| /// To iterate a substring, use: |
| /// ```dart |
| /// var breaks = Breaks(string, start, end, stateSoT); |
| /// int brk = 0; |
| /// while((brk = breaks.nextBreak) >= 0) { |
| /// print('Break at index $brk'); |
| /// } |
| /// ``` |
| /// If you use [stateSoTNoBreak] instead of [stateSoT], the |
| /// initial break between the start-of-text and the first grapheme |
| /// is suppressed. |
| class Breaks { |
| /// Text being iterated. |
| final String base; |
| |
| /// end of substring of [base] being iterated. |
| final int end; |
| |
| /// Position of the first yet-unprocessed code point. |
| int cursor; |
| |
| /// Current state based on code points processed so far. |
| /// |
| /// A state value is a multiple of [automatonRowLength] plus possibly |
| /// a few bits of flags. |
| int state; |
| |
| Breaks(this.base, this.cursor, this.end, this.state); |
| |
| /// Creates a copy of the current iteration, at the exact same state. |
| Breaks copy() => Breaks(base, cursor, end, state); |
| |
| /// The index of the next grapheme cluster break in last-to-first index order. |
| /// |
| /// Returns a negative number if there are no further breaks, |
| /// which means that [cursor] has reached [end]. |
| /// |
| /// Also stops if reaching a state with the `flagLookahead` bit set, |
| /// with the returned position being before the character which triggered |
| /// that look-behind. |
| /// If the state is not one which can trigger a look-behind, the exit position |
| /// is always the next break (if any, or -1 if none, which only happens on |
| /// empty strings.) |
| int nextBreak() { |
| while (cursor < end) { |
| var breakAt = cursor; |
| step(); |
| if (state & maskFlags != flagNoBreak) { |
| return breakAt; |
| } |
| } |
| state = move(state, categoryEoT); |
| if (state & maskFlags != flagNoBreak) return cursor; |
| return -1; |
| } |
| |
| /// Takes one step forward in the state machine. |
| void step() { |
| assert(cursor < end); |
| var char = base.codeUnitAt(cursor++); |
| var surrogate = char ^ 0xD800; |
| if (surrogate > 0x3FF) { |
| state = move(state, low(char)); |
| return; |
| } |
| // The category of an unpaired lead surrogate is Control. |
| int category; |
| int nextSurrogate; |
| if (cursor < end && |
| (nextSurrogate = base.codeUnitAt(cursor) ^ 0xDC00) <= 0x3FF) { |
| category = high(surrogate, nextSurrogate); |
| cursor++; |
| } else { |
| category = categoryControl; |
| } |
| state = move(state, category); |
| } |
| |
| /// Start with no knowledge about the position at [cursor]. |
| /// |
| /// Starts from state `CAny` and takes one step based on the |
| /// latest character starting earlier than [cursor]. |
| /// |
| /// Can be used if the [cursor] isn't even known to be at |
| /// a grapheme cluster boundary. |
| /// |
| /// Returns the start of that prior character, which is |
| /// one of `cursor` (if cursor is at start of input) or |
| /// `cursor - 1`or `cursor - 2` depending whether |
| /// it is a surrogate pair, and if so, where it ends. |
| int _unknownPositionFirstStep(int start) { |
| if (cursor == start) { |
| state = stateSoTNoBreak; |
| return cursor; |
| } |
| var cursorBefore = cursor - 1; |
| var prevChar = base.codeUnitAt(cursorBefore); |
| var prevSurrogate = prevChar ^ 0xD800; |
| if (prevSurrogate > 0x7FF) { |
| // Not surrogate. |
| var prevCategory = low(prevChar); |
| state = move(stateCAny, prevCategory); |
| return cursorBefore; |
| } |
| int prevCategory; |
| if (prevSurrogate > 0x3FF) { |
| // Tail surrogate, check for prior lead surrogate. |
| int leadSurrogate; |
| var leadIndex = cursorBefore - 1; |
| prevSurrogate &= 0x3FF; |
| if (leadIndex >= start && |
| (leadSurrogate = base.codeUnitAt(leadIndex) ^ 0xD800) <= 0x3FF) { |
| prevCategory = high(leadSurrogate, prevSurrogate); |
| cursorBefore = leadIndex; |
| } else { |
| prevCategory = categoryControl; |
| } |
| } else { |
| // Lead surrogate. Check for a following tail surrogate. |
| int tailSurrogate; |
| if (cursor < end && |
| (tailSurrogate = base.codeUnitAt(cursor) ^ 0xDC00) <= 0x3FF) { |
| cursor += 1; |
| prevCategory = high(prevSurrogate, tailSurrogate); |
| } else { |
| prevCategory = categoryControl; |
| } |
| } |
| state = move(stateCAny, prevCategory); |
| return cursorBefore; |
| } |
| } |
| |
| /// Iterates grapheme cluster breaks backwards. |
| /// |
| /// Given a substring of a [base] string from [start] to [cursor], |
| /// iterates the grapheme cluster breaks from [cursor] to [start]. |
| /// |
| /// To iterate a substring, do |
| /// ```dart |
| /// var breaks = BackBreaks(string, start, end, idStateEoT); |
| /// int brk = 0; |
| /// while ((brk = breaks.nextBreak()) >= 0) { |
| /// print('Break at index $brk'); |
| /// } |
| /// ``` |
| /// If the initial [state] is [stateEoTNoBreak] instead of [stateEoT], |
| /// the initial break between the last grapheme and the end-of-text |
| /// is suppressed. |
| class BackBreaks { |
| /// Text being iterated. |
| final String base; |
| |
| /// Start of substring of [base] being iterated. |
| final int start; |
| |
| /// Position after the last yet-unprocessed code point. |
| int cursor; |
| |
| /// Current state based on code points processed so far. |
| int state; |
| |
| BackBreaks(this.base, this.cursor, this.start, this.state); |
| |
| BackBreaks copy() => BackBreaks(base, cursor, start, state); |
| |
| /// The index of the next grapheme cluster break in first-to-last index order. |
| /// |
| /// Returns a negative number if there are no further breaks, |
| /// which means that [cursor] was already at [start]. |
| int nextBreak() { |
| while (cursor > start) { |
| var breakAt = cursor; |
| step(); |
| if (state & maskFlags == flagNoBreak) { |
| continue; |
| } |
| if (state & maskLookahead != 0) { |
| _lookaheadInNextBreak(); |
| } |
| if (state & maskBreak != flagNoBreak) { |
| return breakAt; |
| } |
| } |
| state = moveBack(state, categoryEoT); |
| assert(state < stateLookaheadMin, state); |
| if (state & maskBreak != flagNoBreak) return cursor; |
| return -1; |
| } |
| |
| /// Reads a single code point before [cursor] and transition on it. |
| /// |
| /// Puts cursor before the code point. |
| void step() { |
| assert(cursor > start); |
| var char = base.codeUnitAt(--cursor); |
| var surrogate = char ^ 0xDC00; |
| if (surrogate > 0x3FF) { |
| var category = low(char); |
| state = moveBack(state, category); |
| return; |
| } |
| // Found tail surrogate, check for prior lead surrogate. |
| // The category of an unpaired tail surrogate is Control. |
| int category; |
| int prevSurrogate; |
| if (cursor >= start && |
| (prevSurrogate = base.codeUnitAt(--cursor) ^ 0xD800) <= 0x3FF) { |
| category = high(prevSurrogate, surrogate); |
| } else { |
| category = categoryControl; |
| cursor++; |
| } |
| state = moveBack(state, category); |
| } |
| |
| /// Steps back using lookahead states. |
| /// |
| /// Returns the position before the last scanned character. |
| /// (Because in some cases the next break will be at that point.) |
| int _lookahead() { |
| assert(state >= stateLookaheadMin); |
| while (cursor > start) { |
| var cursorBeforeLast = cursor; |
| step(); |
| if (state < stateLookaheadMin) return cursorBeforeLast; |
| } |
| state = moveBack(state, categorySoT); |
| assert(state < stateLookaheadMin, state); |
| return start; |
| } |
| |
| /// Called from [nextBreak] to perform a lookahead, and set the result state. |
| /// |
| /// After this call, the state has [flagBreak] set if it should break |
| /// between the two characters which triggered lookahead. |
| /// The state and cursor are set to a position prior to reaching the next |
| /// break. |
| void _lookaheadInNextBreak() { |
| assert(state >= stateLookaheadMin, state); |
| // To check if this was a regional lookahead afterwards. |
| var preState = state; |
| // Regional lookahead resets to this position. |
| var preCursor = cursor; |
| // Non-regional lookahead may reset to the position before the last seen, |
| // to avoid having to report two breaks. |
| var breakAt = _lookahead(); |
| |
| if (preState >= stateLookaheadRegionalEven) { |
| // Result is always one of one of flagBreak or flagNoBreak. |
| assert( |
| preState == (stateLookaheadRegionalOdd | flagLookahead) || |
| preState == (stateLookaheadRegionalEven | flagLookahead), |
| preState, |
| ); |
| assert( |
| state == (stateRegionalEven | flagNoBreak) || |
| state == (stateRegionalOdd | flagBreak), |
| ); |
| // Always reset cursor for regional lookahead. |
| // (Could detect stateRegionalOdd, decrease cursor two positions and |
| // switch to stateRegionalEven. Not worth the extra code.) |
| cursor = preCursor; |
| } else { |
| // Flags mean: |
| // flagNoBreak: Do not break before position, or before cursor. |
| // flagBreak: break at position before lookahead, keep cursor. |
| // flagLookahead: Not used. |
| // flagBreak+flagLookahead: Break at position before lookahead, |
| // set cursor to reread the last character before cursor |
| // (because it'll break again there.) |
| |
| // Keep cursor at or just before last read. |
| if (state & maskFlags == flagLookaheadBreakBoth) { |
| cursor = breakAt; |
| } |
| } |
| } |
| } |
| |
| /// Whether there is a grapheme cluster boundary before [index] in [text]. |
| /// |
| /// This is a low-level function. There is no validation of the arguments. |
| /// They should satisfy `0 <= start <= index <= end <= text.length`. |
| /// |
| /// Allows [index] to not be at a grapheme cluster boundary |
| /// (or even a code point boundary). |
| bool isGraphemeClusterBoundary(String text, int start, int end, int index) { |
| assert(0 <= start); |
| assert(start <= index); |
| assert(index <= end); |
| assert(end <= text.length); |
| // Uses the backwards automaton because answering the question |
| // might be answered by looking only at the code points around the |
| // index, but it may also require looking further back. It never |
| // requires looking further ahead, though. |
| // The backwards automaton is built for this use case. |
| // Most of the apparent complication in this function is merely dealing with |
| // surrogates. |
| if (start < index && index < end) { |
| var breaks = Breaks(text, index, end, stateCAny); |
| var cursorBefore = breaks._unknownPositionFirstStep(start); |
| // If cursor moved, index is in the middle of a surrogate pair. |
| if (breaks.cursor != index) return false; |
| breaks.step(); |
| if (breaks.state & maskBreak != flagNoBreak) { |
| return true; |
| } |
| if (breaks.state & maskLookahead == 0) return false; |
| assert(breaks.state >= stateLookaheadMin); |
| |
| var backBreaks = BackBreaks(text, cursorBefore, start, breaks.state); |
| backBreaks._lookahead(); |
| return (backBreaks.state & maskBreak != flagNoBreak); |
| } |
| return true; |
| } |
| |
| /// The most recent break no later than [index] in |
| /// `string.substring(start, end)`. |
| /// |
| /// Allows [index] to not be at a grapheme cluster boundary |
| /// (or even a code point boundary). |
| int previousBreak(String text, int start, int end, int index) { |
| assert(0 <= start); |
| assert(start <= index); |
| assert(index <= end); |
| assert(end <= text.length); |
| // First character ending after `index`. |
| // Accounts for an `index` in the middle of a surrogate pair. |
| if (start < index && index < end) { |
| var cursorBefore = index; |
| var nextChar = text.codeUnitAt(index); |
| var nextSurrogate = nextChar ^ 0xD800; |
| var category = categoryControl; |
| if (nextSurrogate > 0x7FF) { |
| category = low(nextChar); |
| } else if (nextSurrogate <= 0x3FF) { |
| var indexAfter = index + 1; |
| if (indexAfter < end) { |
| var secondSurrogate = text.codeUnitAt(indexAfter) ^ 0xDC00; |
| if (secondSurrogate <= 0x3FF) { |
| category = high(nextSurrogate, secondSurrogate); |
| } |
| } |
| } else { |
| var prevSurrogate = text.codeUnitAt(index - 1) ^ 0xD800; |
| nextSurrogate &= 0x3FF; |
| if (prevSurrogate <= 0x3FF) { |
| category = high(prevSurrogate, nextSurrogate); |
| cursorBefore -= 1; |
| } |
| } |
| return BackBreaks( |
| text, |
| cursorBefore, |
| start, |
| moveBack(stateEoTNoBreak, category), |
| ).nextBreak(); |
| } |
| return index; |
| } |
| |
| /// The next break no earlier than [index] in `string.substring(start, end)`. |
| /// |
| /// Allows [index] to not be at a grapheme cluster boundary |
| /// (or even a code point boundary). |
| int nextBreak(String text, int start, int end, int index) { |
| assert(0 <= start); |
| assert(start <= index); |
| assert(index <= end); |
| assert(end <= text.length); |
| // Always break at start or end (GB1). |
| if (index == start || index == end) return index; |
| var breaks = Breaks(text, index, end, stateCAny); |
| var cursorBefore = breaks._unknownPositionFirstStep(start); |
| var possibleBreak = breaks.nextBreak(); |
| assert(breaks.state & maskFlags != 0); |
| if (breaks.state & maskFlags == flagBreak) return possibleBreak; |
| var lookbehindState = breaks.state; |
| assert(lookbehindState & maskFlags == flagLookahead); |
| assert(lookbehindState & maskState >= stateLookaheadMin); |
| |
| var backBreaks = BackBreaks(text, cursorBefore, start, lookbehindState); |
| backBreaks._lookahead(); |
| if (backBreaks.state & maskBreak != flagNoBreak) { |
| return possibleBreak; |
| } |
| |
| // Find the correct forward category to continue with. |
| // There are only three possible character categories that can trigger |
| // a look-behind. |
| if (lookbehindState == stateLookaheadRegionalEven | flagLookahead) { |
| assert(backBreaks.state == stateRegionalEven); |
| // Started by RI+RI. |
| breaks.state = stateRegionalEven; |
| } else { |
| // Was triggered by ZWJ+Pic or InCB={Extend|Linked}+InCB=Consonant. |
| assert( |
| lookbehindState == (stateLookaheadZWJPictographic | flagLookahead) || |
| lookbehindState == (stateLookaheadInC | flagLookahead) || |
| lookbehindState == (stateLookaheadInCL | flagLookahead), |
| ); |
| // If starting in lookahead state ZWJ+Pic, and not breaking, |
| // final backwards state is Pic. |
| assert( |
| lookbehindState != (stateLookaheadZWJPictographic | flagLookahead) || |
| backBreaks.state == statePictographic, |
| ); |
| // If starting in lookahead state InC or InCL, and not breaking, |
| // final backwards state is Inc. |
| assert( |
| lookbehindState != (stateLookaheadInC | flagLookahead) && |
| lookbehindState != (stateLookaheadInCL | flagLookahead) || |
| backBreaks.state == stateInC, |
| ); |
| // In both cases, that's the same as the forward state |
| // at the point that triggered the look-behind. |
| breaks.state = backBreaks.state; |
| } |
| var result = breaks.nextBreak(); |
| assert(breaks.state & maskFlags == flagBreak); |
| return result; |
| } |