// blob: 6df70db3f1a505dfd361103fe8e1e5af5e8d48a0 [file] [log] [blame]
// Copyright 2013 The Flutter Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// @dart = 2.6
part of engine;
/// The direction in which [WordBreaker] walks through the text while looking
/// for a word break.
enum _FindBreakDirection {
/// Look for the word break by moving towards higher indices.
forward,
/// Look for the word break by moving towards lower indices.
backward,
}
/// [WordBreaker] exposes static methods to identify word boundaries.
abstract class WordBreaker {
/// Returns the index of the closest word boundary in [text] that comes after
/// [index].
///
/// When [index] is negative, or is not smaller than `text.length`, no search
/// takes place and [index] itself is returned.
static int nextBreakIndex(String text, int index) {
  return _findBreakIndex(_FindBreakDirection.forward, text, index);
}
/// Returns the index of the closest word boundary in [text] that comes before
/// [index].
///
/// When [index] is smaller than 1, or is greater than `text.length`, no
/// search takes place and [index] itself is returned.
static int prevBreakIndex(String text, int index) {
  return _findBreakIndex(_FindBreakDirection.backward, text, index);
}
/// Moves through [text] one code unit at a time, starting at [index] and
/// heading in [direction], and returns the first position for which
/// [_isBreak] reports a word boundary.
///
/// The starting position itself is never tested; the first candidate is one
/// step away from [index].
///
/// The walk is only started when [index] lies within `0..text.length - 1`
/// (forward) or `1..text.length` (backward). Otherwise [index] is returned
/// unchanged.
static int _findBreakIndex(
  _FindBreakDirection direction,
  String text,
  int index,
) {
  final bool movingForward = direction == _FindBreakDirection.forward;
  final int delta = movingForward ? 1 : -1;
  // Range of positions from which another step may still be taken.
  final int lowest = movingForward ? 0 : 1;
  final int highest = movingForward ? text.length - 1 : text.length;

  int position = index;
  while (position >= lowest && position <= highest) {
    position += delta;
    if (_isBreak(text, position)) {
      return position;
    }
  }
  return position;
}
/// Whether there is a word boundary in [text] between the code units at
/// positions `index - 1` and [index].
///
/// The rules are checked in the order in which they are listed in the Unicode
/// text segmentation report; the first rule that matches decides the result:
/// http://unicode.org/reports/tr29/#Word_Boundary_Rules
///
/// Rules WB3c, WB15 and WB16 are not implemented.
static bool _isBreak(String text, int index) {
  // Break at the start and end of text.
  // WB1: sot ÷ Any
  // WB2: Any ÷ eot
  if (index <= 0 || index >= text.length) {
    return true;
  }

  // Keep surrogate pairs together: never break right after a surrogate unit.
  // NOTE(review): the check matches both halves of a pair, so a position
  // right after a complete pair is reported as "no break" too -- confirm that
  // this is intended.
  final int unitBefore = text.codeUnitAt(index - 1);
  if (_isUtf16Surrogate(unitBefore)) {
    return false;
  }

  final WordCharProperty right = wordLookup.find(text, index);
  WordCharProperty left = wordLookup.find(text, index - 1);

  // Do not break within CRLF.
  // WB3: CR × LF
  if (left == WordCharProperty.CR && right == WordCharProperty.LF) {
    return false;
  }

  // Otherwise break before and after Newlines (including CR and LF).
  // WB3a: (Newline | CR | LF) ÷
  // WB3b: ÷ (Newline | CR | LF)
  if (_isLineTerminator(left) || _isLineTerminator(right)) {
    return true;
  }

  // WB3c: ZWJ × \p{Extended_Pictographic}
  // TODO(flutter_web): What's the right way to implement this?

  // Keep horizontal whitespace together.
  // WB3d: WSegSpace × WSegSpace
  if (left == WordCharProperty.WSegSpace &&
      right == WordCharProperty.WSegSpace) {
    return false;
  }

  // Ignore Format and Extend characters, except after sot, CR, LF, and
  // Newline.
  // WB4: X (Extend | Format | ZWJ)* → X
  if (_isIgnorable(right)) {
    // An Extend|Format|ZWJ character belongs to whatever precedes it, so the
    // text is not split in front of it.
    return false;
  }

  // The position is at the end of a (possibly empty) Extend|Format|ZWJ run.
  // Walk back over the run so that `left` becomes the property found in
  // front of it.
  int leftPosition = index - 1;
  while (_isIgnorable(left)) {
    leftPosition--;
    if (leftPosition < 0) {
      // The run extends all the way to the beginning of the text.
      return true;
    }
    left = wordLookup.find(text, leftPosition);
  }

  final bool leftIsLetter = _isAHLetter(left);
  final bool rightIsLetter = _isAHLetter(right);

  // Do not break between most letters.
  // WB5: (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
  if (leftIsLetter && rightIsLetter) {
    return false;
  }

  // The remaining rules need one more property on each side. In line with
  // WB4, Extend, Format and ZWJ are skipped while looking for them.

  // Property following `right`.
  int rightPosition = index + 1;
  WordCharProperty rightContext = wordLookup.find(text, rightPosition);
  while (_isIgnorable(rightContext)) {
    rightPosition++;
    rightContext = wordLookup.find(text, rightPosition);
  }

  // Property preceding `left`.
  int leftContextPosition = leftPosition - 1;
  WordCharProperty leftContext = wordLookup.find(text, leftContextPosition);
  while (_isIgnorable(leftContext)) {
    leftContextPosition--;
    leftContext = wordLookup.find(text, leftContextPosition);
  }

  // Do not break letters across certain punctuation.
  // WB6: (AHLetter) × (MidLetter | MidNumLet | Single_Quote) (AHLetter)
  if (leftIsLetter &&
      _isMidLetterOrQuote(right) &&
      _isAHLetter(rightContext)) {
    return false;
  }

  // WB7: (AHLetter) (MidLetter | MidNumLet | Single_Quote) × (AHLetter)
  if (_isAHLetter(leftContext) &&
      _isMidLetterOrQuote(left) &&
      rightIsLetter) {
    return false;
  }

  final bool leftIsHebrew = left == WordCharProperty.HebrewLetter;

  // WB7a: Hebrew_Letter × Single_Quote
  if (leftIsHebrew && right == WordCharProperty.SingleQuote) {
    return false;
  }

  // WB7b: Hebrew_Letter × Double_Quote Hebrew_Letter
  if (leftIsHebrew &&
      right == WordCharProperty.DoubleQuote &&
      rightContext == WordCharProperty.HebrewLetter) {
    return false;
  }

  // WB7c: Hebrew_Letter Double_Quote × Hebrew_Letter
  if (leftContext == WordCharProperty.HebrewLetter &&
      left == WordCharProperty.DoubleQuote &&
      right == WordCharProperty.HebrewLetter) {
    return false;
  }

  final bool leftIsNumeric = left == WordCharProperty.Numeric;
  final bool rightIsNumeric = right == WordCharProperty.Numeric;

  // Do not break within sequences of digits, or digits adjacent to letters
  // (“3a”, or “A3”).
  // WB8: Numeric × Numeric
  if (leftIsNumeric && rightIsNumeric) {
    return false;
  }

  // WB9: AHLetter × Numeric
  if (leftIsLetter && rightIsNumeric) {
    return false;
  }

  // WB10: Numeric × AHLetter
  if (leftIsNumeric && rightIsLetter) {
    return false;
  }

  // Do not break within sequences, such as “3.2” or “3,456.789”.
  // WB11: Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
  if (leftContext == WordCharProperty.Numeric &&
      _isMidNumOrQuote(left) &&
      rightIsNumeric) {
    return false;
  }

  // WB12: Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
  if (leftIsNumeric &&
      _isMidNumOrQuote(right) &&
      rightContext == WordCharProperty.Numeric) {
    return false;
  }

  // Do not break between Katakana.
  // WB13: Katakana × Katakana
  if (left == WordCharProperty.Katakana &&
      right == WordCharProperty.Katakana) {
    return false;
  }

  // Do not break from extenders.
  // WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
  if (right == WordCharProperty.ExtendNumLet &&
      _oneOf(
        left,
        WordCharProperty.ALetter,
        WordCharProperty.HebrewLetter,
        WordCharProperty.Numeric,
        WordCharProperty.Katakana,
        WordCharProperty.ExtendNumLet,
      )) {
    return false;
  }

  // WB13b: ExtendNumLet × (AHLetter | Numeric | Katakana)
  if (left == WordCharProperty.ExtendNumLet &&
      _oneOf(
        right,
        WordCharProperty.ALetter,
        WordCharProperty.HebrewLetter,
        WordCharProperty.Numeric,
        WordCharProperty.Katakana,
      )) {
    return false;
  }

  // Do not break within emoji flag sequences. That is, do not break between
  // regional indicator (RI) symbols if there is an odd number of RI
  // characters before the break point.
  // WB15: sot (RI RI)* RI × RI
  // TODO(mdebbar): implement this.
  // WB16: [^RI] (RI RI)* RI × RI
  // TODO(mdebbar): implement this.

  // Otherwise, break everywhere (including around ideographs).
  // WB999: Any ÷ Any
  return true;
}

/// Whether [property] is one of Newline, CR or LF.
static bool _isLineTerminator(WordCharProperty property) {
  return _oneOf(
    property,
    WordCharProperty.Newline,
    WordCharProperty.CR,
    WordCharProperty.LF,
  );
}

/// Whether [property] is one of Extend, Format or ZWJ, the properties that
/// rule WB4 skips over.
static bool _isIgnorable(WordCharProperty property) {
  return _oneOf(
    property,
    WordCharProperty.Extend,
    WordCharProperty.Format,
    WordCharProperty.ZWJ,
  );
}

/// Whether [property] is one of MidLetter, MidNumLet or SingleQuote.
static bool _isMidLetterOrQuote(WordCharProperty property) {
  return _oneOf(
    property,
    WordCharProperty.MidLetter,
    WordCharProperty.MidNumLet,
    WordCharProperty.SingleQuote,
  );
}

/// Whether [property] is one of MidNum, MidNumLet or SingleQuote.
static bool _isMidNumOrQuote(WordCharProperty property) {
  return _oneOf(
    property,
    WordCharProperty.MidNum,
    WordCharProperty.MidNumLet,
    WordCharProperty.SingleQuote,
  );
}
/// Whether the bits of [value] selected by the mask `0xF800` equal `0xD800`.
///
/// For a 16-bit code unit this is true exactly for the surrogate range
/// `0xD800` to `0xDFFF`, which covers both halves of a surrogate pair.
static bool _isUtf16Surrogate(int value) {
  // Only the five most significant bits of the 16-bit unit are compared.
  final int leadingBits = value & 0xF800;
  return leadingBits == 0xD800;
}
/// Whether [value] equals one of the given choices.
///
/// [choice1] and [choice2] are always compared against [value]. The optional
/// [choice3], [choice4] and [choice5] only take part in the comparison when
/// they are not null.
static bool _oneOf(
  WordCharProperty value,
  WordCharProperty choice1,
  WordCharProperty choice2, [
  WordCharProperty choice3,
  WordCharProperty choice4,
  WordCharProperty choice5,
]) {
  return value == choice1 ||
      value == choice2 ||
      (choice3 != null && value == choice3) ||
      (choice4 != null && value == choice4) ||
      (choice5 != null && value == choice5);
}
/// Whether [property] is ALetter or HebrewLetter, the pair that the word
/// boundary rules refer to as "AHLetter".
static bool _isAHLetter(WordCharProperty property) {
  return property == WordCharProperty.ALetter ||
      property == WordCharProperty.HebrewLetter;
}
}