blob: 637c35e20f956fccac2c60826dff950341c86762 [file] [log] [blame]
/*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* An implementation of Liang's hyphenation algorithm.
*/
#ifndef U_USING_ICU_NAMESPACE
#define U_USING_ICU_NAMESPACE 0
#endif // U_USING_ICU_NAMESPACE
#include <memory>
#include <unordered_map>
#include <vector>
#include "unicode/locid.h"
#ifndef MINIKIN_HYPHENATOR_H
#define MINIKIN_HYPHENATOR_H
namespace minikin {
// What to do at a potential hyphenation point inside a word.
//
// Note: code elsewhere implicitly assumes that DONT_BREAK is 0, so the
// numeric values below must not be reshuffled.
enum class HyphenationType : uint8_t {
  // No line break is allowed here.
  DONT_BREAK = 0,

  // Break here and add a normal hyphen.
  BREAK_AND_INSERT_HYPHEN = 1,

  // Break here and add an Armenian hyphen (U+058A).
  BREAK_AND_INSERT_ARMENIAN_HYPHEN = 2,

  // Break here and add a maqaf (Hebrew hyphen, U+05BE).
  BREAK_AND_INSERT_MAQAF = 3,

  // Break here and add a Canadian Syllabics hyphen (U+1400).
  BREAK_AND_INSERT_UCAS_HYPHEN = 4,

  // Break here without adding a hyphen. Used when a hyphen is already
  // present, or when the script does not use a hyphen (e.g. in Malayalam).
  BREAK_AND_DONT_INSERT_HYPHEN = 5,

  // Break here and replace the last code unit with a hyphen. Used for
  // Catalan "l·l", which hyphenates as "l-/l".
  BREAK_AND_REPLACE_WITH_HYPHEN = 6,

  // Break here and repeat the hyphen (which is the last character) at the
  // beginning of the next line. Used in Polish, where "czerwono-niebieska"
  // should hyphenate as "czerwono-/-niebieska".
  BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE = 7,

  // Break here, adding a ZWJ and a hyphen to the first line and a ZWJ to the
  // second line. This is used in Arabic script, mostly for writing systems of
  // Central Asia. It is the default behavior when a soft hyphen is used in
  // Arabic script.
  BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8,
};
// The hyphen edit represents an edit to the string when a word is
// hyphenated. The most common hyphen edit is adding a "-" at the end
// of a syllable, but nonstandard hyphenation allows for more choices.
// Note that a HyphenEdit can hold two types of edits at the same time:
// one at the beginning of the string/line and one at the end.
class HyphenEdit {
 public:
  // Edits applied at the end of a line. These values live in the low three
  // bits of the packed value (see MASK_END_OF_LINE).
  //
  // The constants are constexpr rather than plain "static const": from C++17
  // on that makes them implicitly inline variables, so binding one to a
  // const reference (e.g. when passing it to a function taking const T&) does
  // not require a separate out-of-class definition in order to link.
  static constexpr uint32_t NO_EDIT = 0x00;
  static constexpr uint32_t INSERT_HYPHEN_AT_END = 0x01;
  static constexpr uint32_t INSERT_ARMENIAN_HYPHEN_AT_END = 0x02;
  static constexpr uint32_t INSERT_MAQAF_AT_END = 0x03;
  static constexpr uint32_t INSERT_UCAS_HYPHEN_AT_END = 0x04;
  static constexpr uint32_t INSERT_ZWJ_AND_HYPHEN_AT_END = 0x05;
  static constexpr uint32_t REPLACE_WITH_HYPHEN_AT_END = 0x06;
  static constexpr uint32_t BREAK_AT_END = 0x07;

  // Edits applied at the start of a line. These values are shifted left by
  // three bits so that they do not overlap the end-of-line field (see
  // MASK_START_OF_LINE).
  static constexpr uint32_t INSERT_HYPHEN_AT_START = 0x01 << 3;
  static constexpr uint32_t INSERT_ZWJ_AT_START = 0x02 << 3;
  static constexpr uint32_t BREAK_AT_START = 0x03 << 3;

  // Keep in sync with the definitions in the Java code at:
  // frameworks/base/graphics/java/android/graphics/Paint.java
  static constexpr uint32_t MASK_END_OF_LINE = 0x07;
  static constexpr uint32_t MASK_START_OF_LINE = 0x03 << 3;

  // Returns true only if hyph is exactly REPLACE_WITH_HYPHEN_AT_END.
  static constexpr bool isReplacement(uint32_t hyph) {
    return hyph == REPLACE_WITH_HYPHEN_AT_END;
  }

  // Returns true if hyph is exactly one of the INSERT_* constants, whether an
  // end-of-line or a start-of-line one. NO_EDIT, REPLACE_WITH_HYPHEN_AT_END,
  // BREAK_AT_END and BREAK_AT_START all yield false.
  // NOTE(review): the comparison is by equality, so a value that combines an
  // end edit with a start edit matches nothing; presumably callers pass the
  // result of getEnd() or getStart() -- confirm against the call sites.
  static constexpr bool isInsertion(uint32_t hyph) {
    return (hyph == INSERT_HYPHEN_AT_END ||
            hyph == INSERT_ARMENIAN_HYPHEN_AT_END ||
            hyph == INSERT_MAQAF_AT_END ||
            hyph == INSERT_UCAS_HYPHEN_AT_END ||
            hyph == INSERT_ZWJ_AND_HYPHEN_AT_END ||
            hyph == INSERT_HYPHEN_AT_START ||
            hyph == INSERT_ZWJ_AT_START);
  }

  // The three functions below are defined out of line.
  // NOTE(review): judging by the names, getHyphenString presumably yields the
  // code points for a given edit, and editForThisLine / editForNextLine
  // presumably map a HyphenationType to the edit for the line before / after
  // the break -- confirm in the .cpp file.
  static const uint32_t* getHyphenString(uint32_t hyph);
  static uint32_t editForThisLine(HyphenationType type);
  static uint32_t editForNextLine(HyphenationType type);

  // A default-constructed edit holds NO_EDIT.
  HyphenEdit() : hyphen(NO_EDIT) {}

  // Wraps a packed edit value. Intentionally implicit so that a raw uint32_t
  // converts to a HyphenEdit.
  HyphenEdit(uint32_t hyphenInt) : hyphen(hyphenInt) {}  // NOLINT(implicit)

  // Returns the packed value with both fields intact.
  uint32_t getHyphen() const { return hyphen; }

  // Two edits are equal when their packed values are equal.
  bool operator==(const HyphenEdit& other) const {
    return hyphen == other.hyphen;
  }

  // Returns only the end-of-line field (bits selected by MASK_END_OF_LINE).
  uint32_t getEnd() const { return hyphen & MASK_END_OF_LINE; }

  // Returns only the start-of-line field, still in its shifted position, so
  // it compares directly against the *_AT_START constants.
  uint32_t getStart() const { return hyphen & MASK_START_OF_LINE; }

 private:
  // Packed edit: end-of-line field in the low three bits, start-of-line field
  // in the two bits above them.
  uint32_t hyphen;
};
// hyb file header; implementation details are in the .cpp file.
// Only forward declared here: this header never needs the layout, it only
// hands out a pointer to it (Hyphenator::getHeader() reinterprets the
// pattern data pointer as a Header).
struct Header;
// Hyphenator for a single set of binary pattern data, created through
// loadBinary().
class Hyphenator {
 public:
  // Compute the hyphenation of a word, storing the hyphenation in the result
  // vector. Each entry in the vector is a "hyphenation type" for a potential
  // hyphenation that can be applied at the corresponding code unit offset in
  // the word.
  //
  // Example: word is "hyphen", result is the following, corresponding to
  // "hy-phen": [DONT_BREAK, DONT_BREAK, BREAK_AND_INSERT_HYPHEN, DONT_BREAK,
  // DONT_BREAK, DONT_BREAK]
  //
  // word points at len UTF-16 code units.
  void hyphenate(std::vector<HyphenationType>* result,
                 const uint16_t* word,
                 size_t len,
                 const icu::Locale& locale);

  // Returns true if the codepoint is like U+2010 HYPHEN in line breaking and
  // usage: a character immediately after which line breaks are allowed, but
  // words containing it should not be automatically hyphenated.
  static bool isLineBreakingHyphen(uint32_t cp);

  // Creates a hyphenator over pattern data in the binary format described in
  // doc/hyb_file_format.md.
  // Note: the caller is responsible for ensuring that the lifetime of the
  // pattern data is at least as long as the Hyphenator object.
  // Note: nullptr is valid input, in which case the hyphenator only processes
  // soft hyphens.
  // NOTE(review): a raw pointer is returned; presumably the caller owns the
  // returned object -- confirm against the definition in the .cpp file.
  static Hyphenator* loadBinary(const uint8_t* patternData,
                                size_t minPrefix,
                                size_t minSuffix);

 private:
  // Apply various hyphenation rules including hard and soft hyphens, ignoring
  // patterns.
  void hyphenateWithNoPatterns(HyphenationType* result,
                               const uint16_t* word,
                               size_t len,
                               const icu::Locale& locale);

  // Try looking up word in the alphabet table; return DONT_BREAK if any code
  // units fail to map. Otherwise, returns BREAK_AND_INSERT_HYPHEN,
  // BREAK_AND_INSERT_ARMENIAN_HYPHEN, or BREAK_AND_DONT_INSERT_HYPHEN based on
  // the script of the characters seen. Note that this method writes len+2
  // entries into alpha_codes (including start and stop), so the buffer must
  // have room for them.
  HyphenationType alphabetLookup(uint16_t* alpha_codes,
                                 const uint16_t* word,
                                 size_t len);

  // Calculate hyphenation from patterns, assuming alphabet lookup has already
  // been done.
  void hyphenateFromCodes(HyphenationType* result,
                          const uint16_t* codes,
                          size_t len,
                          HyphenationType hyphenValue);

  // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is
  // used so that temporary buffers can be stack-allocated without waste, which
  // is a slightly different use case. It measures UTF-16 code units.
  // constexpr (implicitly inline from C++17 on) so that odr-using it does not
  // depend on a separate out-of-class definition.
  static constexpr size_t MAX_HYPHENATED_SIZE = 64;

  // Default member initializers give a default-initialized Hyphenator
  // determinate values, so getHeader() never reinterprets an indeterminate
  // pointer.
  const uint8_t* patternData = nullptr;
  size_t minPrefix = 0;
  size_t minSuffix = 0;

  // Accessor for the binary data: views the pattern data pointer as a Header.
  // Performs no null check, so the result is null when patternData is null.
  const Header* getHeader() const {
    return reinterpret_cast<const Header*>(patternData);
  }
};
} // namespace minikin
#endif // MINIKIN_HYPHENATOR_H