1/*
2 * (C) 1999 Lars Knoll ([email protected])
3 * Copyright (C) 2004-2016 Apple Inc. All rights reserved.
4 * Copyright (C) 2007-2009 Torch Mobile, Inc.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
15 *
16 * You should have received a copy of the GNU Library General Public License
17 * along with this library; see the file COPYING.LIB. If not, write to
18 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 * Boston, MA 02110-1301, USA.
20 */
21
22#include "config.h"
23#include <wtf/text/TextBreakIterator.h>
24
25#include <wtf/text/LineBreakIteratorPoolICU.h>
26#include <wtf/text/TextBreakIteratorInternalICU.h>
27#include <wtf/text/icu/UTextProviderLatin1.h>
28#include <wtf/text/icu/UTextProviderUTF16.h>
29#include <atomic>
30#include <mutex>
31#include <unicode/ubrk.h>
32#include <wtf/text/StringBuilder.h>
33
34namespace WTF {
35
36#if !PLATFORM(MAC) && !PLATFORM(IOS_FAMILY)
37
38static Variant<TextBreakIteratorICU, TextBreakIteratorPlatform> mapModeToBackingIterator(StringView string, TextBreakIterator::Mode mode, const AtomString& locale)
39{
40 switch (mode) {
41 case TextBreakIterator::Mode::Line:
42 return TextBreakIteratorICU(string, TextBreakIteratorICU::Mode::Line, locale.string().utf8().data());
43 case TextBreakIterator::Mode::Caret:
44 return TextBreakIteratorICU(string, TextBreakIteratorICU::Mode::Character, locale.string().utf8().data());
45 case TextBreakIterator::Mode::Delete:
46 return TextBreakIteratorICU(string, TextBreakIteratorICU::Mode::Character, locale.string().utf8().data());
47 default:
48 ASSERT_NOT_REACHED();
49 return TextBreakIteratorICU(string, TextBreakIteratorICU::Mode::Character, locale.string().utf8().data());
50 }
51}
52
53TextBreakIterator::TextBreakIterator(StringView string, Mode mode, const AtomString& locale)
54 : m_backing(mapModeToBackingIterator(string, mode, locale))
55 , m_mode(mode)
56 , m_locale(locale)
57{
58}
59
60#endif
61
62// Iterator initialization
63
64static UBreakIterator* initializeIterator(UBreakIteratorType type, const char* locale = currentTextBreakLocaleID())
65{
66 UErrorCode openStatus = U_ZERO_ERROR;
67 UBreakIterator* iterator = ubrk_open(type, locale, 0, 0, &openStatus);
68 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
69 return iterator;
70}
71
72// Iterator text setting
73
74static UBreakIterator* setTextForIterator(UBreakIterator& iterator, StringView string)
75{
76 if (string.is8Bit()) {
77 UTextWithBuffer textLocal;
78 textLocal.text = UTEXT_INITIALIZER;
79 textLocal.text.extraSize = sizeof(textLocal.buffer);
80 textLocal.text.pExtra = textLocal.buffer;
81
82 UErrorCode openStatus = U_ZERO_ERROR;
83 UText* text = openLatin1UTextProvider(&textLocal, string.characters8(), string.length(), &openStatus);
84 if (U_FAILURE(openStatus)) {
85 LOG_ERROR("uTextOpenLatin1 failed with status %d", openStatus);
86 return nullptr;
87 }
88
89 UErrorCode setTextStatus = U_ZERO_ERROR;
90 ubrk_setUText(&iterator, text, &setTextStatus);
91 if (U_FAILURE(setTextStatus)) {
92 LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
93 return nullptr;
94 }
95
96 utext_close(text);
97 } else {
98 UErrorCode setTextStatus = U_ZERO_ERROR;
99 ubrk_setText(&iterator, string.characters16(), string.length(), &setTextStatus);
100 if (U_FAILURE(setTextStatus))
101 return nullptr;
102 }
103
104 return &iterator;
105}
106
107static UBreakIterator* setContextAwareTextForIterator(UBreakIterator& iterator, StringView string, const UChar* priorContext, unsigned priorContextLength)
108{
109 if (string.is8Bit()) {
110 UTextWithBuffer textLocal;
111 textLocal.text = UTEXT_INITIALIZER;
112 textLocal.text.extraSize = sizeof(textLocal.buffer);
113 textLocal.text.pExtra = textLocal.buffer;
114
115 UErrorCode openStatus = U_ZERO_ERROR;
116 UText* text = openLatin1ContextAwareUTextProvider(&textLocal, string.characters8(), string.length(), priorContext, priorContextLength, &openStatus);
117 if (U_FAILURE(openStatus)) {
118 LOG_ERROR("openLatin1ContextAwareUTextProvider failed with status %d", openStatus);
119 return nullptr;
120 }
121
122 UErrorCode setTextStatus = U_ZERO_ERROR;
123 ubrk_setUText(&iterator, text, &setTextStatus);
124 if (U_FAILURE(setTextStatus)) {
125 LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
126 return nullptr;
127 }
128
129 utext_close(text);
130 } else {
131 UText textLocal = UTEXT_INITIALIZER;
132
133 UErrorCode openStatus = U_ZERO_ERROR;
134 UText* text = openUTF16ContextAwareUTextProvider(&textLocal, string.characters16(), string.length(), priorContext, priorContextLength, &openStatus);
135 if (U_FAILURE(openStatus)) {
136 LOG_ERROR("openUTF16ContextAwareUTextProvider failed with status %d", openStatus);
137 return 0;
138 }
139
140 UErrorCode setTextStatus = U_ZERO_ERROR;
141 ubrk_setUText(&iterator, text, &setTextStatus);
142 if (U_FAILURE(setTextStatus)) {
143 LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
144 return nullptr;
145 }
146
147 utext_close(text);
148 }
149
150 return &iterator;
151}
152
153
154// Static iterators
155
156UBreakIterator* wordBreakIterator(StringView string)
157{
158 static UBreakIterator* staticWordBreakIterator = initializeIterator(UBRK_WORD);
159 if (!staticWordBreakIterator)
160 return nullptr;
161
162 return setTextForIterator(*staticWordBreakIterator, string);
163}
164
165UBreakIterator* sentenceBreakIterator(StringView string)
166{
167 static UBreakIterator* staticSentenceBreakIterator = initializeIterator(UBRK_SENTENCE);
168 if (!staticSentenceBreakIterator)
169 return nullptr;
170
171 return setTextForIterator(*staticSentenceBreakIterator, string);
172}
173
174UBreakIterator* acquireLineBreakIterator(StringView string, const AtomString& locale, const UChar* priorContext, unsigned priorContextLength, LineBreakIteratorMode mode)
175{
176 UBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale, mode);
177 if (!iterator)
178 return nullptr;
179
180 return setContextAwareTextForIterator(*iterator, string, priorContext, priorContextLength);
181}
182
183void releaseLineBreakIterator(UBreakIterator* iterator)
184{
185 ASSERT_ARG(iterator, iterator);
186
187 LineBreakIteratorPool::sharedPool().put(iterator);
188}
189
190UBreakIterator* openLineBreakIterator(const AtomString& locale)
191{
192 bool localeIsEmpty = locale.isEmpty();
193 UErrorCode openStatus = U_ZERO_ERROR;
194 UBreakIterator* ubrkIter = ubrk_open(UBRK_LINE, localeIsEmpty ? currentTextBreakLocaleID() : locale.string().utf8().data(), 0, 0, &openStatus);
195 // locale comes from a web page and it can be invalid, leading ICU
196 // to fail, in which case we fall back to the default locale.
197 if (!localeIsEmpty && U_FAILURE(openStatus)) {
198 openStatus = U_ZERO_ERROR;
199 ubrkIter = ubrk_open(UBRK_LINE, currentTextBreakLocaleID(), 0, 0, &openStatus);
200 }
201
202 if (U_FAILURE(openStatus)) {
203 LOG_ERROR("ubrk_open failed with status %d", openStatus);
204 return nullptr;
205 }
206
207 return ubrkIter;
208}
209
210void closeLineBreakIterator(UBreakIterator*& iterator)
211{
212 UBreakIterator* ubrkIter = iterator;
213 ASSERT(ubrkIter);
214 ubrk_close(ubrkIter);
215 iterator = nullptr;
216}
217
218static std::atomic<UBreakIterator*> nonSharedCharacterBreakIterator = ATOMIC_VAR_INIT(nullptr);
219
220static inline UBreakIterator* getNonSharedCharacterBreakIterator()
221{
222 if (auto *res = nonSharedCharacterBreakIterator.exchange(nullptr, std::memory_order_acquire))
223 return res;
224 return initializeIterator(UBRK_CHARACTER);
225}
226
227static inline void cacheNonSharedCharacterBreakIterator(UBreakIterator* cacheMe)
228{
229 if (auto *old = nonSharedCharacterBreakIterator.exchange(cacheMe, std::memory_order_release))
230 ubrk_close(old);
231}
232
233NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(StringView string)
234{
235 if ((m_iterator = getNonSharedCharacterBreakIterator()))
236 m_iterator = setTextForIterator(*m_iterator, string);
237}
238
239NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator()
240{
241 if (m_iterator)
242 cacheNonSharedCharacterBreakIterator(m_iterator);
243}
244
245NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(NonSharedCharacterBreakIterator&& other)
246 : m_iterator(nullptr)
247{
248 std::swap(m_iterator, other.m_iterator);
249}
250
// Iterator implementation.
252
253bool isWordTextBreak(UBreakIterator* iterator)
254{
255 int ruleStatus = ubrk_getRuleStatus(iterator);
256 return ruleStatus != UBRK_WORD_NONE;
257}
258
259unsigned numGraphemeClusters(StringView string)
260{
261 unsigned stringLength = string.length();
262
263 if (!stringLength)
264 return 0;
265
266 // The only Latin-1 Extended Grapheme Cluster is CRLF.
267 if (string.is8Bit()) {
268 auto* characters = string.characters8();
269 unsigned numCRLF = 0;
270 for (unsigned i = 1; i < stringLength; ++i)
271 numCRLF += characters[i - 1] == '\r' && characters[i] == '\n';
272 return stringLength - numCRLF;
273 }
274
275 NonSharedCharacterBreakIterator iterator { string };
276 if (!iterator) {
277 ASSERT_NOT_REACHED();
278 return stringLength;
279 }
280
281 unsigned numGraphemeClusters = 0;
282 while (ubrk_next(iterator) != UBRK_DONE)
283 ++numGraphemeClusters;
284 return numGraphemeClusters;
285}
286
287unsigned numCodeUnitsInGraphemeClusters(StringView string, unsigned numGraphemeClusters)
288{
289 unsigned stringLength = string.length();
290
291 if (stringLength <= numGraphemeClusters)
292 return stringLength;
293
294 // The only Latin-1 Extended Grapheme Cluster is CRLF.
295 if (string.is8Bit()) {
296 auto* characters = string.characters8();
297 unsigned i, j;
298 for (i = 0, j = 0; i < numGraphemeClusters && j + 1 < stringLength; ++i, ++j)
299 j += characters[j] == '\r' && characters[j + 1] == '\n';
300 return j + (i < numGraphemeClusters);
301 }
302
303 NonSharedCharacterBreakIterator iterator { string };
304 if (!iterator) {
305 ASSERT_NOT_REACHED();
306 return stringLength;
307 }
308
309 for (unsigned i = 0; i < numGraphemeClusters; ++i) {
310 if (ubrk_next(iterator) == UBRK_DONE)
311 return stringLength;
312 }
313 return ubrk_current(iterator);
314}
315
316} // namespace WTF
317