1 | /* |
2 | * (C) 1999 Lars Knoll ([email protected]) |
3 | * Copyright (C) 2004-2016 Apple Inc. All rights reserved. |
4 | * Copyright (C) 2007-2009 Torch Mobile, Inc. |
5 | * |
6 | * This library is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU Library General Public |
8 | * License as published by the Free Software Foundation; either |
9 | * version 2 of the License, or (at your option) any later version. |
10 | * |
11 | * This library is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | * Library General Public License for more details. |
15 | * |
16 | * You should have received a copy of the GNU Library General Public License |
17 | * along with this library; see the file COPYING.LIB. If not, write to |
18 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
19 | * Boston, MA 02110-1301, USA. |
20 | */ |
21 | |
22 | #include "config.h" |
23 | #include <wtf/text/TextBreakIterator.h> |
24 | |
25 | #include <wtf/text/LineBreakIteratorPoolICU.h> |
26 | #include <wtf/text/TextBreakIteratorInternalICU.h> |
27 | #include <wtf/text/icu/UTextProviderLatin1.h> |
28 | #include <wtf/text/icu/UTextProviderUTF16.h> |
29 | #include <atomic> |
30 | #include <mutex> |
31 | #include <unicode/ubrk.h> |
32 | #include <wtf/text/StringBuilder.h> |
33 | |
34 | namespace WTF { |
35 | |
36 | #if !PLATFORM(MAC) && !PLATFORM(IOS_FAMILY) |
37 | |
38 | static Variant<TextBreakIteratorICU, TextBreakIteratorPlatform> mapModeToBackingIterator(StringView string, TextBreakIterator::Mode mode, const AtomString& locale) |
39 | { |
40 | switch (mode) { |
41 | case TextBreakIterator::Mode::Line: |
42 | return TextBreakIteratorICU(string, TextBreakIteratorICU::Mode::Line, locale.string().utf8().data()); |
43 | case TextBreakIterator::Mode::Caret: |
44 | return TextBreakIteratorICU(string, TextBreakIteratorICU::Mode::Character, locale.string().utf8().data()); |
45 | case TextBreakIterator::Mode::Delete: |
46 | return TextBreakIteratorICU(string, TextBreakIteratorICU::Mode::Character, locale.string().utf8().data()); |
47 | default: |
48 | ASSERT_NOT_REACHED(); |
49 | return TextBreakIteratorICU(string, TextBreakIteratorICU::Mode::Character, locale.string().utf8().data()); |
50 | } |
51 | } |
52 | |
53 | TextBreakIterator::TextBreakIterator(StringView string, Mode mode, const AtomString& locale) |
54 | : m_backing(mapModeToBackingIterator(string, mode, locale)) |
55 | , m_mode(mode) |
56 | , m_locale(locale) |
57 | { |
58 | } |
59 | |
60 | #endif |
61 | |
62 | // Iterator initialization |
63 | |
64 | static UBreakIterator* initializeIterator(UBreakIteratorType type, const char* locale = currentTextBreakLocaleID()) |
65 | { |
66 | UErrorCode openStatus = U_ZERO_ERROR; |
67 | UBreakIterator* iterator = ubrk_open(type, locale, 0, 0, &openStatus); |
68 | ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)" , u_errorName(openStatus), openStatus); |
69 | return iterator; |
70 | } |
71 | |
72 | // Iterator text setting |
73 | |
74 | static UBreakIterator* setTextForIterator(UBreakIterator& iterator, StringView string) |
75 | { |
76 | if (string.is8Bit()) { |
77 | UTextWithBuffer textLocal; |
78 | textLocal.text = UTEXT_INITIALIZER; |
79 | textLocal.text.extraSize = sizeof(textLocal.buffer); |
80 | textLocal.text.pExtra = textLocal.buffer; |
81 | |
82 | UErrorCode openStatus = U_ZERO_ERROR; |
83 | UText* text = openLatin1UTextProvider(&textLocal, string.characters8(), string.length(), &openStatus); |
84 | if (U_FAILURE(openStatus)) { |
85 | LOG_ERROR("uTextOpenLatin1 failed with status %d" , openStatus); |
86 | return nullptr; |
87 | } |
88 | |
89 | UErrorCode setTextStatus = U_ZERO_ERROR; |
90 | ubrk_setUText(&iterator, text, &setTextStatus); |
91 | if (U_FAILURE(setTextStatus)) { |
92 | LOG_ERROR("ubrk_setUText failed with status %d" , setTextStatus); |
93 | return nullptr; |
94 | } |
95 | |
96 | utext_close(text); |
97 | } else { |
98 | UErrorCode setTextStatus = U_ZERO_ERROR; |
99 | ubrk_setText(&iterator, string.characters16(), string.length(), &setTextStatus); |
100 | if (U_FAILURE(setTextStatus)) |
101 | return nullptr; |
102 | } |
103 | |
104 | return &iterator; |
105 | } |
106 | |
107 | static UBreakIterator* setContextAwareTextForIterator(UBreakIterator& iterator, StringView string, const UChar* priorContext, unsigned priorContextLength) |
108 | { |
109 | if (string.is8Bit()) { |
110 | UTextWithBuffer textLocal; |
111 | textLocal.text = UTEXT_INITIALIZER; |
112 | textLocal.text.extraSize = sizeof(textLocal.buffer); |
113 | textLocal.text.pExtra = textLocal.buffer; |
114 | |
115 | UErrorCode openStatus = U_ZERO_ERROR; |
116 | UText* text = openLatin1ContextAwareUTextProvider(&textLocal, string.characters8(), string.length(), priorContext, priorContextLength, &openStatus); |
117 | if (U_FAILURE(openStatus)) { |
118 | LOG_ERROR("openLatin1ContextAwareUTextProvider failed with status %d" , openStatus); |
119 | return nullptr; |
120 | } |
121 | |
122 | UErrorCode setTextStatus = U_ZERO_ERROR; |
123 | ubrk_setUText(&iterator, text, &setTextStatus); |
124 | if (U_FAILURE(setTextStatus)) { |
125 | LOG_ERROR("ubrk_setUText failed with status %d" , setTextStatus); |
126 | return nullptr; |
127 | } |
128 | |
129 | utext_close(text); |
130 | } else { |
131 | UText textLocal = UTEXT_INITIALIZER; |
132 | |
133 | UErrorCode openStatus = U_ZERO_ERROR; |
134 | UText* text = openUTF16ContextAwareUTextProvider(&textLocal, string.characters16(), string.length(), priorContext, priorContextLength, &openStatus); |
135 | if (U_FAILURE(openStatus)) { |
136 | LOG_ERROR("openUTF16ContextAwareUTextProvider failed with status %d" , openStatus); |
137 | return 0; |
138 | } |
139 | |
140 | UErrorCode setTextStatus = U_ZERO_ERROR; |
141 | ubrk_setUText(&iterator, text, &setTextStatus); |
142 | if (U_FAILURE(setTextStatus)) { |
143 | LOG_ERROR("ubrk_setUText failed with status %d" , setTextStatus); |
144 | return nullptr; |
145 | } |
146 | |
147 | utext_close(text); |
148 | } |
149 | |
150 | return &iterator; |
151 | } |
152 | |
153 | |
154 | // Static iterators |
155 | |
156 | UBreakIterator* wordBreakIterator(StringView string) |
157 | { |
158 | static UBreakIterator* staticWordBreakIterator = initializeIterator(UBRK_WORD); |
159 | if (!staticWordBreakIterator) |
160 | return nullptr; |
161 | |
162 | return setTextForIterator(*staticWordBreakIterator, string); |
163 | } |
164 | |
165 | UBreakIterator* sentenceBreakIterator(StringView string) |
166 | { |
167 | static UBreakIterator* staticSentenceBreakIterator = initializeIterator(UBRK_SENTENCE); |
168 | if (!staticSentenceBreakIterator) |
169 | return nullptr; |
170 | |
171 | return setTextForIterator(*staticSentenceBreakIterator, string); |
172 | } |
173 | |
174 | UBreakIterator* acquireLineBreakIterator(StringView string, const AtomString& locale, const UChar* priorContext, unsigned priorContextLength, LineBreakIteratorMode mode) |
175 | { |
176 | UBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale, mode); |
177 | if (!iterator) |
178 | return nullptr; |
179 | |
180 | return setContextAwareTextForIterator(*iterator, string, priorContext, priorContextLength); |
181 | } |
182 | |
183 | void releaseLineBreakIterator(UBreakIterator* iterator) |
184 | { |
185 | ASSERT_ARG(iterator, iterator); |
186 | |
187 | LineBreakIteratorPool::sharedPool().put(iterator); |
188 | } |
189 | |
190 | UBreakIterator* openLineBreakIterator(const AtomString& locale) |
191 | { |
192 | bool localeIsEmpty = locale.isEmpty(); |
193 | UErrorCode openStatus = U_ZERO_ERROR; |
194 | UBreakIterator* ubrkIter = ubrk_open(UBRK_LINE, localeIsEmpty ? currentTextBreakLocaleID() : locale.string().utf8().data(), 0, 0, &openStatus); |
195 | // locale comes from a web page and it can be invalid, leading ICU |
196 | // to fail, in which case we fall back to the default locale. |
197 | if (!localeIsEmpty && U_FAILURE(openStatus)) { |
198 | openStatus = U_ZERO_ERROR; |
199 | ubrkIter = ubrk_open(UBRK_LINE, currentTextBreakLocaleID(), 0, 0, &openStatus); |
200 | } |
201 | |
202 | if (U_FAILURE(openStatus)) { |
203 | LOG_ERROR("ubrk_open failed with status %d" , openStatus); |
204 | return nullptr; |
205 | } |
206 | |
207 | return ubrkIter; |
208 | } |
209 | |
210 | void closeLineBreakIterator(UBreakIterator*& iterator) |
211 | { |
212 | UBreakIterator* ubrkIter = iterator; |
213 | ASSERT(ubrkIter); |
214 | ubrk_close(ubrkIter); |
215 | iterator = nullptr; |
216 | } |
217 | |
218 | static std::atomic<UBreakIterator*> nonSharedCharacterBreakIterator = ATOMIC_VAR_INIT(nullptr); |
219 | |
220 | static inline UBreakIterator* getNonSharedCharacterBreakIterator() |
221 | { |
222 | if (auto *res = nonSharedCharacterBreakIterator.exchange(nullptr, std::memory_order_acquire)) |
223 | return res; |
224 | return initializeIterator(UBRK_CHARACTER); |
225 | } |
226 | |
227 | static inline void cacheNonSharedCharacterBreakIterator(UBreakIterator* cacheMe) |
228 | { |
229 | if (auto *old = nonSharedCharacterBreakIterator.exchange(cacheMe, std::memory_order_release)) |
230 | ubrk_close(old); |
231 | } |
232 | |
233 | NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(StringView string) |
234 | { |
235 | if ((m_iterator = getNonSharedCharacterBreakIterator())) |
236 | m_iterator = setTextForIterator(*m_iterator, string); |
237 | } |
238 | |
239 | NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator() |
240 | { |
241 | if (m_iterator) |
242 | cacheNonSharedCharacterBreakIterator(m_iterator); |
243 | } |
244 | |
245 | NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(NonSharedCharacterBreakIterator&& other) |
246 | : m_iterator(nullptr) |
247 | { |
248 | std::swap(m_iterator, other.m_iterator); |
249 | } |
250 | |
// Iterator implementation.
252 | |
253 | bool isWordTextBreak(UBreakIterator* iterator) |
254 | { |
255 | int ruleStatus = ubrk_getRuleStatus(iterator); |
256 | return ruleStatus != UBRK_WORD_NONE; |
257 | } |
258 | |
259 | unsigned numGraphemeClusters(StringView string) |
260 | { |
261 | unsigned stringLength = string.length(); |
262 | |
263 | if (!stringLength) |
264 | return 0; |
265 | |
266 | // The only Latin-1 Extended Grapheme Cluster is CRLF. |
267 | if (string.is8Bit()) { |
268 | auto* characters = string.characters8(); |
269 | unsigned numCRLF = 0; |
270 | for (unsigned i = 1; i < stringLength; ++i) |
271 | numCRLF += characters[i - 1] == '\r' && characters[i] == '\n'; |
272 | return stringLength - numCRLF; |
273 | } |
274 | |
275 | NonSharedCharacterBreakIterator iterator { string }; |
276 | if (!iterator) { |
277 | ASSERT_NOT_REACHED(); |
278 | return stringLength; |
279 | } |
280 | |
281 | unsigned numGraphemeClusters = 0; |
282 | while (ubrk_next(iterator) != UBRK_DONE) |
283 | ++numGraphemeClusters; |
284 | return numGraphemeClusters; |
285 | } |
286 | |
287 | unsigned numCodeUnitsInGraphemeClusters(StringView string, unsigned numGraphemeClusters) |
288 | { |
289 | unsigned stringLength = string.length(); |
290 | |
291 | if (stringLength <= numGraphemeClusters) |
292 | return stringLength; |
293 | |
294 | // The only Latin-1 Extended Grapheme Cluster is CRLF. |
295 | if (string.is8Bit()) { |
296 | auto* characters = string.characters8(); |
297 | unsigned i, j; |
298 | for (i = 0, j = 0; i < numGraphemeClusters && j + 1 < stringLength; ++i, ++j) |
299 | j += characters[j] == '\r' && characters[j + 1] == '\n'; |
300 | return j + (i < numGraphemeClusters); |
301 | } |
302 | |
303 | NonSharedCharacterBreakIterator iterator { string }; |
304 | if (!iterator) { |
305 | ASSERT_NOT_REACHED(); |
306 | return stringLength; |
307 | } |
308 | |
309 | for (unsigned i = 0; i < numGraphemeClusters; ++i) { |
310 | if (ubrk_next(iterator) == UBRK_DONE) |
311 | return stringLength; |
312 | } |
313 | return ubrk_current(iterator); |
314 | } |
315 | |
316 | } // namespace WTF |
317 | |