1/*
2 * Copyright (C) 2006 Lars Knoll <[email protected]>
3 * Copyright (C) 2007-2016 Apple Inc. All rights reserved.
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 *
20 */
21
22#pragma once
23
24#include <wtf/NeverDestroyed.h>
25#include <wtf/Variant.h>
26#include <wtf/text/StringView.h>
27#include <wtf/text/icu/TextBreakIteratorICU.h>
28
29#if PLATFORM(COCOA)
30#include <wtf/text/cf/TextBreakIteratorCF.h>
31#else
32#include <wtf/text/NullTextBreakIterator.h>
33#endif
34
35namespace WTF {
36
37#if PLATFORM(COCOA)
38typedef TextBreakIteratorCF TextBreakIteratorPlatform;
39#else
40typedef NullTextBreakIterator TextBreakIteratorPlatform;
41#endif
42
43class TextBreakIteratorCache;
44
45class TextBreakIterator {
46 WTF_MAKE_FAST_ALLOCATED;
47public:
48 enum class Mode {
49 Line,
50 Caret,
51 Delete
52 };
53
54 TextBreakIterator() = delete;
55 TextBreakIterator(const TextBreakIterator&) = delete;
56 TextBreakIterator(TextBreakIterator&&) = default;
57 TextBreakIterator& operator=(const TextBreakIterator&) = delete;
58 TextBreakIterator& operator=(TextBreakIterator&&) = default;
59
60 Optional<unsigned> preceding(unsigned location) const
61 {
62 return switchOn(m_backing, [&](const auto& iterator) {
63 return iterator.preceding(location);
64 });
65 }
66
67 Optional<unsigned> following(unsigned location) const
68 {
69 return switchOn(m_backing, [&](const auto& iterator) {
70 return iterator.following(location);
71 });
72 }
73
74 bool isBoundary(unsigned location) const
75 {
76 return switchOn(m_backing, [&](const auto& iterator) {
77 return iterator.isBoundary(location);
78 });
79 }
80
81private:
82 friend class TextBreakIteratorCache;
83
84 // Use CachedTextBreakIterator instead of constructing one of these directly.
85 WTF_EXPORT TextBreakIterator(StringView, Mode, const AtomString& locale);
86
87 void setText(StringView string)
88 {
89 return switchOn(m_backing, [&](auto& iterator) {
90 return iterator.setText(string);
91 });
92 }
93
94 Mode mode() const
95 {
96 return m_mode;
97 }
98
99 const AtomString& locale() const
100 {
101 return m_locale;
102 }
103
104 Variant<TextBreakIteratorICU, TextBreakIteratorPlatform> m_backing;
105 Mode m_mode;
106 AtomString m_locale;
107};
108
109class CachedTextBreakIterator;
110
111class TextBreakIteratorCache {
112 WTF_MAKE_FAST_ALLOCATED;
113// Use CachedTextBreakIterator instead of dealing with the cache directly.
114private:
115 friend class NeverDestroyed<TextBreakIteratorCache>;
116 friend class CachedTextBreakIterator;
117
118 static TextBreakIteratorCache& singleton()
119 {
120 static NeverDestroyed<TextBreakIteratorCache> cache;
121 return cache.get();
122 }
123
124 TextBreakIteratorCache(const TextBreakIteratorCache&) = delete;
125 TextBreakIteratorCache(TextBreakIteratorCache&&) = delete;
126 TextBreakIteratorCache& operator=(const TextBreakIteratorCache&) = delete;
127 TextBreakIteratorCache& operator=(TextBreakIteratorCache&&) = delete;
128
129 TextBreakIterator take(StringView string, TextBreakIterator::Mode mode, const AtomString& locale)
130 {
131 auto iter = std::find_if(m_unused.begin(), m_unused.end(), [&](TextBreakIterator& candidate) {
132 return candidate.mode() == mode && candidate.locale() == locale;
133 });
134 if (iter == m_unused.end())
135 return TextBreakIterator(string, mode, locale);
136 auto result = WTFMove(*iter);
137 m_unused.remove(iter - m_unused.begin());
138 result.setText(string);
139 return result;
140
141 }
142
143 void put(TextBreakIterator&& iterator)
144 {
145 m_unused.append(WTFMove(iterator));
146 if (m_unused.size() > capacity)
147 m_unused.remove(0);
148 }
149
150 TextBreakIteratorCache()
151 {
152 }
153
154 static constexpr int capacity = 2;
155 // FIXME: Break this up into different Vectors per mode.
156 Vector<TextBreakIterator, capacity> m_unused;
157};
158
159// RAII for TextBreakIterator and TextBreakIteratorCache.
160class CachedTextBreakIterator {
161 WTF_MAKE_FAST_ALLOCATED;
162public:
163 CachedTextBreakIterator(StringView string, TextBreakIterator::Mode mode, const AtomString& locale)
164 : m_backing(TextBreakIteratorCache::singleton().take(string, mode, locale))
165 {
166 }
167
168 ~CachedTextBreakIterator()
169 {
170 TextBreakIteratorCache::singleton().put(WTFMove(m_backing));
171 }
172
173 CachedTextBreakIterator() = delete;
174 CachedTextBreakIterator(const CachedTextBreakIterator&) = delete;
175 CachedTextBreakIterator(CachedTextBreakIterator&&) = default;
176 CachedTextBreakIterator& operator=(const CachedTextBreakIterator&) = delete;
177 CachedTextBreakIterator& operator=(CachedTextBreakIterator&&) = default;
178
179 Optional<unsigned> preceding(unsigned location) const
180 {
181 return m_backing.preceding(location);
182 }
183
184 Optional<unsigned> following(unsigned location) const
185 {
186 return m_backing.following(location);
187 }
188
189 bool isBoundary(unsigned location) const
190 {
191 return m_backing.isBoundary(location);
192 }
193
194private:
195 TextBreakIterator m_backing;
196};
197
198// Note: The returned iterator is good only until you get another iterator, with the exception of acquireLineBreakIterator.
199
// Mode selector passed to acquireLineBreakIterator() and stored by LazyLineBreakIterator.
enum class LineBreakIteratorMode {
    Default,
    Loose,
    Normal,
    Strict
};
201
202WTF_EXPORT_PRIVATE UBreakIterator* wordBreakIterator(StringView);
203WTF_EXPORT_PRIVATE UBreakIterator* sentenceBreakIterator(StringView);
204
205WTF_EXPORT_PRIVATE UBreakIterator* acquireLineBreakIterator(StringView, const AtomString& locale, const UChar* priorContext, unsigned priorContextLength, LineBreakIteratorMode);
206WTF_EXPORT_PRIVATE void releaseLineBreakIterator(UBreakIterator*);
207UBreakIterator* openLineBreakIterator(const AtomString& locale);
208void closeLineBreakIterator(UBreakIterator*&);
209
210WTF_EXPORT_PRIVATE bool isWordTextBreak(UBreakIterator*);
211
212class LazyLineBreakIterator {
213 WTF_MAKE_FAST_ALLOCATED;
214public:
215 LazyLineBreakIterator()
216 {
217 resetPriorContext();
218 }
219
220 explicit LazyLineBreakIterator(StringView stringView, const AtomString& locale = AtomString(), LineBreakIteratorMode mode = LineBreakIteratorMode::Default)
221 : m_stringView(stringView)
222 , m_locale(locale)
223 , m_mode(mode)
224 {
225 resetPriorContext();
226 }
227
228 ~LazyLineBreakIterator()
229 {
230 if (m_iterator)
231 releaseLineBreakIterator(m_iterator);
232 }
233
234 StringView stringView() const { return m_stringView; }
235 LineBreakIteratorMode mode() const { return m_mode; }
236
237 UChar lastCharacter() const
238 {
239 static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2, "UBreakIterator unexpected prior context length");
240 return m_priorContext[1];
241 }
242
243 UChar secondToLastCharacter() const
244 {
245 static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2, "UBreakIterator unexpected prior context length");
246 return m_priorContext[0];
247 }
248
249 void setPriorContext(UChar last, UChar secondToLast)
250 {
251 static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2, "UBreakIterator unexpected prior context length");
252 m_priorContext[0] = secondToLast;
253 m_priorContext[1] = last;
254 }
255
256 void updatePriorContext(UChar last)
257 {
258 static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2, "UBreakIterator unexpected prior context length");
259 m_priorContext[0] = m_priorContext[1];
260 m_priorContext[1] = last;
261 }
262
263 void resetPriorContext()
264 {
265 static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2, "UBreakIterator unexpected prior context length");
266 m_priorContext[0] = 0;
267 m_priorContext[1] = 0;
268 }
269
270 unsigned priorContextLength() const
271 {
272 unsigned priorContextLength = 0;
273 static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2, "UBreakIterator unexpected prior context length");
274 if (m_priorContext[1]) {
275 ++priorContextLength;
276 if (m_priorContext[0])
277 ++priorContextLength;
278 }
279 return priorContextLength;
280 }
281
282 // Obtain text break iterator, possibly previously cached, where this iterator is (or has been)
283 // initialized to use the previously stored string as the primary breaking context and using
284 // previously stored prior context if non-empty.
285 UBreakIterator* get(unsigned priorContextLength)
286 {
287 ASSERT(priorContextLength <= priorContextCapacity);
288 const UChar* priorContext = priorContextLength ? &m_priorContext[priorContextCapacity - priorContextLength] : 0;
289 if (!m_iterator) {
290 m_iterator = acquireLineBreakIterator(m_stringView, m_locale, priorContext, priorContextLength, m_mode);
291 m_cachedPriorContext = priorContext;
292 m_cachedPriorContextLength = priorContextLength;
293 } else if (priorContext != m_cachedPriorContext || priorContextLength != m_cachedPriorContextLength) {
294 resetStringAndReleaseIterator(m_stringView, m_locale, m_mode);
295 return this->get(priorContextLength);
296 }
297 return m_iterator;
298 }
299
300 void resetStringAndReleaseIterator(StringView stringView, const AtomString& locale, LineBreakIteratorMode mode)
301 {
302 if (m_iterator)
303 releaseLineBreakIterator(m_iterator);
304 m_stringView = stringView;
305 m_locale = locale;
306 m_iterator = nullptr;
307 m_cachedPriorContext = nullptr;
308 m_mode = mode;
309 m_cachedPriorContextLength = 0;
310 }
311
312private:
313 static constexpr unsigned priorContextCapacity = 2;
314 StringView m_stringView;
315 AtomString m_locale;
316 UBreakIterator* m_iterator { nullptr };
317 const UChar* m_cachedPriorContext { nullptr };
318 LineBreakIteratorMode m_mode { LineBreakIteratorMode::Default };
319 unsigned m_cachedPriorContextLength { 0 };
320 UChar m_priorContext[priorContextCapacity];
321};
322
323// Iterates over "extended grapheme clusters", as defined in UAX #29.
324// Note that platform implementations may be less sophisticated - e.g. ICU prior to
325// version 4.0 only supports "legacy grapheme clusters".
326// Use this for general text processing, e.g. string truncation.
327
328class NonSharedCharacterBreakIterator {
329 WTF_MAKE_FAST_ALLOCATED;
330 WTF_MAKE_NONCOPYABLE(NonSharedCharacterBreakIterator);
331public:
332 WTF_EXPORT_PRIVATE NonSharedCharacterBreakIterator(StringView);
333 WTF_EXPORT_PRIVATE ~NonSharedCharacterBreakIterator();
334
335 NonSharedCharacterBreakIterator(NonSharedCharacterBreakIterator&&);
336
337 operator UBreakIterator*() const { return m_iterator; }
338
339private:
340 UBreakIterator* m_iterator;
341};
342
343// Counts the number of grapheme clusters. A surrogate pair or a sequence
344// of a non-combining character and following combining characters is
345// counted as 1 grapheme cluster.
346WTF_EXPORT_PRIVATE unsigned numGraphemeClusters(StringView);
347
348// Returns the number of code units that create the specified number of
349// grapheme clusters. If there are fewer clusters in the string than specified,
350// the length of the string is returned.
351WTF_EXPORT_PRIVATE unsigned numCodeUnitsInGraphemeClusters(StringView, unsigned);
352
353}
354
355using WTF::CachedTextBreakIterator;
356using WTF::LazyLineBreakIterator;
357using WTF::LineBreakIteratorMode;
358using WTF::NonSharedCharacterBreakIterator;
359using WTF::TextBreakIterator;
360using WTF::TextBreakIteratorCache;
361using WTF::isWordTextBreak;
362