1/*
2 * Copyright (C) 2006 Lars Knoll <[email protected]>
3 * Copyright (C) 2007-2016 Apple Inc. All rights reserved.
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 *
20 */
21
22#pragma once
23
24#include <wtf/NeverDestroyed.h>
25#include <wtf/Variant.h>
26#include <wtf/text/StringView.h>
27#include <wtf/text/icu/TextBreakIteratorICU.h>
28
29#if PLATFORM(MAC) || PLATFORM(IOS_FAMILY)
30#include <wtf/text/cf/TextBreakIteratorCF.h>
31#else
32#include <wtf/text/NullTextBreakIterator.h>
33#endif
34
35namespace WTF {
36
37#if PLATFORM(MAC) || PLATFORM(IOS_FAMILY)
38typedef TextBreakIteratorCF TextBreakIteratorPlatform;
39#else
40typedef NullTextBreakIterator TextBreakIteratorPlatform;
41#endif
42
43class TextBreakIteratorCache;
44
45class TextBreakIterator {
46public:
47 enum class Mode {
48 Line,
49 Caret,
50 Delete
51 };
52
53 TextBreakIterator() = delete;
54 TextBreakIterator(const TextBreakIterator&) = delete;
55 TextBreakIterator(TextBreakIterator&&) = default;
56 TextBreakIterator& operator=(const TextBreakIterator&) = delete;
57 TextBreakIterator& operator=(TextBreakIterator&&) = default;
58
59 Optional<unsigned> preceding(unsigned location) const
60 {
61 return switchOn(m_backing, [&](const auto& iterator) {
62 return iterator.preceding(location);
63 });
64 }
65
66 Optional<unsigned> following(unsigned location) const
67 {
68 return switchOn(m_backing, [&](const auto& iterator) {
69 return iterator.following(location);
70 });
71 }
72
73 bool isBoundary(unsigned location) const
74 {
75 return switchOn(m_backing, [&](const auto& iterator) {
76 return iterator.isBoundary(location);
77 });
78 }
79
80private:
81 friend class TextBreakIteratorCache;
82
83 // Use CachedTextBreakIterator instead of constructing one of these directly.
84 WTF_EXPORT TextBreakIterator(StringView, Mode, const AtomString& locale);
85
86 void setText(StringView string)
87 {
88 return switchOn(m_backing, [&](auto& iterator) {
89 return iterator.setText(string);
90 });
91 }
92
93 Mode mode() const
94 {
95 return m_mode;
96 }
97
98 const AtomString& locale() const
99 {
100 return m_locale;
101 }
102
103 Variant<TextBreakIteratorICU, TextBreakIteratorPlatform> m_backing;
104 Mode m_mode;
105 AtomString m_locale;
106};
107
108class CachedTextBreakIterator;
109
110class TextBreakIteratorCache {
111// Use CachedTextBreakIterator instead of dealing with the cache directly.
112private:
113 friend class NeverDestroyed<TextBreakIteratorCache>;
114 friend class CachedTextBreakIterator;
115
116 static TextBreakIteratorCache& singleton()
117 {
118 static NeverDestroyed<TextBreakIteratorCache> cache;
119 return cache.get();
120 }
121
122 TextBreakIteratorCache(const TextBreakIteratorCache&) = delete;
123 TextBreakIteratorCache(TextBreakIteratorCache&&) = delete;
124 TextBreakIteratorCache& operator=(const TextBreakIteratorCache&) = delete;
125 TextBreakIteratorCache& operator=(TextBreakIteratorCache&&) = delete;
126
127 TextBreakIterator take(StringView string, TextBreakIterator::Mode mode, const AtomString& locale)
128 {
129 auto iter = std::find_if(m_unused.begin(), m_unused.end(), [&](TextBreakIterator& candidate) {
130 return candidate.mode() == mode && candidate.locale() == locale;
131 });
132 if (iter == m_unused.end())
133 return TextBreakIterator(string, mode, locale);
134 auto result = WTFMove(*iter);
135 m_unused.remove(iter - m_unused.begin());
136 result.setText(string);
137 return result;
138
139 }
140
141 void put(TextBreakIterator&& iterator)
142 {
143 m_unused.append(WTFMove(iterator));
144 if (m_unused.size() > capacity)
145 m_unused.remove(0);
146 }
147
148 TextBreakIteratorCache()
149 {
150 }
151
152 static constexpr int capacity = 2;
153 // FIXME: Break this up into different Vectors per mode.
154 Vector<TextBreakIterator, capacity> m_unused;
155};
156
157// RAII for TextBreakIterator and TextBreakIteratorCache.
158class CachedTextBreakIterator {
159public:
160 CachedTextBreakIterator(StringView string, TextBreakIterator::Mode mode, const AtomString& locale)
161 : m_backing(TextBreakIteratorCache::singleton().take(string, mode, locale))
162 {
163 }
164
165 ~CachedTextBreakIterator()
166 {
167 TextBreakIteratorCache::singleton().put(WTFMove(m_backing));
168 }
169
170 CachedTextBreakIterator() = delete;
171 CachedTextBreakIterator(const CachedTextBreakIterator&) = delete;
172 CachedTextBreakIterator(CachedTextBreakIterator&&) = default;
173 CachedTextBreakIterator& operator=(const CachedTextBreakIterator&) = delete;
174 CachedTextBreakIterator& operator=(CachedTextBreakIterator&&) = default;
175
176 Optional<unsigned> preceding(unsigned location) const
177 {
178 return m_backing.preceding(location);
179 }
180
181 Optional<unsigned> following(unsigned location) const
182 {
183 return m_backing.following(location);
184 }
185
186 bool isBoundary(unsigned location) const
187 {
188 return m_backing.isBoundary(location);
189 }
190
191private:
192 TextBreakIterator m_backing;
193};
194
195// Note: The returned iterator is good only until you get another iterator, with the exception of acquireLineBreakIterator.
196
// Mode selector passed to acquireLineBreakIterator() and stored by LazyLineBreakIterator.
// NOTE(review): the enumerators presumably mirror the CSS line-break values
// (auto / loose / normal / strict) — confirm against the implementation.
enum class LineBreakIteratorMode { Default, Loose, Normal, Strict };
198
// Word and sentence iterators over the given text. The returned pointer is
// only good until another iterator is requested.
WTF_EXPORT_PRIVATE UBreakIterator* wordBreakIterator(StringView);
WTF_EXPORT_PRIVATE UBreakIterator* sentenceBreakIterator(StringView);

// Line-break iterator for the given text, locale, prior context (pointer plus
// length) and mode. Unlike the iterators above, it is not invalidated by
// fetching another iterator; give it back with releaseLineBreakIterator(),
// as LazyLineBreakIterator does.
WTF_EXPORT_PRIVATE UBreakIterator* acquireLineBreakIterator(StringView, const AtomString& locale, const UChar* priorContext, unsigned priorContextLength, LineBreakIteratorMode);
WTF_EXPORT_PRIVATE void releaseLineBreakIterator(UBreakIterator*);
// Not exported from WTF.
// NOTE(review): presumably the lower-level open/close pair behind acquire/release;
// close takes the pointer by reference, presumably so it can null it — confirm.
UBreakIterator* openLineBreakIterator(const AtomString& locale);
void closeLineBreakIterator(UBreakIterator*&);

// NOTE(review): presumably reports whether the break the iterator currently
// sits on ends a word (as opposed to spaces/punctuation) — confirm in the .cpp.
WTF_EXPORT_PRIVATE bool isWordTextBreak(UBreakIterator*);
208
209class LazyLineBreakIterator {
210public:
211 LazyLineBreakIterator()
212 {
213 resetPriorContext();
214 }
215
216 explicit LazyLineBreakIterator(StringView stringView, const AtomString& locale = AtomString(), LineBreakIteratorMode mode = LineBreakIteratorMode::Default)
217 : m_stringView(stringView)
218 , m_locale(locale)
219 , m_mode(mode)
220 {
221 resetPriorContext();
222 }
223
224 ~LazyLineBreakIterator()
225 {
226 if (m_iterator)
227 releaseLineBreakIterator(m_iterator);
228 }
229
230 StringView stringView() const { return m_stringView; }
231 LineBreakIteratorMode mode() const { return m_mode; }
232
233 UChar lastCharacter() const
234 {
235 static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2, "UBreakIterator unexpected prior context length");
236 return m_priorContext[1];
237 }
238
239 UChar secondToLastCharacter() const
240 {
241 static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2, "UBreakIterator unexpected prior context length");
242 return m_priorContext[0];
243 }
244
245 void setPriorContext(UChar last, UChar secondToLast)
246 {
247 static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2, "UBreakIterator unexpected prior context length");
248 m_priorContext[0] = secondToLast;
249 m_priorContext[1] = last;
250 }
251
252 void updatePriorContext(UChar last)
253 {
254 static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2, "UBreakIterator unexpected prior context length");
255 m_priorContext[0] = m_priorContext[1];
256 m_priorContext[1] = last;
257 }
258
259 void resetPriorContext()
260 {
261 static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2, "UBreakIterator unexpected prior context length");
262 m_priorContext[0] = 0;
263 m_priorContext[1] = 0;
264 }
265
266 unsigned priorContextLength() const
267 {
268 unsigned priorContextLength = 0;
269 static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2, "UBreakIterator unexpected prior context length");
270 if (m_priorContext[1]) {
271 ++priorContextLength;
272 if (m_priorContext[0])
273 ++priorContextLength;
274 }
275 return priorContextLength;
276 }
277
278 // Obtain text break iterator, possibly previously cached, where this iterator is (or has been)
279 // initialized to use the previously stored string as the primary breaking context and using
280 // previously stored prior context if non-empty.
281 UBreakIterator* get(unsigned priorContextLength)
282 {
283 ASSERT(priorContextLength <= priorContextCapacity);
284 const UChar* priorContext = priorContextLength ? &m_priorContext[priorContextCapacity - priorContextLength] : 0;
285 if (!m_iterator) {
286 m_iterator = acquireLineBreakIterator(m_stringView, m_locale, priorContext, priorContextLength, m_mode);
287 m_cachedPriorContext = priorContext;
288 m_cachedPriorContextLength = priorContextLength;
289 } else if (priorContext != m_cachedPriorContext || priorContextLength != m_cachedPriorContextLength) {
290 resetStringAndReleaseIterator(m_stringView, m_locale, m_mode);
291 return this->get(priorContextLength);
292 }
293 return m_iterator;
294 }
295
296 void resetStringAndReleaseIterator(StringView stringView, const AtomString& locale, LineBreakIteratorMode mode)
297 {
298 if (m_iterator)
299 releaseLineBreakIterator(m_iterator);
300 m_stringView = stringView;
301 m_locale = locale;
302 m_iterator = nullptr;
303 m_cachedPriorContext = nullptr;
304 m_mode = mode;
305 m_cachedPriorContextLength = 0;
306 }
307
308private:
309 static constexpr unsigned priorContextCapacity = 2;
310 StringView m_stringView;
311 AtomString m_locale;
312 UBreakIterator* m_iterator { nullptr };
313 const UChar* m_cachedPriorContext { nullptr };
314 LineBreakIteratorMode m_mode { LineBreakIteratorMode::Default };
315 unsigned m_cachedPriorContextLength { 0 };
316 UChar m_priorContext[priorContextCapacity];
317};
318
319// Iterates over "extended grapheme clusters", as defined in UAX #29.
320// Note that platform implementations may be less sophisticated - e.g. ICU prior to
321// version 4.0 only supports "legacy grapheme clusters".
322// Use this for general text processing, e.g. string truncation.
323
// Wrapper around a raw UBreakIterator* that cannot be copied but can be moved.
class NonSharedCharacterBreakIterator {
    WTF_MAKE_NONCOPYABLE(NonSharedCharacterBreakIterator);
public:
    // Defined out of line and exported from WTF.
    // NOTE(review): presumably the constructor obtains m_iterator for the given
    // text and the destructor gives it back — confirm in the .cpp.
    WTF_EXPORT_PRIVATE NonSharedCharacterBreakIterator(StringView);
    WTF_EXPORT_PRIVATE ~NonSharedCharacterBreakIterator();

    // Move constructor; declared here, defined elsewhere, and not exported.
    NonSharedCharacterBreakIterator(NonSharedCharacterBreakIterator&&);

    // Implicit conversion so the wrapper can be passed wherever a raw
    // UBreakIterator* is expected.
    operator UBreakIterator*() const { return m_iterator; }

private:
    // No default member initializer: every constructor has to set this.
    UBreakIterator* m_iterator;
};
337
// Counts the number of grapheme clusters in the given string. A surrogate pair
// or a sequence of a non-combining character and following combining
// characters is counted as 1 grapheme cluster.
WTF_EXPORT_PRIVATE unsigned numGraphemeClusters(StringView);

// Returns the number of code units that create the specified number of
// grapheme clusters (the second argument). If there are fewer clusters in the
// string than specified, the length of the string is returned.
WTF_EXPORT_PRIVATE unsigned numCodeUnitsInGraphemeClusters(StringView, unsigned);
347
348}
349
// Make the commonly used names from this header available without the WTF:: prefix.
using WTF::CachedTextBreakIterator;
using WTF::LazyLineBreakIterator;
using WTF::LineBreakIteratorMode;
using WTF::NonSharedCharacterBreakIterator;
using WTF::TextBreakIterator;
using WTF::TextBreakIteratorCache;
using WTF::isWordTextBreak;
357