1/*
2 * Copyright (C) 1999 Lars Knoll ([email protected])
3 * (C) 1999 Antti Koivisto ([email protected])
4 * (C) 2001 Dirk Mueller ( [email protected] )
5 * Copyright (C) 2003-2018 Apple Inc. All rights reserved.
6 * Copyright (C) 2006 Andrew Wellington ([email protected])
7 *
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Library General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Library General Public License for more details.
17 *
18 * You should have received a copy of the GNU Library General Public License
19 * along with this library; see the file COPYING.LIB. If not, write to
20 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 * Boston, MA 02110-1301, USA.
22 *
23 */
24
25#include "config.h"
26#include <wtf/text/StringImpl.h>
27
28#include <wtf/ProcessID.h>
29#include <wtf/StdLibExtras.h>
30#include <wtf/text/AtomString.h>
31#include <wtf/text/CString.h>
32#include <wtf/text/ExternalStringImpl.h>
33#include <wtf/text/StringBuffer.h>
34#include <wtf/text/StringHash.h>
35#include <wtf/text/StringView.h>
36#include <wtf/text/SymbolImpl.h>
37#include <wtf/text/SymbolRegistry.h>
38#include <wtf/unicode/CharacterNames.h>
39#include <wtf/unicode/UTF8Conversion.h>
40
41#if STRING_STATS
42#include <unistd.h>
43#include <wtf/DataLog.h>
44#endif
45
46namespace WTF {
47
48using namespace Unicode;
49
50static_assert(sizeof(StringImpl) == 2 * sizeof(int) + 2 * sizeof(void*), "StringImpl should stay small");
51
52#if STRING_STATS
53StringStats StringImpl::m_stringStats;
54
55std::atomic<unsigned> StringStats::s_stringRemovesTillPrintStats(s_printStringStatsFrequency);
56
57void StringStats::removeString(StringImpl& string)
58{
59 unsigned length = string.length();
60 bool isSubString = string.isSubString();
61
62 --m_totalNumberStrings;
63
64 if (string.is8Bit()) {
65 --m_number8BitStrings;
66 if (!isSubString)
67 m_total8BitData -= length;
68 } else {
69 --m_number16BitStrings;
70 if (!isSubString)
71 m_total16BitData -= length;
72 }
73
74 if (!--s_stringRemovesTillPrintStats) {
75 s_stringRemovesTillPrintStats = s_printStringStatsFrequency;
76 printStats();
77 }
78}
79
80void StringStats::printStats()
81{
82 dataLogF("String stats for process id %d:\n", getCurrentProcessID());
83
84 unsigned long long totalNumberCharacters = m_total8BitData + m_total16BitData;
85 double percent8Bit = m_totalNumberStrings ? ((double)m_number8BitStrings * 100) / (double)m_totalNumberStrings : 0.0;
86 double average8bitLength = m_number8BitStrings ? (double)m_total8BitData / (double)m_number8BitStrings : 0.0;
87 dataLogF("%8u (%5.2f%%) 8 bit %12llu chars %12llu bytes avg length %6.1f\n", m_number8BitStrings.load(), percent8Bit, m_total8BitData.load(), m_total8BitData.load(), average8bitLength);
88
89 double percent16Bit = m_totalNumberStrings ? ((double)m_number16BitStrings * 100) / (double)m_totalNumberStrings : 0.0;
90 double average16bitLength = m_number16BitStrings ? (double)m_total16BitData / (double)m_number16BitStrings : 0.0;
91 dataLogF("%8u (%5.2f%%) 16 bit %12llu chars %12llu bytes avg length %6.1f\n", m_number16BitStrings.load(), percent16Bit, m_total16BitData.load(), m_total16BitData * 2, average16bitLength);
92
93 double averageLength = m_totalNumberStrings ? (double)totalNumberCharacters / (double)m_totalNumberStrings : 0.0;
94 unsigned long long totalDataBytes = m_total8BitData + m_total16BitData * 2;
95 dataLogF("%8u Total %12llu chars %12llu bytes avg length %6.1f\n", m_totalNumberStrings.load(), totalNumberCharacters, totalDataBytes, averageLength);
96 unsigned long long totalSavedBytes = m_total8BitData;
97 double percentSavings = totalSavedBytes ? ((double)totalSavedBytes * 100) / (double)(totalDataBytes + totalSavedBytes) : 0.0;
98 dataLogF(" Total savings %12llu bytes (%5.2f%%)\n", totalSavedBytes, percentSavings);
99
100 dataLogF("%8u StringImpl::ref calls\n", m_refCalls.load());
101 dataLogF("%8u StringImpl::deref calls\n", m_derefCalls.load());
102}
103#endif
104
105StringImpl::StaticStringImpl StringImpl::s_emptyAtomString("", StringImpl::StringAtom);
106
107StringImpl::~StringImpl()
108{
109 ASSERT(!isStatic());
110
111 StringView::invalidate(*this);
112
113 STRING_STATS_REMOVE_STRING(*this);
114
115 if (isAtom()) {
116 ASSERT(!isSymbol());
117 if (length())
118 AtomStringImpl::remove(static_cast<AtomStringImpl*>(this));
119 } else if (isSymbol()) {
120 auto& symbol = static_cast<SymbolImpl&>(*this);
121 auto* symbolRegistry = symbol.symbolRegistry();
122 if (symbolRegistry)
123 symbolRegistry->remove(*symbol.asRegisteredSymbolImpl());
124 }
125
126 BufferOwnership ownership = bufferOwnership();
127
128 if (ownership == BufferInternal)
129 return;
130 if (ownership == BufferOwned) {
131 // We use m_data8, but since it is a union with m_data16 this works either way.
132 ASSERT(m_data8);
133 fastFree(const_cast<LChar*>(m_data8));
134 return;
135 }
136 if (ownership == BufferExternal) {
137 auto* external = static_cast<ExternalStringImpl*>(this);
138 external->freeExternalBuffer(const_cast<LChar*>(m_data8), sizeInBytes());
139 external->m_free.~ExternalStringImplFreeFunction();
140 return;
141 }
142
143 ASSERT(ownership == BufferSubstring);
144 ASSERT(substringBuffer());
145 substringBuffer()->deref();
146}
147
148void StringImpl::destroy(StringImpl* stringImpl)
149{
150 stringImpl->~StringImpl();
151 fastFree(stringImpl);
152}
153
154Ref<StringImpl> StringImpl::createFromLiteral(const char* characters, unsigned length)
155{
156 ASSERT_WITH_MESSAGE(length, "Use StringImpl::empty() to create an empty string");
157 ASSERT(charactersAreAllASCII<LChar>(reinterpret_cast<const LChar*>(characters), length));
158 return adoptRef(*new StringImpl(reinterpret_cast<const LChar*>(characters), length, ConstructWithoutCopying));
159}
160
161Ref<StringImpl> StringImpl::createFromLiteral(const char* characters)
162{
163 return createFromLiteral(characters, strlen(characters));
164}
165
166Ref<StringImpl> StringImpl::createWithoutCopying(const UChar* characters, unsigned length)
167{
168 if (!length)
169 return *empty();
170 return adoptRef(*new StringImpl(characters, length, ConstructWithoutCopying));
171}
172
173Ref<StringImpl> StringImpl::createWithoutCopying(const LChar* characters, unsigned length)
174{
175 if (!length)
176 return *empty();
177 return adoptRef(*new StringImpl(characters, length, ConstructWithoutCopying));
178}
179
180template<typename CharacterType> inline Ref<StringImpl> StringImpl::createUninitializedInternal(unsigned length, CharacterType*& data)
181{
182 if (!length) {
183 data = 0;
184 return *empty();
185 }
186 return createUninitializedInternalNonEmpty(length, data);
187}
188
189template<typename CharacterType> inline Ref<StringImpl> StringImpl::createUninitializedInternalNonEmpty(unsigned length, CharacterType*& data)
190{
191 ASSERT(length);
192
193 // Allocate a single buffer large enough to contain the StringImpl
194 // struct as well as the data which it contains. This removes one
195 // heap allocation from this call.
196 if (length > maxInternalLength<CharacterType>())
197 CRASH();
198 StringImpl* string = static_cast<StringImpl*>(fastMalloc(allocationSize<CharacterType>(length)));
199
200 data = string->tailPointer<CharacterType>();
201 return constructInternal<CharacterType>(*string, length);
202}
203
204Ref<StringImpl> StringImpl::createUninitialized(unsigned length, LChar*& data)
205{
206 return createUninitializedInternal(length, data);
207}
208
209Ref<StringImpl> StringImpl::createUninitialized(unsigned length, UChar*& data)
210{
211 return createUninitializedInternal(length, data);
212}
213
214template<typename CharacterType> inline Expected<Ref<StringImpl>, UTF8ConversionError> StringImpl::reallocateInternal(Ref<StringImpl>&& originalString, unsigned length, CharacterType*& data)
215{
216 ASSERT(originalString->hasOneRef());
217 ASSERT(originalString->bufferOwnership() == BufferInternal);
218
219 if (!length) {
220 data = 0;
221 return Ref<StringImpl>(*empty());
222 }
223
224 // Same as createUninitialized() except here we use fastRealloc.
225 if (length > maxInternalLength<CharacterType>())
226 return makeUnexpected(UTF8ConversionError::OutOfMemory);
227
228 originalString->~StringImpl();
229 StringImpl* string;
230 if (!tryFastRealloc(&originalString.leakRef(), allocationSize<CharacterType>(length)).getValue(string))
231 return makeUnexpected(UTF8ConversionError::OutOfMemory);
232
233 data = string->tailPointer<CharacterType>();
234 return constructInternal<CharacterType>(*string, length);
235}
236
237Ref<StringImpl> StringImpl::reallocate(Ref<StringImpl>&& originalString, unsigned length, LChar*& data)
238{
239 auto expectedStringImpl = tryReallocate(WTFMove(originalString), length, data);
240 RELEASE_ASSERT(expectedStringImpl);
241 return WTFMove(expectedStringImpl.value());
242}
243
244Ref<StringImpl> StringImpl::reallocate(Ref<StringImpl>&& originalString, unsigned length, UChar*& data)
245{
246 auto expectedStringImpl = tryReallocate(WTFMove(originalString), length, data);
247 RELEASE_ASSERT(expectedStringImpl);
248 return WTFMove(expectedStringImpl.value());
249}
250
251Expected<Ref<StringImpl>, UTF8ConversionError> StringImpl::tryReallocate(Ref<StringImpl>&& originalString, unsigned length, LChar*& data)
252{
253 ASSERT(originalString->is8Bit());
254 return reallocateInternal(WTFMove(originalString), length, data);
255}
256
257Expected<Ref<StringImpl>, UTF8ConversionError> StringImpl::tryReallocate(Ref<StringImpl>&& originalString, unsigned length, UChar*& data)
258{
259 ASSERT(!originalString->is8Bit());
260 return reallocateInternal(WTFMove(originalString), length, data);
261}
262
263template<typename CharacterType> inline Ref<StringImpl> StringImpl::createInternal(const CharacterType* characters, unsigned length)
264{
265 if (!characters || !length)
266 return *empty();
267 CharacterType* data;
268 auto string = createUninitializedInternalNonEmpty(length, data);
269 copyCharacters(data, characters, length);
270 return string;
271}
272
273Ref<StringImpl> StringImpl::create(const UChar* characters, unsigned length)
274{
275 return createInternal(characters, length);
276}
277
278Ref<StringImpl> StringImpl::create(const LChar* characters, unsigned length)
279{
280 return createInternal(characters, length);
281}
282
283Ref<StringImpl> StringImpl::create8BitIfPossible(const UChar* characters, unsigned length)
284{
285 if (!characters || !length)
286 return *empty();
287
288 LChar* data;
289 auto string = createUninitializedInternalNonEmpty(length, data);
290
291 for (size_t i = 0; i < length; ++i) {
292 if (!isLatin1(characters[i]))
293 return create(characters, length);
294 data[i] = static_cast<LChar>(characters[i]);
295 }
296
297 return string;
298}
299
300Ref<StringImpl> StringImpl::create8BitIfPossible(const UChar* string)
301{
302 return StringImpl::create8BitIfPossible(string, lengthOfNullTerminatedString(string));
303}
304
305Ref<StringImpl> StringImpl::create(const LChar* string)
306{
307 if (!string)
308 return *empty();
309 size_t length = strlen(reinterpret_cast<const char*>(string));
310 if (length > MaxLength)
311 CRASH();
312 return create(string, length);
313}
314
315Ref<StringImpl> StringImpl::substring(unsigned start, unsigned length)
316{
317 if (start >= m_length)
318 return *empty();
319 unsigned maxLength = m_length - start;
320 if (length >= maxLength) {
321 if (!start)
322 return *this;
323 length = maxLength;
324 }
325 if (is8Bit())
326 return create(m_data8 + start, length);
327
328 return create(m_data16 + start, length);
329}
330
331UChar32 StringImpl::characterStartingAt(unsigned i)
332{
333 if (is8Bit())
334 return m_data8[i];
335 if (U16_IS_SINGLE(m_data16[i]))
336 return m_data16[i];
337 if (i + 1 < m_length && U16_IS_LEAD(m_data16[i]) && U16_IS_TRAIL(m_data16[i + 1]))
338 return U16_GET_SUPPLEMENTARY(m_data16[i], m_data16[i + 1]);
339 return 0;
340}
341
342Ref<StringImpl> StringImpl::convertToLowercaseWithoutLocale()
343{
344 // Note: At one time this was a hot function in the Dromaeo benchmark, specifically the
345 // no-op code path that may return ourself if we find no upper case letters and no invalid
346 // ASCII letters.
347
348 // First scan the string for uppercase and non-ASCII characters:
349 if (is8Bit()) {
350 for (unsigned i = 0; i < m_length; ++i) {
351 LChar character = m_data8[i];
352 if (UNLIKELY((character & ~0x7F) || isASCIIUpper(character)))
353 return convertToLowercaseWithoutLocaleStartingAtFailingIndex8Bit(i);
354 }
355
356 return *this;
357 }
358
359 bool noUpper = true;
360 unsigned ored = 0;
361
362 for (unsigned i = 0; i < m_length; ++i) {
363 UChar character = m_data16[i];
364 if (UNLIKELY(isASCIIUpper(character)))
365 noUpper = false;
366 ored |= character;
367 }
368 // Nothing to do if the string is all ASCII with no uppercase.
369 if (noUpper && !(ored & ~0x7F))
370 return *this;
371
372 if (!(ored & ~0x7F)) {
373 UChar* data16;
374 auto newImpl = createUninitializedInternalNonEmpty(m_length, data16);
375 for (unsigned i = 0; i < m_length; ++i)
376 data16[i] = toASCIILower(m_data16[i]);
377 return newImpl;
378 }
379
380 if (m_length > MaxLength)
381 CRASH();
382 int32_t length = m_length;
383
384 // Do a slower implementation for cases that include non-ASCII characters.
385 UChar* data16;
386 auto newImpl = createUninitializedInternalNonEmpty(m_length, data16);
387
388 UErrorCode status = U_ZERO_ERROR;
389 int32_t realLength = u_strToLower(data16, length, m_data16, m_length, "", &status);
390 if (U_SUCCESS(status) && realLength == length)
391 return newImpl;
392
393 newImpl = createUninitialized(realLength, data16);
394 status = U_ZERO_ERROR;
395 u_strToLower(data16, realLength, m_data16, m_length, "", &status);
396 if (U_FAILURE(status))
397 return *this;
398 return newImpl;
399}
400
401Ref<StringImpl> StringImpl::convertToLowercaseWithoutLocaleStartingAtFailingIndex8Bit(unsigned failingIndex)
402{
403 ASSERT(is8Bit());
404 LChar* data8;
405 auto newImpl = createUninitializedInternalNonEmpty(m_length, data8);
406
407 for (unsigned i = 0; i < failingIndex; ++i) {
408 ASSERT(!(m_data8[i] & ~0x7F) && !isASCIIUpper(m_data8[i]));
409 data8[i] = m_data8[i];
410 }
411
412 for (unsigned i = failingIndex; i < m_length; ++i) {
413 LChar character = m_data8[i];
414 if (!(character & ~0x7F))
415 data8[i] = toASCIILower(character);
416 else {
417 ASSERT(isLatin1(u_tolower(character)));
418 data8[i] = static_cast<LChar>(u_tolower(character));
419 }
420 }
421
422 return newImpl;
423}
424
425Ref<StringImpl> StringImpl::convertToUppercaseWithoutLocale()
426{
427 // This function could be optimized for no-op cases the way
428 // convertToLowercaseWithoutLocale() is, but in empirical testing,
429 // few actual calls to upper() are no-ops, so it wouldn't be worth
430 // the extra time for pre-scanning.
431
432 if (m_length > MaxLength)
433 CRASH();
434 int32_t length = m_length;
435
436 if (is8Bit()) {
437 LChar* data8;
438 auto newImpl = createUninitialized(m_length, data8);
439
440 // Do a faster loop for the case where all the characters are ASCII.
441 unsigned ored = 0;
442 for (int i = 0; i < length; ++i) {
443 LChar character = m_data8[i];
444 ored |= character;
445 data8[i] = toASCIIUpper(character);
446 }
447 if (!(ored & ~0x7F))
448 return newImpl;
449
450 // Do a slower implementation for cases that include non-ASCII Latin-1 characters.
451 int numberSharpSCharacters = 0;
452
453 // There are two special cases.
454 // 1. Some Latin-1 characters when converted to upper case are 16 bit characters.
455 // 2. Lower case sharp-S converts to "SS" (two characters)
456 for (int32_t i = 0; i < length; ++i) {
457 LChar character = m_data8[i];
458 if (UNLIKELY(character == smallLetterSharpS))
459 ++numberSharpSCharacters;
460 ASSERT(u_toupper(character) <= 0xFFFF);
461 UChar upper = u_toupper(character);
462 if (UNLIKELY(!isLatin1(upper))) {
463 // Since this upper-cased character does not fit in an 8-bit string, we need to take the 16-bit path.
464 goto upconvert;
465 }
466 data8[i] = static_cast<LChar>(upper);
467 }
468
469 if (!numberSharpSCharacters)
470 return newImpl;
471
472 // We have numberSSCharacters sharp-s characters, but none of the other special characters.
473 newImpl = createUninitialized(m_length + numberSharpSCharacters, data8);
474
475 LChar* dest = data8;
476
477 for (int32_t i = 0; i < length; ++i) {
478 LChar character = m_data8[i];
479 if (character == smallLetterSharpS) {
480 *dest++ = 'S';
481 *dest++ = 'S';
482 } else {
483 ASSERT(isLatin1(u_toupper(character)));
484 *dest++ = static_cast<LChar>(u_toupper(character));
485 }
486 }
487
488 return newImpl;
489 }
490
491upconvert:
492 auto upconvertedCharacters = StringView(*this).upconvertedCharacters();
493 const UChar* source16 = upconvertedCharacters;
494
495 UChar* data16;
496 auto newImpl = createUninitialized(m_length, data16);
497
498 // Do a faster loop for the case where all the characters are ASCII.
499 unsigned ored = 0;
500 for (int i = 0; i < length; ++i) {
501 UChar character = source16[i];
502 ored |= character;
503 data16[i] = toASCIIUpper(character);
504 }
505 if (!(ored & ~0x7F))
506 return newImpl;
507
508 // Do a slower implementation for cases that include non-ASCII characters.
509 UErrorCode status = U_ZERO_ERROR;
510 int32_t realLength = u_strToUpper(data16, length, source16, m_length, "", &status);
511 if (U_SUCCESS(status) && realLength == length)
512 return newImpl;
513 newImpl = createUninitialized(realLength, data16);
514 status = U_ZERO_ERROR;
515 u_strToUpper(data16, realLength, source16, m_length, "", &status);
516 if (U_FAILURE(status))
517 return *this;
518 return newImpl;
519}
520
521static inline bool needsTurkishCasingRules(const AtomString& localeIdentifier)
522{
523 // Either "tr" or "az" locale, with case sensitive comparison and allowing for an ignored subtag.
524 UChar first = localeIdentifier[0];
525 UChar second = localeIdentifier[1];
526 return ((isASCIIAlphaCaselessEqual(first, 't') && isASCIIAlphaCaselessEqual(second, 'r'))
527 || (isASCIIAlphaCaselessEqual(first, 'a') && isASCIIAlphaCaselessEqual(second, 'z')))
528 && (localeIdentifier.length() == 2 || localeIdentifier[2] == '-');
529}
530
531Ref<StringImpl> StringImpl::convertToLowercaseWithLocale(const AtomString& localeIdentifier)
532{
533 // Use the more-optimized code path most of the time.
534 // Assuming here that the only locale-specific lowercasing is the Turkish casing rules.
535 // FIXME: Could possibly optimize further by looking for the specific sequences
536 // that have locale-specific lowercasing. There are only three of them.
537 if (!needsTurkishCasingRules(localeIdentifier))
538 return convertToLowercaseWithoutLocale();
539
540 // FIXME: Could share more code with the main StringImpl::lower by factoring out
541 // this last part into a shared function that takes a locale string, since this is
542 // just like the end of that function.
543
544 if (m_length > MaxLength)
545 CRASH();
546 int length = m_length;
547
548 // Below, we pass in the hardcoded locale "tr". Passing that is more efficient than
549 // allocating memory just to turn localeIdentifier into a C string, and we assume
550 // there is no difference between the uppercasing for "tr" and "az" locales.
551 auto upconvertedCharacters = StringView(*this).upconvertedCharacters();
552 const UChar* source16 = upconvertedCharacters;
553 UChar* data16;
554 auto newString = createUninitialized(length, data16);
555 UErrorCode status = U_ZERO_ERROR;
556 int realLength = u_strToLower(data16, length, source16, length, "tr", &status);
557 if (U_SUCCESS(status) && realLength == length)
558 return newString;
559 newString = createUninitialized(realLength, data16);
560 status = U_ZERO_ERROR;
561 u_strToLower(data16, realLength, source16, length, "tr", &status);
562 if (U_FAILURE(status))
563 return *this;
564 return newString;
565}
566
567Ref<StringImpl> StringImpl::convertToUppercaseWithLocale(const AtomString& localeIdentifier)
568{
569 // Use the more-optimized code path most of the time.
570 // Assuming here that the only locale-specific lowercasing is the Turkish casing rules,
571 // and that the only affected character is lowercase "i".
572 if (!needsTurkishCasingRules(localeIdentifier) || find('i') == notFound)
573 return convertToUppercaseWithoutLocale();
574
575 if (m_length > MaxLength)
576 CRASH();
577 int length = m_length;
578
579 // Below, we pass in the hardcoded locale "tr". Passing that is more efficient than
580 // allocating memory just to turn localeIdentifier into a C string, and we assume
581 // there is no difference between the uppercasing for "tr" and "az" locales.
582 auto upconvertedCharacters = StringView(*this).upconvertedCharacters();
583 const UChar* source16 = upconvertedCharacters;
584 UChar* data16;
585 auto newString = createUninitialized(length, data16);
586 UErrorCode status = U_ZERO_ERROR;
587 int realLength = u_strToUpper(data16, length, source16, length, "tr", &status);
588 if (U_SUCCESS(status) && realLength == length)
589 return newString;
590 newString = createUninitialized(realLength, data16);
591 status = U_ZERO_ERROR;
592 u_strToUpper(data16, realLength, source16, length, "tr", &status);
593 if (U_FAILURE(status))
594 return *this;
595 return newString;
596}
597
598Ref<StringImpl> StringImpl::foldCase()
599{
600 if (is8Bit()) {
601 unsigned failingIndex;
602 for (unsigned i = 0; i < m_length; ++i) {
603 auto character = m_data8[i];
604 if (UNLIKELY(!isASCII(character) || isASCIIUpper(character))) {
605 failingIndex = i;
606 goto SlowPath;
607 }
608 }
609 // String was all ASCII and no uppercase, so just return as-is.
610 return *this;
611
612SlowPath:
613 bool need16BitCharacters = false;
614 for (unsigned i = failingIndex; i < m_length; ++i) {
615 auto character = m_data8[i];
616 if (character == 0xB5 || character == 0xDF) {
617 need16BitCharacters = true;
618 break;
619 }
620 }
621
622 if (!need16BitCharacters) {
623 LChar* data8;
624 auto folded = createUninitializedInternalNonEmpty(m_length, data8);
625 copyCharacters(data8, m_data8, failingIndex);
626 for (unsigned i = failingIndex; i < m_length; ++i) {
627 auto character = m_data8[i];
628 if (isASCII(character))
629 data8[i] = toASCIILower(character);
630 else {
631 ASSERT(isLatin1(u_foldCase(character, U_FOLD_CASE_DEFAULT)));
632 data8[i] = static_cast<LChar>(u_foldCase(character, U_FOLD_CASE_DEFAULT));
633 }
634 }
635 return folded;
636 }
637 } else {
638 // FIXME: Unclear why we use goto in the 8-bit case, and a different approach in the 16-bit case.
639 bool noUpper = true;
640 unsigned ored = 0;
641 for (unsigned i = 0; i < m_length; ++i) {
642 UChar character = m_data16[i];
643 if (UNLIKELY(isASCIIUpper(character)))
644 noUpper = false;
645 ored |= character;
646 }
647 if (!(ored & ~0x7F)) {
648 if (noUpper) {
649 // String was all ASCII and no uppercase, so just return as-is.
650 return *this;
651 }
652 UChar* data16;
653 auto folded = createUninitializedInternalNonEmpty(m_length, data16);
654 for (unsigned i = 0; i < m_length; ++i)
655 data16[i] = toASCIILower(m_data16[i]);
656 return folded;
657 }
658 }
659
660 if (m_length > MaxLength)
661 CRASH();
662
663 auto upconvertedCharacters = StringView(*this).upconvertedCharacters();
664
665 UChar* data;
666 auto folded = createUninitializedInternalNonEmpty(m_length, data);
667 int32_t length = m_length;
668 UErrorCode status = U_ZERO_ERROR;
669 int32_t realLength = u_strFoldCase(data, length, upconvertedCharacters, length, U_FOLD_CASE_DEFAULT, &status);
670 if (U_SUCCESS(status) && realLength == length)
671 return folded;
672 ASSERT(realLength > length);
673 folded = createUninitializedInternalNonEmpty(realLength, data);
674 status = U_ZERO_ERROR;
675 u_strFoldCase(data, realLength, upconvertedCharacters, length, U_FOLD_CASE_DEFAULT, &status);
676 if (U_FAILURE(status))
677 return *this;
678 return folded;
679}
680
681template<StringImpl::CaseConvertType type, typename CharacterType>
682ALWAYS_INLINE Ref<StringImpl> StringImpl::convertASCIICase(StringImpl& impl, const CharacterType* data, unsigned length)
683{
684 unsigned failingIndex;
685 for (unsigned i = 0; i < length; ++i) {
686 CharacterType character = data[i];
687 if (type == CaseConvertType::Lower ? UNLIKELY(isASCIIUpper(character)) : LIKELY(isASCIILower(character))) {
688 failingIndex = i;
689 goto SlowPath;
690 }
691 }
692 return impl;
693
694SlowPath:
695 CharacterType* newData;
696 auto newImpl = createUninitializedInternalNonEmpty(length, newData);
697 copyCharacters(newData, data, failingIndex);
698 for (unsigned i = failingIndex; i < length; ++i)
699 newData[i] = type == CaseConvertType::Lower ? toASCIILower(data[i]) : toASCIIUpper(data[i]);
700 return newImpl;
701}
702
703Ref<StringImpl> StringImpl::convertToASCIILowercase()
704{
705 if (is8Bit())
706 return convertASCIICase<CaseConvertType::Lower>(*this, m_data8, m_length);
707 return convertASCIICase<CaseConvertType::Lower>(*this, m_data16, m_length);
708}
709
710Ref<StringImpl> StringImpl::convertToASCIIUppercase()
711{
712 if (is8Bit())
713 return convertASCIICase<CaseConvertType::Upper>(*this, m_data8, m_length);
714 return convertASCIICase<CaseConvertType::Upper>(*this, m_data16, m_length);
715}
716
717template<typename CodeUnitPredicate> inline Ref<StringImpl> StringImpl::stripMatchedCharacters(CodeUnitPredicate predicate)
718{
719 if (!m_length)
720 return *this;
721
722 unsigned start = 0;
723 unsigned end = m_length - 1;
724
725 // skip white space from start
726 while (start <= end && predicate(is8Bit() ? m_data8[start] : m_data16[start]))
727 ++start;
728
729 // only white space
730 if (start > end)
731 return *empty();
732
733 // skip white space from end
734 while (end && predicate(is8Bit() ? m_data8[end] : m_data16[end]))
735 --end;
736
737 if (!start && end == m_length - 1)
738 return *this;
739 if (is8Bit())
740 return create(m_data8 + start, end + 1 - start);
741 return create(m_data16 + start, end + 1 - start);
742}
743
744Ref<StringImpl> StringImpl::stripWhiteSpace()
745{
746 return stripMatchedCharacters(isSpaceOrNewline);
747}
748
749Ref<StringImpl> StringImpl::stripLeadingAndTrailingCharacters(CodeUnitMatchFunction predicate)
750{
751 return stripMatchedCharacters(predicate);
752}
753
754template<typename CharacterType> ALWAYS_INLINE Ref<StringImpl> StringImpl::removeCharacters(const CharacterType* characters, CodeUnitMatchFunction findMatch)
755{
756 auto* from = characters;
757 auto* fromEnd = from + m_length;
758
759 // Assume the common case will not remove any characters
760 while (from != fromEnd && !findMatch(*from))
761 ++from;
762 if (from == fromEnd)
763 return *this;
764
765 StringBuffer<CharacterType> data(m_length);
766 auto* to = data.characters();
767 unsigned outc = from - characters;
768
769 if (outc)
770 copyCharacters(to, characters, outc);
771
772 do {
773 while (from != fromEnd && findMatch(*from))
774 ++from;
775 while (from != fromEnd && !findMatch(*from))
776 to[outc++] = *from++;
777 } while (from != fromEnd);
778
779 data.shrink(outc);
780
781 return adopt(WTFMove(data));
782}
783
784Ref<StringImpl> StringImpl::removeCharacters(CodeUnitMatchFunction findMatch)
785{
786 if (is8Bit())
787 return removeCharacters(characters8(), findMatch);
788 return removeCharacters(characters16(), findMatch);
789}
790
791template<typename CharacterType, class UCharPredicate> inline Ref<StringImpl> StringImpl::simplifyMatchedCharactersToSpace(UCharPredicate predicate)
792{
793 StringBuffer<CharacterType> data(m_length);
794
795 auto* from = characters<CharacterType>();
796 auto* fromEnd = from + m_length;
797 unsigned outc = 0;
798 bool changedToSpace = false;
799
800 auto* to = data.characters();
801
802 while (true) {
803 while (from != fromEnd && predicate(*from)) {
804 if (*from != ' ')
805 changedToSpace = true;
806 ++from;
807 }
808 while (from != fromEnd && !predicate(*from))
809 to[outc++] = *from++;
810 if (from != fromEnd)
811 to[outc++] = ' ';
812 else
813 break;
814 }
815
816 if (outc && to[outc - 1] == ' ')
817 --outc;
818
819 if (outc == m_length && !changedToSpace)
820 return *this;
821
822 data.shrink(outc);
823
824 return adopt(WTFMove(data));
825}
826
827Ref<StringImpl> StringImpl::simplifyWhiteSpace()
828{
829 if (is8Bit())
830 return StringImpl::simplifyMatchedCharactersToSpace<LChar>(isSpaceOrNewline);
831 return StringImpl::simplifyMatchedCharactersToSpace<UChar>(isSpaceOrNewline);
832}
833
834Ref<StringImpl> StringImpl::simplifyWhiteSpace(CodeUnitMatchFunction isWhiteSpace)
835{
836 if (is8Bit())
837 return StringImpl::simplifyMatchedCharactersToSpace<LChar>(isWhiteSpace);
838 return StringImpl::simplifyMatchedCharactersToSpace<UChar>(isWhiteSpace);
839}
840
841int StringImpl::toIntStrict(bool* ok, int base)
842{
843 if (is8Bit())
844 return charactersToIntStrict(characters8(), m_length, ok, base);
845 return charactersToIntStrict(characters16(), m_length, ok, base);
846}
847
848unsigned StringImpl::toUIntStrict(bool* ok, int base)
849{
850 if (is8Bit())
851 return charactersToUIntStrict(characters8(), m_length, ok, base);
852 return charactersToUIntStrict(characters16(), m_length, ok, base);
853}
854
855int64_t StringImpl::toInt64Strict(bool* ok, int base)
856{
857 if (is8Bit())
858 return charactersToInt64Strict(characters8(), m_length, ok, base);
859 return charactersToInt64Strict(characters16(), m_length, ok, base);
860}
861
862uint64_t StringImpl::toUInt64Strict(bool* ok, int base)
863{
864 if (is8Bit())
865 return charactersToUInt64Strict(characters8(), m_length, ok, base);
866 return charactersToUInt64Strict(characters16(), m_length, ok, base);
867}
868
869intptr_t StringImpl::toIntPtrStrict(bool* ok, int base)
870{
871 if (is8Bit())
872 return charactersToIntPtrStrict(characters8(), m_length, ok, base);
873 return charactersToIntPtrStrict(characters16(), m_length, ok, base);
874}
875
876int StringImpl::toInt(bool* ok)
877{
878 if (is8Bit())
879 return charactersToInt(characters8(), m_length, ok);
880 return charactersToInt(characters16(), m_length, ok);
881}
882
883unsigned StringImpl::toUInt(bool* ok)
884{
885 if (is8Bit())
886 return charactersToUInt(characters8(), m_length, ok);
887 return charactersToUInt(characters16(), m_length, ok);
888}
889
890int64_t StringImpl::toInt64(bool* ok)
891{
892 if (is8Bit())
893 return charactersToInt64(characters8(), m_length, ok);
894 return charactersToInt64(characters16(), m_length, ok);
895}
896
897uint64_t StringImpl::toUInt64(bool* ok)
898{
899 if (is8Bit())
900 return charactersToUInt64(characters8(), m_length, ok);
901 return charactersToUInt64(characters16(), m_length, ok);
902}
903
904intptr_t StringImpl::toIntPtr(bool* ok)
905{
906 if (is8Bit())
907 return charactersToIntPtr(characters8(), m_length, ok);
908 return charactersToIntPtr(characters16(), m_length, ok);
909}
910
911double StringImpl::toDouble(bool* ok)
912{
913 if (is8Bit())
914 return charactersToDouble(characters8(), m_length, ok);
915 return charactersToDouble(characters16(), m_length, ok);
916}
917
918float StringImpl::toFloat(bool* ok)
919{
920 if (is8Bit())
921 return charactersToFloat(characters8(), m_length, ok);
922 return charactersToFloat(characters16(), m_length, ok);
923}
924
925size_t StringImpl::find(CodeUnitMatchFunction matchFunction, unsigned start)
926{
927 if (is8Bit())
928 return WTF::find(characters8(), m_length, matchFunction, start);
929 return WTF::find(characters16(), m_length, matchFunction, start);
930}
931
932size_t StringImpl::find(const LChar* matchString, unsigned index)
933{
934 // Check for null or empty string to match against
935 if (!matchString)
936 return notFound;
937 size_t matchStringLength = strlen(reinterpret_cast<const char*>(matchString));
938 if (matchStringLength > MaxLength)
939 CRASH();
940 unsigned matchLength = matchStringLength;
941 if (!matchLength)
942 return std::min(index, length());
943
944 // Optimization 1: fast case for strings of length 1.
945 if (matchLength == 1) {
946 if (is8Bit())
947 return WTF::find(characters8(), length(), matchString[0], index);
948 return WTF::find(characters16(), length(), *matchString, index);
949 }
950
951 // Check index & matchLength are in range.
952 if (index > length())
953 return notFound;
954 unsigned searchLength = length() - index;
955 if (matchLength > searchLength)
956 return notFound;
957 // delta is the number of additional times to test; delta == 0 means test only once.
958 unsigned delta = searchLength - matchLength;
959
960 // Optimization 2: keep a running hash of the strings,
961 // only call equal if the hashes match.
962
963 if (is8Bit()) {
964 const LChar* searchCharacters = characters8() + index;
965
966 unsigned searchHash = 0;
967 unsigned matchHash = 0;
968 for (unsigned i = 0; i < matchLength; ++i) {
969 searchHash += searchCharacters[i];
970 matchHash += matchString[i];
971 }
972
973 unsigned i = 0;
974 while (searchHash != matchHash || !equal(searchCharacters + i, matchString, matchLength)) {
975 if (i == delta)
976 return notFound;
977 searchHash += searchCharacters[i + matchLength];
978 searchHash -= searchCharacters[i];
979 ++i;
980 }
981 return index + i;
982 }
983
984 const UChar* searchCharacters = characters16() + index;
985
986 unsigned searchHash = 0;
987 unsigned matchHash = 0;
988 for (unsigned i = 0; i < matchLength; ++i) {
989 searchHash += searchCharacters[i];
990 matchHash += matchString[i];
991 }
992
993 unsigned i = 0;
994 while (searchHash != matchHash || !equal(searchCharacters + i, matchString, matchLength)) {
995 if (i == delta)
996 return notFound;
997 searchHash += searchCharacters[i + matchLength];
998 searchHash -= searchCharacters[i];
999 ++i;
1000 }
1001 return index + i;
1002}
1003
1004size_t StringImpl::find(StringImpl* matchString)
1005{
1006 // Check for null string to match against
1007 if (UNLIKELY(!matchString))
1008 return notFound;
1009 unsigned matchLength = matchString->length();
1010
1011 // Optimization 1: fast case for strings of length 1.
1012 if (matchLength == 1) {
1013 if (is8Bit()) {
1014 if (matchString->is8Bit())
1015 return WTF::find(characters8(), length(), matchString->characters8()[0]);
1016 return WTF::find(characters8(), length(), matchString->characters16()[0]);
1017 }
1018 if (matchString->is8Bit())
1019 return WTF::find(characters16(), length(), matchString->characters8()[0]);
1020 return WTF::find(characters16(), length(), matchString->characters16()[0]);
1021 }
1022
1023 // Check matchLength is in range.
1024 if (matchLength > length())
1025 return notFound;
1026
1027 // Check for empty string to match against
1028 if (UNLIKELY(!matchLength))
1029 return 0;
1030
1031 if (is8Bit()) {
1032 if (matchString->is8Bit())
1033 return findInner(characters8(), matchString->characters8(), 0, length(), matchLength);
1034 return findInner(characters8(), matchString->characters16(), 0, length(), matchLength);
1035 }
1036
1037 if (matchString->is8Bit())
1038 return findInner(characters16(), matchString->characters8(), 0, length(), matchLength);
1039
1040 return findInner(characters16(), matchString->characters16(), 0, length(), matchLength);
1041}
1042
1043size_t StringImpl::find(StringImpl* matchString, unsigned index)
1044{
1045 // Check for null or empty string to match against
1046 if (UNLIKELY(!matchString))
1047 return notFound;
1048
1049 return findCommon(*this, *matchString, index);
1050}
1051
1052size_t StringImpl::findIgnoringASCIICase(const StringImpl& matchString) const
1053{
1054 return ::WTF::findIgnoringASCIICase(*this, matchString, 0);
1055}
1056
1057size_t StringImpl::findIgnoringASCIICase(const StringImpl& matchString, unsigned startOffset) const
1058{
1059 return ::WTF::findIgnoringASCIICase(*this, matchString, startOffset);
1060}
1061
1062size_t StringImpl::findIgnoringASCIICase(const StringImpl* matchString) const
1063{
1064 if (!matchString)
1065 return notFound;
1066 return ::WTF::findIgnoringASCIICase(*this, *matchString, 0);
1067}
1068
1069size_t StringImpl::findIgnoringASCIICase(const StringImpl* matchString, unsigned startOffset) const
1070{
1071 if (!matchString)
1072 return notFound;
1073 return ::WTF::findIgnoringASCIICase(*this, *matchString, startOffset);
1074}
1075
1076size_t StringImpl::reverseFind(UChar character, unsigned index)
1077{
1078 if (is8Bit())
1079 return WTF::reverseFind(characters8(), m_length, character, index);
1080 return WTF::reverseFind(characters16(), m_length, character, index);
1081}
1082
1083template <typename SearchCharacterType, typename MatchCharacterType>
1084ALWAYS_INLINE static size_t reverseFindInner(const SearchCharacterType* searchCharacters, const MatchCharacterType* matchCharacters, unsigned index, unsigned length, unsigned matchLength)
1085{
1086 // Optimization: keep a running hash of the strings,
1087 // only call equal if the hashes match.
1088
1089 // delta is the number of additional times to test; delta == 0 means test only once.
1090 unsigned delta = std::min(index, length - matchLength);
1091
1092 unsigned searchHash = 0;
1093 unsigned matchHash = 0;
1094 for (unsigned i = 0; i < matchLength; ++i) {
1095 searchHash += searchCharacters[delta + i];
1096 matchHash += matchCharacters[i];
1097 }
1098
1099 // keep looping until we match
1100 while (searchHash != matchHash || !equal(searchCharacters + delta, matchCharacters, matchLength)) {
1101 if (!delta)
1102 return notFound;
1103 --delta;
1104 searchHash -= searchCharacters[delta + matchLength];
1105 searchHash += searchCharacters[delta];
1106 }
1107 return delta;
1108}
1109
1110size_t StringImpl::reverseFind(StringImpl* matchString, unsigned index)
1111{
1112 // Check for null or empty string to match against
1113 if (!matchString)
1114 return notFound;
1115 unsigned matchLength = matchString->length();
1116 unsigned ourLength = length();
1117 if (!matchLength)
1118 return std::min(index, ourLength);
1119
1120 // Optimization 1: fast case for strings of length 1.
1121 if (matchLength == 1) {
1122 if (is8Bit())
1123 return WTF::reverseFind(characters8(), ourLength, (*matchString)[0], index);
1124 return WTF::reverseFind(characters16(), ourLength, (*matchString)[0], index);
1125 }
1126
1127 // Check index & matchLength are in range.
1128 if (matchLength > ourLength)
1129 return notFound;
1130
1131 if (is8Bit()) {
1132 if (matchString->is8Bit())
1133 return reverseFindInner(characters8(), matchString->characters8(), index, ourLength, matchLength);
1134 return reverseFindInner(characters8(), matchString->characters16(), index, ourLength, matchLength);
1135 }
1136
1137 if (matchString->is8Bit())
1138 return reverseFindInner(characters16(), matchString->characters8(), index, ourLength, matchLength);
1139
1140 return reverseFindInner(characters16(), matchString->characters16(), index, ourLength, matchLength);
1141}
1142
1143ALWAYS_INLINE static bool equalInner(const StringImpl& string, unsigned startOffset, const char* matchString, unsigned matchLength)
1144{
1145 ASSERT(matchLength <= string.length());
1146 ASSERT(startOffset + matchLength <= string.length());
1147
1148 if (string.is8Bit())
1149 return equal(string.characters8() + startOffset, reinterpret_cast<const LChar*>(matchString), matchLength);
1150 return equal(string.characters16() + startOffset, reinterpret_cast<const LChar*>(matchString), matchLength);
1151}
1152
1153ALWAYS_INLINE static bool equalInner(const StringImpl& string, unsigned startOffset, const StringImpl& matchString)
1154{
1155 if (startOffset > string.length())
1156 return false;
1157 if (matchString.length() > string.length())
1158 return false;
1159 if (matchString.length() + startOffset > string.length())
1160 return false;
1161
1162 if (string.is8Bit()) {
1163 if (matchString.is8Bit())
1164 return equal(string.characters8() + startOffset, matchString.characters8(), matchString.length());
1165 return equal(string.characters8() + startOffset, matchString.characters16(), matchString.length());
1166 }
1167 if (matchString.is8Bit())
1168 return equal(string.characters16() + startOffset, matchString.characters8(), matchString.length());
1169 return equal(string.characters16() + startOffset, matchString.characters16(), matchString.length());
1170}
1171
1172bool StringImpl::startsWith(const StringImpl* string) const
1173{
1174 return string && ::WTF::startsWith(*this, *string);
1175}
1176
1177bool StringImpl::startsWith(const StringImpl& string) const
1178{
1179 return ::WTF::startsWith(*this, string);
1180}
1181
1182bool StringImpl::startsWithIgnoringASCIICase(const StringImpl* prefix) const
1183{
1184 return prefix && ::WTF::startsWithIgnoringASCIICase(*this, *prefix);
1185}
1186
1187bool StringImpl::startsWithIgnoringASCIICase(const StringImpl& prefix) const
1188{
1189 return ::WTF::startsWithIgnoringASCIICase(*this, prefix);
1190}
1191
1192bool StringImpl::startsWith(UChar character) const
1193{
1194 return m_length && (*this)[0] == character;
1195}
1196
1197bool StringImpl::startsWith(const char* matchString, unsigned matchLength) const
1198{
1199 return matchLength <= length() && equalInner(*this, 0, matchString, matchLength);
1200}
1201
1202bool StringImpl::hasInfixStartingAt(const StringImpl& matchString, unsigned startOffset) const
1203{
1204 return equalInner(*this, startOffset, matchString);
1205}
1206
1207bool StringImpl::endsWith(StringImpl* suffix)
1208{
1209 return suffix && ::WTF::endsWith(*this, *suffix);
1210}
1211
1212bool StringImpl::endsWith(StringImpl& suffix)
1213{
1214 return ::WTF::endsWith(*this, suffix);
1215}
1216
1217bool StringImpl::endsWithIgnoringASCIICase(const StringImpl* suffix) const
1218{
1219 return suffix && ::WTF::endsWithIgnoringASCIICase(*this, *suffix);
1220}
1221
1222bool StringImpl::endsWithIgnoringASCIICase(const StringImpl& suffix) const
1223{
1224 return ::WTF::endsWithIgnoringASCIICase(*this, suffix);
1225}
1226
1227bool StringImpl::endsWith(UChar character) const
1228{
1229 return m_length && (*this)[m_length - 1] == character;
1230}
1231
1232bool StringImpl::endsWith(const char* matchString, unsigned matchLength) const
1233{
1234 return matchLength <= length() && equalInner(*this, length() - matchLength, matchString, matchLength);
1235}
1236
1237bool StringImpl::hasInfixEndingAt(const StringImpl& matchString, unsigned endOffset) const
1238{
1239 return endOffset >= matchString.length() && equalInner(*this, endOffset - matchString.length(), matchString);
1240}
1241
1242Ref<StringImpl> StringImpl::replace(UChar target, UChar replacement)
1243{
1244 if (target == replacement)
1245 return *this;
1246 unsigned i;
1247 for (i = 0; i != m_length; ++i) {
1248 UChar character = is8Bit() ? m_data8[i] : m_data16[i];
1249 if (character == target)
1250 break;
1251 }
1252 if (i == m_length)
1253 return *this;
1254
1255 if (is8Bit()) {
1256 if (!isLatin1(target)) {
1257 // Looking for a 16-bit character in an 8-bit string, so we're done.
1258 return *this;
1259 }
1260
1261 if (isLatin1(replacement)) {
1262 LChar* data;
1263 LChar oldChar = static_cast<LChar>(target);
1264 LChar newChar = static_cast<LChar>(replacement);
1265
1266 auto newImpl = createUninitializedInternalNonEmpty(m_length, data);
1267
1268 for (i = 0; i != m_length; ++i) {
1269 LChar character = m_data8[i];
1270 if (character == oldChar)
1271 character = newChar;
1272 data[i] = character;
1273 }
1274 return newImpl;
1275 }
1276
1277 UChar* data;
1278 auto newImpl = createUninitializedInternalNonEmpty(m_length, data);
1279
1280 for (i = 0; i != m_length; ++i) {
1281 UChar character = m_data8[i];
1282 if (character == target)
1283 character = replacement;
1284 data[i] = character;
1285 }
1286
1287 return newImpl;
1288 }
1289
1290 UChar* data;
1291 auto newImpl = createUninitializedInternalNonEmpty(m_length, data);
1292
1293 for (i = 0; i != m_length; ++i) {
1294 UChar character = m_data16[i];
1295 if (character == target)
1296 character = replacement;
1297 data[i] = character;
1298 }
1299 return newImpl;
1300}
1301
1302Ref<StringImpl> StringImpl::replace(unsigned position, unsigned lengthToReplace, StringImpl* string)
1303{
1304 position = std::min(position, length());
1305 lengthToReplace = std::min(lengthToReplace, length() - position);
1306 unsigned lengthToInsert = string ? string->length() : 0;
1307 if (!lengthToReplace && !lengthToInsert)
1308 return *this;
1309
1310 if ((length() - lengthToReplace) >= (MaxLength - lengthToInsert))
1311 CRASH();
1312
1313 if (is8Bit() && (!string || string->is8Bit())) {
1314 LChar* data;
1315 auto newImpl = createUninitialized(length() - lengthToReplace + lengthToInsert, data);
1316 copyCharacters(data, m_data8, position);
1317 if (string)
1318 copyCharacters(data + position, string->m_data8, lengthToInsert);
1319 copyCharacters(data + position + lengthToInsert, m_data8 + position + lengthToReplace, length() - position - lengthToReplace);
1320 return newImpl;
1321 }
1322 UChar* data;
1323 auto newImpl = createUninitialized(length() - lengthToReplace + lengthToInsert, data);
1324 if (is8Bit())
1325 copyCharacters(data, m_data8, position);
1326 else
1327 copyCharacters(data, m_data16, position);
1328 if (string) {
1329 if (string->is8Bit())
1330 copyCharacters(data + position, string->m_data8, lengthToInsert);
1331 else
1332 copyCharacters(data + position, string->m_data16, lengthToInsert);
1333 }
1334 if (is8Bit())
1335 copyCharacters(data + position + lengthToInsert, m_data8 + position + lengthToReplace, length() - position - lengthToReplace);
1336 else
1337 copyCharacters(data + position + lengthToInsert, m_data16 + position + lengthToReplace, length() - position - lengthToReplace);
1338 return newImpl;
1339}
1340
1341Ref<StringImpl> StringImpl::replace(UChar pattern, StringImpl* replacement)
1342{
1343 if (!replacement)
1344 return *this;
1345 if (replacement->is8Bit())
1346 return replace(pattern, replacement->m_data8, replacement->length());
1347 return replace(pattern, replacement->m_data16, replacement->length());
1348}
1349
1350Ref<StringImpl> StringImpl::replace(UChar pattern, const LChar* replacement, unsigned repStrLength)
1351{
1352 ASSERT(replacement);
1353
1354 size_t srcSegmentStart = 0;
1355 unsigned matchCount = 0;
1356
1357 // Count the matches.
1358 while ((srcSegmentStart = find(pattern, srcSegmentStart)) != notFound) {
1359 ++matchCount;
1360 ++srcSegmentStart;
1361 }
1362
1363 // If we have 0 matches then we don't have to do any more work.
1364 if (!matchCount)
1365 return *this;
1366
1367 if (repStrLength && matchCount > MaxLength / repStrLength)
1368 CRASH();
1369
1370 unsigned replaceSize = matchCount * repStrLength;
1371 unsigned newSize = m_length - matchCount;
1372 if (newSize >= (MaxLength - replaceSize))
1373 CRASH();
1374
1375 newSize += replaceSize;
1376
1377 // Construct the new data.
1378 size_t srcSegmentEnd;
1379 unsigned srcSegmentLength;
1380 srcSegmentStart = 0;
1381 unsigned dstOffset = 0;
1382
1383 if (is8Bit()) {
1384 LChar* data;
1385 auto newImpl = createUninitialized(newSize, data);
1386
1387 while ((srcSegmentEnd = find(pattern, srcSegmentStart)) != notFound) {
1388 srcSegmentLength = srcSegmentEnd - srcSegmentStart;
1389 copyCharacters(data + dstOffset, m_data8 + srcSegmentStart, srcSegmentLength);
1390 dstOffset += srcSegmentLength;
1391 copyCharacters(data + dstOffset, replacement, repStrLength);
1392 dstOffset += repStrLength;
1393 srcSegmentStart = srcSegmentEnd + 1;
1394 }
1395
1396 srcSegmentLength = m_length - srcSegmentStart;
1397 copyCharacters(data + dstOffset, m_data8 + srcSegmentStart, srcSegmentLength);
1398
1399 ASSERT(dstOffset + srcSegmentLength == newImpl.get().length());
1400
1401 return newImpl;
1402 }
1403
1404 UChar* data;
1405 auto newImpl = createUninitialized(newSize, data);
1406
1407 while ((srcSegmentEnd = find(pattern, srcSegmentStart)) != notFound) {
1408 srcSegmentLength = srcSegmentEnd - srcSegmentStart;
1409 copyCharacters(data + dstOffset, m_data16 + srcSegmentStart, srcSegmentLength);
1410
1411 dstOffset += srcSegmentLength;
1412 copyCharacters(data + dstOffset, replacement, repStrLength);
1413
1414 dstOffset += repStrLength;
1415 srcSegmentStart = srcSegmentEnd + 1;
1416 }
1417
1418 srcSegmentLength = m_length - srcSegmentStart;
1419 copyCharacters(data + dstOffset, m_data16 + srcSegmentStart, srcSegmentLength);
1420
1421 ASSERT(dstOffset + srcSegmentLength == newImpl.get().length());
1422
1423 return newImpl;
1424}
1425
1426Ref<StringImpl> StringImpl::replace(UChar pattern, const UChar* replacement, unsigned repStrLength)
1427{
1428 ASSERT(replacement);
1429
1430 size_t srcSegmentStart = 0;
1431 unsigned matchCount = 0;
1432
1433 // Count the matches.
1434 while ((srcSegmentStart = find(pattern, srcSegmentStart)) != notFound) {
1435 ++matchCount;
1436 ++srcSegmentStart;
1437 }
1438
1439 // If we have 0 matches then we don't have to do any more work.
1440 if (!matchCount)
1441 return *this;
1442
1443 if (repStrLength && matchCount > MaxLength / repStrLength)
1444 CRASH();
1445
1446 unsigned replaceSize = matchCount * repStrLength;
1447 unsigned newSize = m_length - matchCount;
1448 if (newSize >= (MaxLength - replaceSize))
1449 CRASH();
1450
1451 newSize += replaceSize;
1452
1453 // Construct the new data.
1454 size_t srcSegmentEnd;
1455 unsigned srcSegmentLength;
1456 srcSegmentStart = 0;
1457 unsigned dstOffset = 0;
1458
1459 if (is8Bit()) {
1460 UChar* data;
1461 auto newImpl = createUninitialized(newSize, data);
1462
1463 while ((srcSegmentEnd = find(pattern, srcSegmentStart)) != notFound) {
1464 srcSegmentLength = srcSegmentEnd - srcSegmentStart;
1465 copyCharacters(data + dstOffset, m_data8 + srcSegmentStart, srcSegmentLength);
1466
1467 dstOffset += srcSegmentLength;
1468 copyCharacters(data + dstOffset, replacement, repStrLength);
1469
1470 dstOffset += repStrLength;
1471 srcSegmentStart = srcSegmentEnd + 1;
1472 }
1473
1474 srcSegmentLength = m_length - srcSegmentStart;
1475 copyCharacters(data + dstOffset, m_data8 + srcSegmentStart, srcSegmentLength);
1476
1477 ASSERT(dstOffset + srcSegmentLength == newImpl.get().length());
1478
1479 return newImpl;
1480 }
1481
1482 UChar* data;
1483 auto newImpl = createUninitialized(newSize, data);
1484
1485 while ((srcSegmentEnd = find(pattern, srcSegmentStart)) != notFound) {
1486 srcSegmentLength = srcSegmentEnd - srcSegmentStart;
1487 copyCharacters(data + dstOffset, m_data16 + srcSegmentStart, srcSegmentLength);
1488
1489 dstOffset += srcSegmentLength;
1490 copyCharacters(data + dstOffset, replacement, repStrLength);
1491
1492 dstOffset += repStrLength;
1493 srcSegmentStart = srcSegmentEnd + 1;
1494 }
1495
1496 srcSegmentLength = m_length - srcSegmentStart;
1497 copyCharacters(data + dstOffset, m_data16 + srcSegmentStart, srcSegmentLength);
1498
1499 ASSERT(dstOffset + srcSegmentLength == newImpl.get().length());
1500
1501 return newImpl;
1502}
1503
1504Ref<StringImpl> StringImpl::replace(StringImpl* pattern, StringImpl* replacement)
1505{
1506 if (!pattern || !replacement)
1507 return *this;
1508
1509 unsigned patternLength = pattern->length();
1510 if (!patternLength)
1511 return *this;
1512
1513 unsigned repStrLength = replacement->length();
1514 size_t srcSegmentStart = 0;
1515 unsigned matchCount = 0;
1516
1517 // Count the matches.
1518 while ((srcSegmentStart = find(pattern, srcSegmentStart)) != notFound) {
1519 ++matchCount;
1520 srcSegmentStart += patternLength;
1521 }
1522
1523 // If we have 0 matches, we don't have to do any more work
1524 if (!matchCount)
1525 return *this;
1526
1527 unsigned newSize = m_length - matchCount * patternLength;
1528 if (repStrLength && matchCount > MaxLength / repStrLength)
1529 CRASH();
1530
1531 if (newSize > (MaxLength - matchCount * repStrLength))
1532 CRASH();
1533
1534 newSize += matchCount * repStrLength;
1535
1536
1537 // Construct the new data
1538 size_t srcSegmentEnd;
1539 unsigned srcSegmentLength;
1540 srcSegmentStart = 0;
1541 unsigned dstOffset = 0;
1542 bool srcIs8Bit = is8Bit();
1543 bool replacementIs8Bit = replacement->is8Bit();
1544
1545 // There are 4 cases:
1546 // 1. This and replacement are both 8 bit.
1547 // 2. This and replacement are both 16 bit.
1548 // 3. This is 8 bit and replacement is 16 bit.
1549 // 4. This is 16 bit and replacement is 8 bit.
1550 if (srcIs8Bit && replacementIs8Bit) {
1551 // Case 1
1552 LChar* data;
1553 auto newImpl = createUninitialized(newSize, data);
1554 while ((srcSegmentEnd = find(pattern, srcSegmentStart)) != notFound) {
1555 srcSegmentLength = srcSegmentEnd - srcSegmentStart;
1556 copyCharacters(data + dstOffset, m_data8 + srcSegmentStart, srcSegmentLength);
1557 dstOffset += srcSegmentLength;
1558 copyCharacters(data + dstOffset, replacement->m_data8, repStrLength);
1559 dstOffset += repStrLength;
1560 srcSegmentStart = srcSegmentEnd + patternLength;
1561 }
1562
1563 srcSegmentLength = m_length - srcSegmentStart;
1564 copyCharacters(data + dstOffset, m_data8 + srcSegmentStart, srcSegmentLength);
1565
1566 ASSERT(dstOffset + srcSegmentLength == newImpl.get().length());
1567
1568 return newImpl;
1569 }
1570
1571 UChar* data;
1572 auto newImpl = createUninitialized(newSize, data);
1573 while ((srcSegmentEnd = find(pattern, srcSegmentStart)) != notFound) {
1574 srcSegmentLength = srcSegmentEnd - srcSegmentStart;
1575 if (srcIs8Bit) {
1576 // Case 3.
1577 copyCharacters(data + dstOffset, m_data8 + srcSegmentStart, srcSegmentLength);
1578 } else {
1579 // Case 2 & 4.
1580 copyCharacters(data + dstOffset, m_data16 + srcSegmentStart, srcSegmentLength);
1581 }
1582 dstOffset += srcSegmentLength;
1583 if (replacementIs8Bit) {
1584 // Cases 2 & 3.
1585 copyCharacters(data + dstOffset, replacement->m_data8, repStrLength);
1586 } else {
1587 // Case 4
1588 copyCharacters(data + dstOffset, replacement->m_data16, repStrLength);
1589 }
1590 dstOffset += repStrLength;
1591 srcSegmentStart = srcSegmentEnd + patternLength;
1592 }
1593
1594 srcSegmentLength = m_length - srcSegmentStart;
1595 if (srcIs8Bit) {
1596 // Case 3.
1597 copyCharacters(data + dstOffset, m_data8 + srcSegmentStart, srcSegmentLength);
1598 } else {
1599 // Cases 2 & 4.
1600 copyCharacters(data + dstOffset, m_data16 + srcSegmentStart, srcSegmentLength);
1601 }
1602
1603 ASSERT(dstOffset + srcSegmentLength == newImpl.get().length());
1604
1605 return newImpl;
1606}
1607
1608bool equal(const StringImpl* a, const StringImpl* b)
1609{
1610 return equalCommon(a, b);
1611}
1612
1613template<typename CharacterType> inline bool equalInternal(const StringImpl* a, const CharacterType* b, unsigned length)
1614{
1615 if (!a)
1616 return !b;
1617 if (!b)
1618 return false;
1619
1620 if (a->length() != length)
1621 return false;
1622 if (a->is8Bit())
1623 return equal(a->characters8(), b, length);
1624 return equal(a->characters16(), b, length);
1625}
1626
1627bool equal(const StringImpl* a, const LChar* b, unsigned length)
1628{
1629 return equalInternal(a, b, length);
1630}
1631
1632bool equal(const StringImpl* a, const UChar* b, unsigned length)
1633{
1634 return equalInternal(a, b, length);
1635}
1636
1637bool equal(const StringImpl* a, const LChar* b)
1638{
1639 if (!a)
1640 return !b;
1641 if (!b)
1642 return !a;
1643
1644 unsigned length = a->length();
1645
1646 if (a->is8Bit()) {
1647 const LChar* aPtr = a->characters8();
1648 for (unsigned i = 0; i != length; ++i) {
1649 LChar bc = b[i];
1650 LChar ac = aPtr[i];
1651 if (!bc)
1652 return false;
1653 if (ac != bc)
1654 return false;
1655 }
1656
1657 return !b[length];
1658 }
1659
1660 const UChar* aPtr = a->characters16();
1661 for (unsigned i = 0; i != length; ++i) {
1662 LChar bc = b[i];
1663 if (!bc)
1664 return false;
1665 if (aPtr[i] != bc)
1666 return false;
1667 }
1668
1669 return !b[length];
1670}
1671
1672bool equal(const StringImpl& a, const StringImpl& b)
1673{
1674 return equalCommon(a, b);
1675}
1676
1677bool equalIgnoringNullity(StringImpl* a, StringImpl* b)
1678{
1679 if (!a && b && !b->length())
1680 return true;
1681 if (!b && a && !a->length())
1682 return true;
1683 return equal(a, b);
1684}
1685
1686bool equalIgnoringASCIICase(const StringImpl* a, const StringImpl* b)
1687{
1688 return a == b || (a && b && equalIgnoringASCIICase(*a, *b));
1689}
1690
1691bool equalIgnoringASCIICaseNonNull(const StringImpl* a, const StringImpl* b)
1692{
1693 ASSERT(a);
1694 ASSERT(b);
1695 return equalIgnoringASCIICase(*a, *b);
1696}
1697
1698UCharDirection StringImpl::defaultWritingDirection(bool* hasStrongDirectionality)
1699{
1700 for (unsigned i = 0; i < m_length; ++i) {
1701 auto charDirection = u_charDirection(is8Bit() ? m_data8[i] : m_data16[i]);
1702 if (charDirection == U_LEFT_TO_RIGHT) {
1703 if (hasStrongDirectionality)
1704 *hasStrongDirectionality = true;
1705 return U_LEFT_TO_RIGHT;
1706 }
1707 if (charDirection == U_RIGHT_TO_LEFT || charDirection == U_RIGHT_TO_LEFT_ARABIC) {
1708 if (hasStrongDirectionality)
1709 *hasStrongDirectionality = true;
1710 return U_RIGHT_TO_LEFT;
1711 }
1712 }
1713 if (hasStrongDirectionality)
1714 *hasStrongDirectionality = false;
1715 return U_LEFT_TO_RIGHT;
1716}
1717
1718Ref<StringImpl> StringImpl::adopt(StringBuffer<LChar>&& buffer)
1719{
1720 unsigned length = buffer.length();
1721 if (!length)
1722 return *empty();
1723 return adoptRef(*new StringImpl(buffer.release(), length));
1724}
1725
1726Ref<StringImpl> StringImpl::adopt(StringBuffer<UChar>&& buffer)
1727{
1728 unsigned length = buffer.length();
1729 if (!length)
1730 return *empty();
1731 return adoptRef(*new StringImpl(buffer.release(), length));
1732}
1733
1734size_t StringImpl::sizeInBytes() const
1735{
1736 // FIXME: support substrings
1737 size_t size = length();
1738 if (!is8Bit())
1739 size *= 2;
1740 return size + sizeof(*this);
1741}
1742
1743// Helper to write a three-byte UTF-8 code point into the buffer; caller must ensure room is available.
1744static inline void putUTF8Triple(char*& buffer, UChar character)
1745{
1746 ASSERT(character >= 0x0800);
1747 *buffer++ = static_cast<char>(((character >> 12) & 0x0F) | 0xE0);
1748 *buffer++ = static_cast<char>(((character >> 6) & 0x3F) | 0x80);
1749 *buffer++ = static_cast<char>((character & 0x3F) | 0x80);
1750}
1751
1752UTF8ConversionError StringImpl::utf8Impl(const UChar* characters, unsigned length, char*& buffer, size_t bufferSize, ConversionMode mode)
1753{
1754 if (mode == StrictConversionReplacingUnpairedSurrogatesWithFFFD) {
1755 const UChar* charactersEnd = characters + length;
1756 char* bufferEnd = buffer + bufferSize;
1757 while (characters < charactersEnd) {
1758 // Use strict conversion to detect unpaired surrogates.
1759 auto result = convertUTF16ToUTF8(&characters, charactersEnd, &buffer, bufferEnd);
1760 ASSERT(result != TargetExhausted);
1761 // Conversion fails when there is an unpaired surrogate.
1762 // Put replacement character (U+FFFD) instead of the unpaired surrogate.
1763 if (result != ConversionOK) {
1764 ASSERT((0xD800 <= *characters && *characters <= 0xDFFF));
1765 // There should be room left, since one UChar hasn't been converted.
1766 ASSERT((buffer + 3) <= bufferEnd);
1767 putUTF8Triple(buffer, replacementCharacter);
1768 ++characters;
1769 }
1770 }
1771 } else {
1772 bool strict = mode == StrictConversion;
1773 const UChar* originalCharacters = characters;
1774 auto result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferSize, strict);
1775 ASSERT(result != TargetExhausted); // (length * 3) should be sufficient for any conversion
1776
1777 // Only produced from strict conversion.
1778 if (result == SourceIllegal) {
1779 ASSERT(strict);
1780 return UTF8ConversionError::IllegalSource;
1781 }
1782
1783 // Check for an unconverted high surrogate.
1784 if (result == SourceExhausted) {
1785 if (strict)
1786 return UTF8ConversionError::SourceExhausted;
1787 // This should be one unpaired high surrogate. Treat it the same
1788 // was as an unpaired high surrogate would have been handled in
1789 // the middle of a string with non-strict conversion - which is
1790 // to say, simply encode it to UTF-8.
1791 ASSERT_UNUSED(
1792 originalCharacters, (characters + 1) == (originalCharacters + length));
1793 ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF));
1794 // There should be room left, since one UChar hasn't been converted.
1795 ASSERT((buffer + 3) <= (buffer + bufferSize));
1796 putUTF8Triple(buffer, *characters);
1797 }
1798 }
1799
1800 return UTF8ConversionError::None;
1801}
1802
1803Expected<CString, UTF8ConversionError> StringImpl::utf8ForCharacters(const LChar* characters, unsigned length)
1804{
1805 if (!length)
1806 return CString("", 0);
1807 if (length > MaxLength / 3)
1808 return makeUnexpected(UTF8ConversionError::OutOfMemory);
1809 Vector<char, 1024> bufferVector(length * 3);
1810 char* buffer = bufferVector.data();
1811 const LChar* source = characters;
1812 bool success = convertLatin1ToUTF8(&source, source + length, &buffer, buffer + bufferVector.size());
1813 ASSERT_UNUSED(success, success); // (length * 3) should be sufficient for any conversion
1814 return CString(bufferVector.data(), buffer - bufferVector.data());
1815}
1816
1817Expected<CString, UTF8ConversionError> StringImpl::utf8ForCharacters(const UChar* characters, unsigned length, ConversionMode mode)
1818{
1819 if (!length)
1820 return CString("", 0);
1821 if (length > MaxLength / 3)
1822 return makeUnexpected(UTF8ConversionError::OutOfMemory);
1823 Vector<char, 1024> bufferVector(length * 3);
1824 char* buffer = bufferVector.data();
1825 UTF8ConversionError error = utf8Impl(characters, length, buffer, bufferVector.size(), mode);
1826 if (error != UTF8ConversionError::None)
1827 return makeUnexpected(error);
1828 return CString(bufferVector.data(), buffer - bufferVector.data());
1829}
1830
1831Expected<CString, UTF8ConversionError> StringImpl::tryGetUtf8ForRange(unsigned offset, unsigned length, ConversionMode mode) const
1832{
1833 ASSERT(offset <= this->length());
1834 ASSERT(offset + length <= this->length());
1835
1836 if (!length)
1837 return CString("", 0);
1838
1839 // Allocate a buffer big enough to hold all the characters
1840 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
1841 // Optimization ideas, if we find this function is hot:
1842 // * We could speculatively create a CStringBuffer to contain 'length'
1843 // characters, and resize if necessary (i.e. if the buffer contains
1844 // non-ascii characters). (Alternatively, scan the buffer first for
1845 // ascii characters, so we know this will be sufficient).
1846 // * We could allocate a CStringBuffer with an appropriate size to
1847 // have a good chance of being able to write the string into the
1848 // buffer without reallocing (say, 1.5 x length).
1849 if (length > MaxLength / 3)
1850 return makeUnexpected(UTF8ConversionError::OutOfMemory);
1851 Vector<char, 1024> bufferVector(length * 3);
1852
1853 char* buffer = bufferVector.data();
1854
1855 if (is8Bit()) {
1856 const LChar* characters = this->characters8() + offset;
1857 auto success = convertLatin1ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size());
1858 ASSERT_UNUSED(success, success); // (length * 3) should be sufficient for any conversion
1859 } else {
1860 UTF8ConversionError error = utf8Impl(this->characters16() + offset, length, buffer, bufferVector.size(), mode);
1861 if (error != UTF8ConversionError::None)
1862 return makeUnexpected(error);
1863 }
1864
1865 return CString(bufferVector.data(), buffer - bufferVector.data());
1866}
1867
1868Expected<CString, UTF8ConversionError> StringImpl::tryGetUtf8(ConversionMode mode) const
1869{
1870 return tryGetUtf8ForRange(0, length(), mode);
1871}
1872
1873CString StringImpl::utf8(ConversionMode mode) const
1874{
1875 auto expectedString = tryGetUtf8ForRange(0, length(), mode);
1876 RELEASE_ASSERT(expectedString);
1877 return expectedString.value();
1878}
1879
1880NEVER_INLINE unsigned StringImpl::hashSlowCase() const
1881{
1882 if (is8Bit())
1883 setHash(StringHasher::computeHashAndMaskTop8Bits(m_data8, m_length));
1884 else
1885 setHash(StringHasher::computeHashAndMaskTop8Bits(m_data16, m_length));
1886 return existingHash();
1887}
1888
1889unsigned StringImpl::concurrentHash() const
1890{
1891 unsigned hash;
1892 if (is8Bit())
1893 hash = StringHasher::computeHashAndMaskTop8Bits(m_data8, m_length);
1894 else
1895 hash = StringHasher::computeHashAndMaskTop8Bits(m_data16, m_length);
1896 ASSERT(((hash << s_flagCount) >> s_flagCount) == hash);
1897 return hash;
1898}
1899
1900bool equalIgnoringNullity(const UChar* a, size_t aLength, StringImpl* b)
1901{
1902 if (!b)
1903 return !aLength;
1904 if (aLength != b->length())
1905 return false;
1906 if (b->is8Bit()) {
1907 const LChar* bCharacters = b->characters8();
1908 for (unsigned i = 0; i < aLength; ++i) {
1909 if (a[i] != bCharacters[i])
1910 return false;
1911 }
1912 return true;
1913 }
1914 return !memcmp(a, b->characters16(), b->length() * sizeof(UChar));
1915}
1916
1917} // namespace WTF
1918