1/*
2 * Copyright (C) 2007-2019 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Patrick Gansterer <[email protected]>
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include <wtf/unicode/UTF8Conversion.h>
29
30#include <wtf/ASCIICType.h>
31#include <wtf/text/StringHasher.h>
32#include <wtf/unicode/CharacterNames.h>
33
34namespace WTF {
35namespace Unicode {
36
37bool convertLatin1ToUTF8(const LChar** sourceStart, const LChar* sourceEnd, char** targetStart, char* targetEnd)
38{
39 const LChar* source;
40 char* target = *targetStart;
41 int i = 0;
42 for (source = *sourceStart; source < sourceEnd; ++source) {
43 UBool sawError = false;
44 // Work around bug in either Windows compiler or old version of ICU, where passing a uint8_t to
45 // U8_APPEND warns, by converting from uint8_t to a wider type.
46 UChar32 character = *source;
47 U8_APPEND(reinterpret_cast<uint8_t*>(target), i, targetEnd - *targetStart, character, sawError);
48 if (sawError)
49 return false;
50 }
51 *sourceStart = source;
52 *targetStart = target + i;
53 return true;
54}
55
56ConversionResult convertUTF16ToUTF8(const UChar** sourceStart, const UChar* sourceEnd, char** targetStart, char* targetEnd, bool strict)
57{
58 ConversionResult result = ConversionOK;
59 const UChar* source = *sourceStart;
60 char* target = *targetStart;
61 UBool sawError = false;
62 int i = 0;
63 while (source < sourceEnd) {
64 UChar32 ch;
65 int j = 0;
66 U16_NEXT(source, j, sourceEnd - source, ch);
67 if (U_IS_SURROGATE(ch)) {
68 if (source + j == sourceEnd && U_IS_SURROGATE_LEAD(ch)) {
69 result = SourceExhausted;
70 break;
71 }
72 if (strict) {
73 result = SourceIllegal;
74 break;
75 }
76 ch = replacementCharacter;
77 }
78 U8_APPEND(reinterpret_cast<uint8_t*>(target), i, targetEnd - target, ch, sawError);
79 if (sawError) {
80 result = TargetExhausted;
81 break;
82 }
83 source += j;
84 }
85 *sourceStart = source;
86 *targetStart = target + i;
87 return result;
88}
89
90bool convertUTF8ToUTF16(const char* source, const char* sourceEnd, UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII)
91{
92 RELEASE_ASSERT(sourceEnd - source <= std::numeric_limits<int>::max());
93 UBool error = false;
94 UChar* target = *targetStart;
95 UChar32 orAllData = 0;
96 int targetOffset = 0;
97 for (int sourceOffset = 0; sourceOffset < sourceEnd - source; ) {
98 UChar32 character;
99 U8_NEXT(reinterpret_cast<const uint8_t*>(source), sourceOffset, sourceEnd - source, character);
100 if (character < 0)
101 return false;
102 U16_APPEND(target, targetOffset, targetEnd - target, character, error);
103 if (error)
104 return false;
105 orAllData |= character;
106 }
107 *targetStart = target + targetOffset;
108 if (sourceAllASCII)
109 *sourceAllASCII = isASCII(orAllData);
110 return true;
111}
112
113unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length)
114{
115 StringHasher stringHasher;
116 utf16Length = 0;
117
118 int inputOffset = 0;
119 int inputLength = dataEnd - data;
120 while (inputOffset < inputLength) {
121 UChar32 character;
122 U8_NEXT(reinterpret_cast<const uint8_t*>(data), inputOffset, inputLength, character);
123 if (character < 0)
124 return 0;
125
126 if (U_IS_BMP(character)) {
127 ASSERT(!U_IS_SURROGATE(character));
128 stringHasher.addCharacter(character);
129 utf16Length++;
130 } else {
131 ASSERT(U_IS_SUPPLEMENTARY(character));
132 stringHasher.addCharacters(U16_LEAD(character), U16_TRAIL(character));
133 utf16Length += 2;
134 }
135 }
136
137 dataLength = inputOffset;
138 return stringHasher.hashWithTop8BitsMasked();
139}
140
141bool equalUTF16WithUTF8(const UChar* a, const char* b, const char* bEnd)
142{
143 while (b < bEnd) {
144 int offset = 0;
145 UChar32 character;
146 U8_NEXT(reinterpret_cast<const uint8_t*>(b), offset, bEnd - b, character);
147 if (character < 0)
148 return false;
149 b += offset;
150
151 if (U_IS_BMP(character)) {
152 ASSERT(!U_IS_SURROGATE(character));
153 if (*a++ != character)
154 return false;
155 } else {
156 ASSERT(U_IS_SUPPLEMENTARY(character));
157 if (*a++ != U16_LEAD(character))
158 return false;
159 if (*a++ != U16_TRAIL(character))
160 return false;
161 }
162 }
163
164 return true;
165}
166
167bool equalLatin1WithUTF8(const LChar* a, const char* b, const char* bEnd)
168{
169 while (b < bEnd) {
170 if (isASCII(*a) || isASCII(*b)) {
171 if (*a++ != *b++)
172 return false;
173 continue;
174 }
175
176 if (b + 1 == bEnd)
177 return false;
178
179 if ((b[0] & 0xE0) != 0xC0 || (b[1] & 0xC0) != 0x80)
180 return false;
181
182 LChar character = ((b[0] & 0x1F) << 6) | (b[1] & 0x3F);
183
184 b += 2;
185
186 if (*a++ != character)
187 return false;
188 }
189
190 return true;
191}
192
193} // namespace Unicode
194} // namespace WTF
195