1 | /* |
2 | * Copyright (C) 2007-2019 Apple Inc. All rights reserved. |
3 | * Copyright (C) 2010 Patrick Gansterer <[email protected]> |
4 | * |
5 | * Redistribution and use in source and binary forms, with or without |
6 | * modification, are permitted provided that the following conditions |
7 | * are met: |
8 | * 1. Redistributions of source code must retain the above copyright |
9 | * notice, this list of conditions and the following disclaimer. |
10 | * 2. Redistributions in binary form must reproduce the above copyright |
11 | * notice, this list of conditions and the following disclaimer in the |
12 | * documentation and/or other materials provided with the distribution. |
13 | * |
14 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
15 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
16 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
17 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
18 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
19 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
20 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
21 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
22 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
24 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
25 | */ |
26 | |
27 | #include "config.h" |
28 | #include <wtf/unicode/UTF8Conversion.h> |
29 | |
30 | #include <wtf/ASCIICType.h> |
31 | #include <wtf/text/StringHasher.h> |
32 | #include <wtf/unicode/CharacterNames.h> |
33 | |
34 | namespace WTF { |
35 | namespace Unicode { |
36 | |
37 | bool convertLatin1ToUTF8(const LChar** sourceStart, const LChar* sourceEnd, char** targetStart, char* targetEnd) |
38 | { |
39 | const LChar* source; |
40 | char* target = *targetStart; |
41 | int i = 0; |
42 | for (source = *sourceStart; source < sourceEnd; ++source) { |
43 | UBool sawError = false; |
44 | // Work around bug in either Windows compiler or old version of ICU, where passing a uint8_t to |
45 | // U8_APPEND warns, by converting from uint8_t to a wider type. |
46 | UChar32 character = *source; |
47 | U8_APPEND(reinterpret_cast<uint8_t*>(target), i, targetEnd - *targetStart, character, sawError); |
48 | if (sawError) |
49 | return false; |
50 | } |
51 | *sourceStart = source; |
52 | *targetStart = target + i; |
53 | return true; |
54 | } |
55 | |
56 | ConversionResult convertUTF16ToUTF8(const UChar** sourceStart, const UChar* sourceEnd, char** targetStart, char* targetEnd, bool strict) |
57 | { |
58 | ConversionResult result = ConversionOK; |
59 | const UChar* source = *sourceStart; |
60 | char* target = *targetStart; |
61 | UBool sawError = false; |
62 | int i = 0; |
63 | while (source < sourceEnd) { |
64 | UChar32 ch; |
65 | int j = 0; |
66 | U16_NEXT(source, j, sourceEnd - source, ch); |
67 | if (U_IS_SURROGATE(ch)) { |
68 | if (source + j == sourceEnd && U_IS_SURROGATE_LEAD(ch)) { |
69 | result = SourceExhausted; |
70 | break; |
71 | } |
72 | if (strict) { |
73 | result = SourceIllegal; |
74 | break; |
75 | } |
76 | ch = replacementCharacter; |
77 | } |
78 | U8_APPEND(reinterpret_cast<uint8_t*>(target), i, targetEnd - target, ch, sawError); |
79 | if (sawError) { |
80 | result = TargetExhausted; |
81 | break; |
82 | } |
83 | source += j; |
84 | } |
85 | *sourceStart = source; |
86 | *targetStart = target + i; |
87 | return result; |
88 | } |
89 | |
90 | bool convertUTF8ToUTF16(const char* source, const char* sourceEnd, UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII) |
91 | { |
92 | RELEASE_ASSERT(sourceEnd - source <= std::numeric_limits<int>::max()); |
93 | UBool error = false; |
94 | UChar* target = *targetStart; |
95 | UChar32 orAllData = 0; |
96 | int targetOffset = 0; |
97 | for (int sourceOffset = 0; sourceOffset < sourceEnd - source; ) { |
98 | UChar32 character; |
99 | U8_NEXT(reinterpret_cast<const uint8_t*>(source), sourceOffset, sourceEnd - source, character); |
100 | if (character < 0) |
101 | return false; |
102 | U16_APPEND(target, targetOffset, targetEnd - target, character, error); |
103 | if (error) |
104 | return false; |
105 | orAllData |= character; |
106 | } |
107 | *targetStart = target + targetOffset; |
108 | if (sourceAllASCII) |
109 | *sourceAllASCII = isASCII(orAllData); |
110 | return true; |
111 | } |
112 | |
113 | unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length) |
114 | { |
115 | StringHasher stringHasher; |
116 | utf16Length = 0; |
117 | |
118 | int inputOffset = 0; |
119 | int inputLength = dataEnd - data; |
120 | while (inputOffset < inputLength) { |
121 | UChar32 character; |
122 | U8_NEXT(reinterpret_cast<const uint8_t*>(data), inputOffset, inputLength, character); |
123 | if (character < 0) |
124 | return 0; |
125 | |
126 | if (U_IS_BMP(character)) { |
127 | ASSERT(!U_IS_SURROGATE(character)); |
128 | stringHasher.addCharacter(character); |
129 | utf16Length++; |
130 | } else { |
131 | ASSERT(U_IS_SUPPLEMENTARY(character)); |
132 | stringHasher.addCharacters(U16_LEAD(character), U16_TRAIL(character)); |
133 | utf16Length += 2; |
134 | } |
135 | } |
136 | |
137 | dataLength = inputOffset; |
138 | return stringHasher.hashWithTop8BitsMasked(); |
139 | } |
140 | |
141 | bool equalUTF16WithUTF8(const UChar* a, const char* b, const char* bEnd) |
142 | { |
143 | while (b < bEnd) { |
144 | int offset = 0; |
145 | UChar32 character; |
146 | U8_NEXT(reinterpret_cast<const uint8_t*>(b), offset, bEnd - b, character); |
147 | if (character < 0) |
148 | return false; |
149 | b += offset; |
150 | |
151 | if (U_IS_BMP(character)) { |
152 | ASSERT(!U_IS_SURROGATE(character)); |
153 | if (*a++ != character) |
154 | return false; |
155 | } else { |
156 | ASSERT(U_IS_SUPPLEMENTARY(character)); |
157 | if (*a++ != U16_LEAD(character)) |
158 | return false; |
159 | if (*a++ != U16_TRAIL(character)) |
160 | return false; |
161 | } |
162 | } |
163 | |
164 | return true; |
165 | } |
166 | |
167 | bool equalLatin1WithUTF8(const LChar* a, const char* b, const char* bEnd) |
168 | { |
169 | while (b < bEnd) { |
170 | if (isASCII(*a) || isASCII(*b)) { |
171 | if (*a++ != *b++) |
172 | return false; |
173 | continue; |
174 | } |
175 | |
176 | if (b + 1 == bEnd) |
177 | return false; |
178 | |
179 | if ((b[0] & 0xE0) != 0xC0 || (b[1] & 0xC0) != 0x80) |
180 | return false; |
181 | |
182 | LChar character = ((b[0] & 0x1F) << 6) | (b[1] & 0x3F); |
183 | |
184 | b += 2; |
185 | |
186 | if (*a++ != character) |
187 | return false; |
188 | } |
189 | |
190 | return true; |
191 | } |
192 | |
193 | } // namespace Unicode |
194 | } // namespace WTF |
195 | |