1 | /* |
2 | * Copyright (C) 2007-2019 Apple Inc. All rights reserved. |
3 | * Copyright (C) 2010 Patrick Gansterer <[email protected]> |
4 | * |
5 | * Redistribution and use in source and binary forms, with or without |
6 | * modification, are permitted provided that the following conditions |
7 | * are met: |
8 | * 1. Redistributions of source code must retain the above copyright |
9 | * notice, this list of conditions and the following disclaimer. |
10 | * 2. Redistributions in binary form must reproduce the above copyright |
11 | * notice, this list of conditions and the following disclaimer in the |
12 | * documentation and/or other materials provided with the distribution. |
13 | * |
14 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
15 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
16 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
17 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
18 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
19 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
20 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
21 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
22 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
24 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
25 | */ |
26 | |
27 | #include "config.h" |
28 | #include <wtf/unicode/UTF8Conversion.h> |
29 | |
30 | #include <wtf/ASCIICType.h> |
31 | #include <wtf/text/StringHasher.h> |
32 | #include <wtf/unicode/CharacterNames.h> |
33 | |
34 | namespace WTF { |
35 | namespace Unicode { |
36 | |
37 | bool convertLatin1ToUTF8(const LChar** sourceStart, const LChar* sourceEnd, char** targetStart, char* targetEnd) |
38 | { |
39 | const LChar* source; |
40 | char* target = *targetStart; |
41 | int i = 0; |
42 | for (source = *sourceStart; source < sourceEnd; ++source) { |
43 | UBool sawError = false; |
44 | // Work around bug in either Windows compiler or old version of ICU, where passing a uint8_t to |
45 | // U8_APPEND warns, by converting from uint8_t to a wider type. |
46 | UChar32 character = *source; |
47 | U8_APPEND(reinterpret_cast<uint8_t*>(target), i, targetEnd - *targetStart, character, sawError); |
48 | if (sawError) |
49 | return false; |
50 | } |
51 | *sourceStart = source; |
52 | *targetStart = target + i; |
53 | return true; |
54 | } |
55 | |
56 | ConversionResult convertUTF16ToUTF8(const UChar** sourceStart, const UChar* sourceEnd, char** targetStart, char* targetEnd, bool strict) |
57 | { |
58 | ConversionResult result = ConversionOK; |
59 | const UChar* source = *sourceStart; |
60 | char* target = *targetStart; |
61 | UBool sawError = false; |
62 | int i = 0; |
63 | while (source < sourceEnd) { |
64 | UChar32 ch; |
65 | int j = 0; |
66 | U16_NEXT(source, j, sourceEnd - source, ch); |
67 | if (U_IS_SURROGATE(ch)) { |
68 | if (source + j == sourceEnd && U_IS_SURROGATE_LEAD(ch)) { |
69 | result = SourceExhausted; |
70 | break; |
71 | } |
72 | if (strict) { |
73 | result = SourceIllegal; |
74 | break; |
75 | } |
76 | ch = replacementCharacter; |
77 | } |
78 | U8_APPEND(reinterpret_cast<uint8_t*>(target), i, targetEnd - target, ch, sawError); |
79 | if (sawError) { |
80 | result = TargetExhausted; |
81 | break; |
82 | } |
83 | source += j; |
84 | } |
85 | *sourceStart = source; |
86 | *targetStart = target + i; |
87 | return result; |
88 | } |
89 | |
90 | bool convertUTF8ToUTF16(const char* source, const char* sourceEnd, UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII) |
91 | { |
92 | RELEASE_ASSERT(sourceEnd - source <= std::numeric_limits<int>::max()); |
93 | UBool error = false; |
94 | UChar* target = *targetStart; |
95 | RELEASE_ASSERT(targetEnd - target <= std::numeric_limits<int>::max()); |
96 | UChar32 orAllData = 0; |
97 | int targetOffset = 0; |
98 | for (int sourceOffset = 0; sourceOffset < sourceEnd - source; ) { |
99 | UChar32 character; |
100 | U8_NEXT(reinterpret_cast<const uint8_t*>(source), sourceOffset, sourceEnd - source, character); |
101 | if (character < 0) |
102 | return false; |
103 | U16_APPEND(target, targetOffset, targetEnd - target, character, error); |
104 | if (error) |
105 | return false; |
106 | orAllData |= character; |
107 | } |
108 | RELEASE_ASSERT(target + targetOffset <= targetEnd); |
109 | *targetStart = target + targetOffset; |
110 | if (sourceAllASCII) |
111 | *sourceAllASCII = isASCII(orAllData); |
112 | return true; |
113 | } |
114 | |
115 | unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length) |
116 | { |
117 | StringHasher stringHasher; |
118 | utf16Length = 0; |
119 | |
120 | int inputOffset = 0; |
121 | int inputLength = dataEnd - data; |
122 | while (inputOffset < inputLength) { |
123 | UChar32 character; |
124 | U8_NEXT(reinterpret_cast<const uint8_t*>(data), inputOffset, inputLength, character); |
125 | if (character < 0) |
126 | return 0; |
127 | |
128 | if (U_IS_BMP(character)) { |
129 | ASSERT(!U_IS_SURROGATE(character)); |
130 | stringHasher.addCharacter(character); |
131 | utf16Length++; |
132 | } else { |
133 | ASSERT(U_IS_SUPPLEMENTARY(character)); |
134 | stringHasher.addCharacters(U16_LEAD(character), U16_TRAIL(character)); |
135 | utf16Length += 2; |
136 | } |
137 | } |
138 | |
139 | dataLength = inputOffset; |
140 | return stringHasher.hashWithTop8BitsMasked(); |
141 | } |
142 | |
143 | bool equalUTF16WithUTF8(const UChar* a, const char* b, const char* bEnd) |
144 | { |
145 | while (b < bEnd) { |
146 | int offset = 0; |
147 | UChar32 character; |
148 | U8_NEXT(reinterpret_cast<const uint8_t*>(b), offset, bEnd - b, character); |
149 | if (character < 0) |
150 | return false; |
151 | b += offset; |
152 | |
153 | if (U_IS_BMP(character)) { |
154 | ASSERT(!U_IS_SURROGATE(character)); |
155 | if (*a++ != character) |
156 | return false; |
157 | } else { |
158 | ASSERT(U_IS_SUPPLEMENTARY(character)); |
159 | if (*a++ != U16_LEAD(character)) |
160 | return false; |
161 | if (*a++ != U16_TRAIL(character)) |
162 | return false; |
163 | } |
164 | } |
165 | |
166 | return true; |
167 | } |
168 | |
169 | bool equalLatin1WithUTF8(const LChar* a, const char* b, const char* bEnd) |
170 | { |
171 | while (b < bEnd) { |
172 | if (isASCII(*a) || isASCII(*b)) { |
173 | if (*a++ != *b++) |
174 | return false; |
175 | continue; |
176 | } |
177 | |
178 | if (b + 1 == bEnd) |
179 | return false; |
180 | |
181 | if ((b[0] & 0xE0) != 0xC0 || (b[1] & 0xC0) != 0x80) |
182 | return false; |
183 | |
184 | LChar character = ((b[0] & 0x1F) << 6) | (b[1] & 0x3F); |
185 | |
186 | b += 2; |
187 | |
188 | if (*a++ != character) |
189 | return false; |
190 | } |
191 | |
192 | return true; |
193 | } |
194 | |
195 | } // namespace Unicode |
196 | } // namespace WTF |
197 | |