1/*
2 * Copyright (C) 2007-2019 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Patrick Gansterer <[email protected]>
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include <wtf/unicode/UTF8Conversion.h>
29
30#include <wtf/ASCIICType.h>
31#include <wtf/text/StringHasher.h>
32#include <wtf/unicode/CharacterNames.h>
33
34namespace WTF {
35namespace Unicode {
36
37bool convertLatin1ToUTF8(const LChar** sourceStart, const LChar* sourceEnd, char** targetStart, char* targetEnd)
38{
39 const LChar* source;
40 char* target = *targetStart;
41 int i = 0;
42 for (source = *sourceStart; source < sourceEnd; ++source) {
43 UBool sawError = false;
44 // Work around bug in either Windows compiler or old version of ICU, where passing a uint8_t to
45 // U8_APPEND warns, by converting from uint8_t to a wider type.
46 UChar32 character = *source;
47 U8_APPEND(reinterpret_cast<uint8_t*>(target), i, targetEnd - *targetStart, character, sawError);
48 if (sawError)
49 return false;
50 }
51 *sourceStart = source;
52 *targetStart = target + i;
53 return true;
54}
55
56ConversionResult convertUTF16ToUTF8(const UChar** sourceStart, const UChar* sourceEnd, char** targetStart, char* targetEnd, bool strict)
57{
58 ConversionResult result = ConversionOK;
59 const UChar* source = *sourceStart;
60 char* target = *targetStart;
61 UBool sawError = false;
62 int i = 0;
63 while (source < sourceEnd) {
64 UChar32 ch;
65 int j = 0;
66 U16_NEXT(source, j, sourceEnd - source, ch);
67 if (U_IS_SURROGATE(ch)) {
68 if (source + j == sourceEnd && U_IS_SURROGATE_LEAD(ch)) {
69 result = SourceExhausted;
70 break;
71 }
72 if (strict) {
73 result = SourceIllegal;
74 break;
75 }
76 ch = replacementCharacter;
77 }
78 U8_APPEND(reinterpret_cast<uint8_t*>(target), i, targetEnd - target, ch, sawError);
79 if (sawError) {
80 result = TargetExhausted;
81 break;
82 }
83 source += j;
84 }
85 *sourceStart = source;
86 *targetStart = target + i;
87 return result;
88}
89
90bool convertUTF8ToUTF16(const char* source, const char* sourceEnd, UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII)
91{
92 RELEASE_ASSERT(sourceEnd - source <= std::numeric_limits<int>::max());
93 UBool error = false;
94 UChar* target = *targetStart;
95 RELEASE_ASSERT(targetEnd - target <= std::numeric_limits<int>::max());
96 UChar32 orAllData = 0;
97 int targetOffset = 0;
98 for (int sourceOffset = 0; sourceOffset < sourceEnd - source; ) {
99 UChar32 character;
100 U8_NEXT(reinterpret_cast<const uint8_t*>(source), sourceOffset, sourceEnd - source, character);
101 if (character < 0)
102 return false;
103 U16_APPEND(target, targetOffset, targetEnd - target, character, error);
104 if (error)
105 return false;
106 orAllData |= character;
107 }
108 RELEASE_ASSERT(target + targetOffset <= targetEnd);
109 *targetStart = target + targetOffset;
110 if (sourceAllASCII)
111 *sourceAllASCII = isASCII(orAllData);
112 return true;
113}
114
115unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length)
116{
117 StringHasher stringHasher;
118 utf16Length = 0;
119
120 int inputOffset = 0;
121 int inputLength = dataEnd - data;
122 while (inputOffset < inputLength) {
123 UChar32 character;
124 U8_NEXT(reinterpret_cast<const uint8_t*>(data), inputOffset, inputLength, character);
125 if (character < 0)
126 return 0;
127
128 if (U_IS_BMP(character)) {
129 ASSERT(!U_IS_SURROGATE(character));
130 stringHasher.addCharacter(character);
131 utf16Length++;
132 } else {
133 ASSERT(U_IS_SUPPLEMENTARY(character));
134 stringHasher.addCharacters(U16_LEAD(character), U16_TRAIL(character));
135 utf16Length += 2;
136 }
137 }
138
139 dataLength = inputOffset;
140 return stringHasher.hashWithTop8BitsMasked();
141}
142
143bool equalUTF16WithUTF8(const UChar* a, const char* b, const char* bEnd)
144{
145 while (b < bEnd) {
146 int offset = 0;
147 UChar32 character;
148 U8_NEXT(reinterpret_cast<const uint8_t*>(b), offset, bEnd - b, character);
149 if (character < 0)
150 return false;
151 b += offset;
152
153 if (U_IS_BMP(character)) {
154 ASSERT(!U_IS_SURROGATE(character));
155 if (*a++ != character)
156 return false;
157 } else {
158 ASSERT(U_IS_SUPPLEMENTARY(character));
159 if (*a++ != U16_LEAD(character))
160 return false;
161 if (*a++ != U16_TRAIL(character))
162 return false;
163 }
164 }
165
166 return true;
167}
168
169bool equalLatin1WithUTF8(const LChar* a, const char* b, const char* bEnd)
170{
171 while (b < bEnd) {
172 if (isASCII(*a) || isASCII(*b)) {
173 if (*a++ != *b++)
174 return false;
175 continue;
176 }
177
178 if (b + 1 == bEnd)
179 return false;
180
181 if ((b[0] & 0xE0) != 0xC0 || (b[1] & 0xC0) != 0x80)
182 return false;
183
184 LChar character = ((b[0] & 0x1F) << 6) | (b[1] & 0x3F);
185
186 b += 2;
187
188 if (*a++ != character)
189 return false;
190 }
191
192 return true;
193}
194
195} // namespace Unicode
196} // namespace WTF
197