1 | /* |
2 | * Copyright (C) 2005-2017 Apple Inc. All rights reserved. |
3 | * Copyright (C) 2010 Patrick Gansterer <[email protected]> |
4 | * |
5 | * This library is free software; you can redistribute it and/or |
6 | * modify it under the terms of the GNU Library General Public |
7 | * License as published by the Free Software Foundation; either |
8 | * version 2 of the License, or (at your option) any later version. |
9 | * |
10 | * This library is distributed in the hope that it will be useful, |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | * Library General Public License for more details. |
14 | * |
15 | * You should have received a copy of the GNU Library General Public License |
16 | * along with this library; see the file COPYING.LIB. If not, write to |
17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
18 | * Boston, MA 02110-1301, USA. |
19 | * |
20 | */ |
21 | |
22 | #pragma once |
23 | |
24 | #include <unicode/utypes.h> |
25 | #include <wtf/text/LChar.h> |
26 | |
27 | namespace WTF { |
28 | |
29 | // Paul Hsieh's SuperFastHash |
30 | // http://www.azillionmonkeys.com/qed/hash.html |
31 | |
32 | // LChar data is interpreted as Latin-1-encoded (zero-extended to 16 bits). |
33 | |
34 | // NOTE: The hash computation here must stay in sync with the create_hash_table script in |
35 | // JavaScriptCore and the CodeGeneratorJS.pm script in WebCore. |
36 | |
37 | // Golden ratio. Arbitrary start value to avoid mapping all zeros to a hash value of zero. |
38 | static constexpr const unsigned stringHashingStartValue = 0x9E3779B9U; |
39 | |
40 | class StringHasher { |
41 | public: |
42 | static constexpr const unsigned flagCount = 8; // Save 8 bits for StringImpl to use as flags. |
43 | static constexpr const unsigned maskHash = (1U << (sizeof(unsigned) * 8 - flagCount)) - 1; |
44 | |
45 | struct DefaultConverter { |
46 | template<typename CharType> |
47 | static constexpr UChar convert(CharType character) |
48 | { |
49 | return static_cast<std::make_unsigned_t<CharType>>((character)); |
50 | } |
51 | }; |
52 | |
53 | StringHasher() = default; |
54 | |
55 | // The hasher hashes two characters at a time, and thus an "aligned" hasher is one |
56 | // where an even number of characters have been added. Callers that always add |
57 | // characters two at a time can use the "assuming aligned" functions. |
58 | void addCharactersAssumingAligned(UChar a, UChar b) |
59 | { |
60 | ASSERT(!m_hasPendingCharacter); |
61 | m_hash = calculateWithTwoCharacters(m_hash, a, b); |
62 | } |
63 | |
64 | void addCharacter(UChar character) |
65 | { |
66 | if (m_hasPendingCharacter) { |
67 | m_hasPendingCharacter = false; |
68 | addCharactersAssumingAligned(m_pendingCharacter, character); |
69 | return; |
70 | } |
71 | |
72 | m_pendingCharacter = character; |
73 | m_hasPendingCharacter = true; |
74 | } |
75 | |
76 | void addCharacters(UChar a, UChar b) |
77 | { |
78 | if (m_hasPendingCharacter) { |
79 | #if !ASSERT_DISABLED |
80 | m_hasPendingCharacter = false; |
81 | #endif |
82 | addCharactersAssumingAligned(m_pendingCharacter, a); |
83 | m_pendingCharacter = b; |
84 | #if !ASSERT_DISABLED |
85 | m_hasPendingCharacter = true; |
86 | #endif |
87 | return; |
88 | } |
89 | |
90 | addCharactersAssumingAligned(a, b); |
91 | } |
92 | |
93 | template<typename T, typename Converter> void addCharactersAssumingAligned(const T* data, unsigned length) |
94 | { |
95 | ASSERT(!m_hasPendingCharacter); |
96 | |
97 | bool remainder = length & 1; |
98 | length >>= 1; |
99 | |
100 | while (length--) { |
101 | addCharactersAssumingAligned(Converter::convert(data[0]), Converter::convert(data[1])); |
102 | data += 2; |
103 | } |
104 | |
105 | if (remainder) |
106 | addCharacter(Converter::convert(*data)); |
107 | } |
108 | |
109 | template<typename T> void addCharactersAssumingAligned(const T* data, unsigned length) |
110 | { |
111 | addCharactersAssumingAligned<T, DefaultConverter>(data, length); |
112 | } |
113 | |
114 | template<typename T, typename Converter> void addCharactersAssumingAligned(const T* data) |
115 | { |
116 | ASSERT(!m_hasPendingCharacter); |
117 | |
118 | while (T a = *data++) { |
119 | T b = *data++; |
120 | if (!b) { |
121 | addCharacter(Converter::convert(a)); |
122 | break; |
123 | } |
124 | addCharactersAssumingAligned(Converter::convert(a), Converter::convert(b)); |
125 | } |
126 | } |
127 | |
128 | template<typename T> void addCharactersAssumingAligned(const T* data) |
129 | { |
130 | addCharactersAssumingAligned<T, DefaultConverter>(data); |
131 | } |
132 | |
133 | template<typename T, typename Converter> void addCharacters(const T* data, unsigned length) |
134 | { |
135 | if (!length) |
136 | return; |
137 | if (m_hasPendingCharacter) { |
138 | m_hasPendingCharacter = false; |
139 | addCharactersAssumingAligned(m_pendingCharacter, Converter::convert(*data++)); |
140 | --length; |
141 | } |
142 | addCharactersAssumingAligned<T, Converter>(data, length); |
143 | } |
144 | |
145 | template<typename T> void addCharacters(const T* data, unsigned length) |
146 | { |
147 | addCharacters<T, DefaultConverter>(data, length); |
148 | } |
149 | |
150 | template<typename T, typename Converter> void addCharacters(const T* data) |
151 | { |
152 | if (m_hasPendingCharacter && *data) { |
153 | m_hasPendingCharacter = false; |
154 | addCharactersAssumingAligned(m_pendingCharacter, Converter::convert(*data++)); |
155 | } |
156 | addCharactersAssumingAligned<T, Converter>(data); |
157 | } |
158 | |
159 | template<typename T> void addCharacters(const T* data) |
160 | { |
161 | addCharacters<T, DefaultConverter>(data); |
162 | } |
163 | |
164 | unsigned hashWithTop8BitsMasked() const |
165 | { |
166 | return finalizeAndMaskTop8Bits(processPendingCharacter()); |
167 | } |
168 | |
169 | unsigned hash() const |
170 | { |
171 | return finalize(processPendingCharacter()); |
172 | } |
173 | |
174 | template<typename T, typename Converter> static constexpr unsigned computeHashAndMaskTop8Bits(const T* data, unsigned length) |
175 | { |
176 | return finalizeAndMaskTop8Bits(computeHashImpl<T, Converter>(data, length)); |
177 | } |
178 | |
179 | template<typename T, typename Converter> static constexpr unsigned computeHashAndMaskTop8Bits(const T* data) |
180 | { |
181 | return finalizeAndMaskTop8Bits(computeHashImpl<T, Converter>(data)); |
182 | } |
183 | |
184 | template<typename T> static constexpr unsigned computeHashAndMaskTop8Bits(const T* data, unsigned length) |
185 | { |
186 | return computeHashAndMaskTop8Bits<T, DefaultConverter>(data, length); |
187 | } |
188 | |
189 | template<typename T> static constexpr unsigned computeHashAndMaskTop8Bits(const T* data) |
190 | { |
191 | return computeHashAndMaskTop8Bits<T, DefaultConverter>(data); |
192 | } |
193 | |
194 | template<typename T, typename Converter> static constexpr unsigned computeHash(const T* data, unsigned length) |
195 | { |
196 | return finalize(computeHashImpl<T, Converter>(data, length)); |
197 | } |
198 | |
199 | template<typename T, typename Converter> static constexpr unsigned computeHash(const T* data) |
200 | { |
201 | return finalize(computeHashImpl<T, Converter>(data)); |
202 | } |
203 | |
204 | template<typename T> static constexpr unsigned computeHash(const T* data, unsigned length) |
205 | { |
206 | return computeHash<T, DefaultConverter>(data, length); |
207 | } |
208 | |
209 | template<typename T> static constexpr unsigned computeHash(const T* data) |
210 | { |
211 | return computeHash<T, DefaultConverter>(data); |
212 | } |
213 | |
214 | |
215 | template<typename T, unsigned charactersCount> |
216 | static constexpr unsigned computeLiteralHash(const T (&characters)[charactersCount]) |
217 | { |
218 | return computeHash<T, DefaultConverter>(characters, charactersCount - 1); |
219 | } |
220 | |
221 | template<typename T, unsigned charactersCount> |
222 | static constexpr unsigned computeLiteralHashAndMaskTop8Bits(const T (&characters)[charactersCount]) |
223 | { |
224 | return computeHashAndMaskTop8Bits<T, DefaultConverter>(characters, charactersCount - 1); |
225 | } |
226 | |
227 | static unsigned hashMemory(const void* data, unsigned length) |
228 | { |
229 | size_t lengthInUChar = length / sizeof(UChar); |
230 | StringHasher hasher; |
231 | hasher.addCharactersAssumingAligned(static_cast<const UChar*>(data), lengthInUChar); |
232 | |
233 | for (size_t i = 0; i < length % sizeof(UChar); ++i) |
234 | hasher.addCharacter(static_cast<const char*>(data)[lengthInUChar * sizeof(UChar) + i]); |
235 | |
236 | return hasher.hash(); |
237 | } |
238 | |
239 | template<size_t length> static unsigned hashMemory(const void* data) |
240 | { |
241 | return hashMemory(data, length); |
242 | } |
243 | |
244 | private: |
245 | ALWAYS_INLINE static constexpr unsigned avalancheBits(unsigned hash) |
246 | { |
247 | unsigned result = hash; |
248 | |
249 | result ^= result << 3; |
250 | result += result >> 5; |
251 | result ^= result << 2; |
252 | result += result >> 15; |
253 | result ^= result << 10; |
254 | |
255 | return result; |
256 | } |
257 | |
258 | static constexpr unsigned finalize(unsigned hash) |
259 | { |
260 | return avoidZero(avalancheBits(hash)); |
261 | } |
262 | |
263 | static constexpr unsigned finalizeAndMaskTop8Bits(unsigned hash) |
264 | { |
265 | // Reserving space from the high bits for flags preserves most of the hash's |
266 | // value, since hash lookup typically masks out the high bits anyway. |
267 | return avoidZero(avalancheBits(hash) & StringHasher::maskHash); |
268 | } |
269 | |
270 | // This avoids ever returning a hash code of 0, since that is used to |
271 | // signal "hash not computed yet". Setting the high bit maintains |
272 | // reasonable fidelity to a hash code of 0 because it is likely to yield |
273 | // exactly 0 when hash lookup masks out the high bits. |
274 | ALWAYS_INLINE static constexpr unsigned avoidZero(unsigned hash) |
275 | { |
276 | if (hash) |
277 | return hash; |
278 | return 0x80000000 >> flagCount; |
279 | } |
280 | |
281 | static constexpr unsigned calculateWithRemainingLastCharacter(unsigned hash, unsigned character) |
282 | { |
283 | unsigned result = hash; |
284 | |
285 | result += character; |
286 | result ^= result << 11; |
287 | result += result >> 17; |
288 | |
289 | return result; |
290 | } |
291 | |
292 | ALWAYS_INLINE static constexpr unsigned calculateWithTwoCharacters(unsigned hash, unsigned firstCharacter, unsigned secondCharacter) |
293 | { |
294 | unsigned result = hash; |
295 | |
296 | result += firstCharacter; |
297 | result = (result << 16) ^ ((secondCharacter << 11) ^ result); |
298 | result += result >> 11; |
299 | |
300 | return result; |
301 | } |
302 | |
303 | template<typename T, typename Converter> |
304 | static constexpr unsigned computeHashImpl(const T* characters, unsigned length) |
305 | { |
306 | unsigned result = stringHashingStartValue; |
307 | bool remainder = length & 1; |
308 | length >>= 1; |
309 | |
310 | while (length--) { |
311 | result = calculateWithTwoCharacters(result, Converter::convert(characters[0]), Converter::convert(characters[1])); |
312 | characters += 2; |
313 | } |
314 | |
315 | if (remainder) |
316 | return calculateWithRemainingLastCharacter(result, Converter::convert(characters[0])); |
317 | return result; |
318 | } |
319 | |
320 | template<typename T, typename Converter> |
321 | static constexpr unsigned computeHashImpl(const T* characters) |
322 | { |
323 | unsigned result = stringHashingStartValue; |
324 | while (T a = *characters++) { |
325 | T b = *characters++; |
326 | if (!b) |
327 | return calculateWithRemainingLastCharacter(result, Converter::convert(a)); |
328 | result = calculateWithTwoCharacters(result, Converter::convert(a), Converter::convert(b)); |
329 | } |
330 | return result; |
331 | } |
332 | |
333 | unsigned processPendingCharacter() const |
334 | { |
335 | unsigned result = m_hash; |
336 | |
337 | // Handle end case. |
338 | if (m_hasPendingCharacter) |
339 | return calculateWithRemainingLastCharacter(result, m_pendingCharacter); |
340 | return result; |
341 | } |
342 | |
343 | unsigned m_hash { stringHashingStartValue }; |
344 | UChar m_pendingCharacter { 0 }; |
345 | bool m_hasPendingCharacter { false }; |
346 | }; |
347 | |
348 | } // namespace WTF |
349 | |
350 | using WTF::StringHasher; |
351 | |