1 | /* |
2 | * Copyright (C) 2005-2017 Apple Inc. All rights reserved. |
3 | * Copyright (C) 2010 Patrick Gansterer <[email protected]> |
4 | * |
5 | * This library is free software; you can redistribute it and/or |
6 | * modify it under the terms of the GNU Library General Public |
7 | * License as published by the Free Software Foundation; either |
8 | * version 2 of the License, or (at your option) any later version. |
9 | * |
10 | * This library is distributed in the hope that it will be useful, |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | * Library General Public License for more details. |
14 | * |
15 | * You should have received a copy of the GNU Library General Public License |
16 | * along with this library; see the file COPYING.LIB. If not, write to |
17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
18 | * Boston, MA 02110-1301, USA. |
19 | * |
20 | */ |
21 | |
22 | #pragma once |
23 | |
24 | #include <unicode/utypes.h> |
25 | #include <wtf/text/LChar.h> |
26 | |
27 | namespace WTF { |
28 | |
29 | // Paul Hsieh's SuperFastHash |
30 | // http://www.azillionmonkeys.com/qed/hash.html |
31 | |
32 | // LChar data is interpreted as Latin-1-encoded (zero-extended to 16 bits). |
33 | |
34 | // NOTE: The hash computation here must stay in sync with the create_hash_table script in |
35 | // JavaScriptCore and the CodeGeneratorJS.pm script in WebCore. |
36 | |
37 | // Golden ratio. Arbitrary start value to avoid mapping all zeros to a hash value of zero. |
38 | static constexpr const unsigned stringHashingStartValue = 0x9E3779B9U; |
39 | |
40 | class StringHasher { |
41 | WTF_MAKE_FAST_ALLOCATED; |
42 | public: |
43 | static constexpr const unsigned flagCount = 8; // Save 8 bits for StringImpl to use as flags. |
44 | static constexpr const unsigned maskHash = (1U << (sizeof(unsigned) * 8 - flagCount)) - 1; |
45 | |
46 | struct DefaultConverter { |
47 | template<typename CharType> |
48 | static constexpr UChar convert(CharType character) |
49 | { |
50 | return static_cast<std::make_unsigned_t<CharType>>((character)); |
51 | } |
52 | }; |
53 | |
54 | StringHasher() = default; |
55 | |
56 | // The hasher hashes two characters at a time, and thus an "aligned" hasher is one |
57 | // where an even number of characters have been added. Callers that always add |
58 | // characters two at a time can use the "assuming aligned" functions. |
59 | void addCharactersAssumingAligned(UChar a, UChar b) |
60 | { |
61 | ASSERT(!m_hasPendingCharacter); |
62 | m_hash = calculateWithTwoCharacters(m_hash, a, b); |
63 | } |
64 | |
65 | void addCharacter(UChar character) |
66 | { |
67 | if (m_hasPendingCharacter) { |
68 | m_hasPendingCharacter = false; |
69 | addCharactersAssumingAligned(m_pendingCharacter, character); |
70 | return; |
71 | } |
72 | |
73 | m_pendingCharacter = character; |
74 | m_hasPendingCharacter = true; |
75 | } |
76 | |
77 | void addCharacters(UChar a, UChar b) |
78 | { |
79 | if (m_hasPendingCharacter) { |
80 | #if !ASSERT_DISABLED |
81 | m_hasPendingCharacter = false; |
82 | #endif |
83 | addCharactersAssumingAligned(m_pendingCharacter, a); |
84 | m_pendingCharacter = b; |
85 | #if !ASSERT_DISABLED |
86 | m_hasPendingCharacter = true; |
87 | #endif |
88 | return; |
89 | } |
90 | |
91 | addCharactersAssumingAligned(a, b); |
92 | } |
93 | |
94 | template<typename T, typename Converter> void addCharactersAssumingAligned(const T* data, unsigned length) |
95 | { |
96 | ASSERT(!m_hasPendingCharacter); |
97 | |
98 | bool remainder = length & 1; |
99 | length >>= 1; |
100 | |
101 | while (length--) { |
102 | addCharactersAssumingAligned(Converter::convert(data[0]), Converter::convert(data[1])); |
103 | data += 2; |
104 | } |
105 | |
106 | if (remainder) |
107 | addCharacter(Converter::convert(*data)); |
108 | } |
109 | |
110 | template<typename T> void addCharactersAssumingAligned(const T* data, unsigned length) |
111 | { |
112 | addCharactersAssumingAligned<T, DefaultConverter>(data, length); |
113 | } |
114 | |
115 | template<typename T, typename Converter> void addCharactersAssumingAligned(const T* data) |
116 | { |
117 | ASSERT(!m_hasPendingCharacter); |
118 | |
119 | while (T a = *data++) { |
120 | T b = *data++; |
121 | if (!b) { |
122 | addCharacter(Converter::convert(a)); |
123 | break; |
124 | } |
125 | addCharactersAssumingAligned(Converter::convert(a), Converter::convert(b)); |
126 | } |
127 | } |
128 | |
129 | template<typename T> void addCharactersAssumingAligned(const T* data) |
130 | { |
131 | addCharactersAssumingAligned<T, DefaultConverter>(data); |
132 | } |
133 | |
134 | template<typename T, typename Converter> void addCharacters(const T* data, unsigned length) |
135 | { |
136 | if (!length) |
137 | return; |
138 | if (m_hasPendingCharacter) { |
139 | m_hasPendingCharacter = false; |
140 | addCharactersAssumingAligned(m_pendingCharacter, Converter::convert(*data++)); |
141 | --length; |
142 | } |
143 | addCharactersAssumingAligned<T, Converter>(data, length); |
144 | } |
145 | |
146 | template<typename T> void addCharacters(const T* data, unsigned length) |
147 | { |
148 | addCharacters<T, DefaultConverter>(data, length); |
149 | } |
150 | |
151 | template<typename T, typename Converter> void addCharacters(const T* data) |
152 | { |
153 | if (m_hasPendingCharacter && *data) { |
154 | m_hasPendingCharacter = false; |
155 | addCharactersAssumingAligned(m_pendingCharacter, Converter::convert(*data++)); |
156 | } |
157 | addCharactersAssumingAligned<T, Converter>(data); |
158 | } |
159 | |
160 | template<typename T> void addCharacters(const T* data) |
161 | { |
162 | addCharacters<T, DefaultConverter>(data); |
163 | } |
164 | |
165 | unsigned hashWithTop8BitsMasked() const |
166 | { |
167 | return finalizeAndMaskTop8Bits(processPendingCharacter()); |
168 | } |
169 | |
170 | unsigned hash() const |
171 | { |
172 | return finalize(processPendingCharacter()); |
173 | } |
174 | |
175 | template<typename T, typename Converter> static constexpr unsigned computeHashAndMaskTop8Bits(const T* data, unsigned length) |
176 | { |
177 | return finalizeAndMaskTop8Bits(computeHashImpl<T, Converter>(data, length)); |
178 | } |
179 | |
180 | template<typename T, typename Converter> static constexpr unsigned computeHashAndMaskTop8Bits(const T* data) |
181 | { |
182 | return finalizeAndMaskTop8Bits(computeHashImpl<T, Converter>(data)); |
183 | } |
184 | |
185 | template<typename T> static constexpr unsigned computeHashAndMaskTop8Bits(const T* data, unsigned length) |
186 | { |
187 | return computeHashAndMaskTop8Bits<T, DefaultConverter>(data, length); |
188 | } |
189 | |
190 | template<typename T> static constexpr unsigned computeHashAndMaskTop8Bits(const T* data) |
191 | { |
192 | return computeHashAndMaskTop8Bits<T, DefaultConverter>(data); |
193 | } |
194 | |
195 | template<typename T, typename Converter> static constexpr unsigned computeHash(const T* data, unsigned length) |
196 | { |
197 | return finalize(computeHashImpl<T, Converter>(data, length)); |
198 | } |
199 | |
200 | template<typename T, typename Converter> static constexpr unsigned computeHash(const T* data) |
201 | { |
202 | return finalize(computeHashImpl<T, Converter>(data)); |
203 | } |
204 | |
205 | template<typename T> static constexpr unsigned computeHash(const T* data, unsigned length) |
206 | { |
207 | return computeHash<T, DefaultConverter>(data, length); |
208 | } |
209 | |
210 | template<typename T> static constexpr unsigned computeHash(const T* data) |
211 | { |
212 | return computeHash<T, DefaultConverter>(data); |
213 | } |
214 | |
215 | |
216 | template<typename T, unsigned charactersCount> |
217 | static constexpr unsigned computeLiteralHash(const T (&characters)[charactersCount]) |
218 | { |
219 | return computeHash<T, DefaultConverter>(characters, charactersCount - 1); |
220 | } |
221 | |
222 | template<typename T, unsigned charactersCount> |
223 | static constexpr unsigned computeLiteralHashAndMaskTop8Bits(const T (&characters)[charactersCount]) |
224 | { |
225 | return computeHashAndMaskTop8Bits<T, DefaultConverter>(characters, charactersCount - 1); |
226 | } |
227 | |
228 | static unsigned hashMemory(const void* data, unsigned length) |
229 | { |
230 | size_t lengthInUChar = length / sizeof(UChar); |
231 | StringHasher hasher; |
232 | hasher.addCharactersAssumingAligned(static_cast<const UChar*>(data), lengthInUChar); |
233 | |
234 | for (size_t i = 0; i < length % sizeof(UChar); ++i) |
235 | hasher.addCharacter(static_cast<const char*>(data)[lengthInUChar * sizeof(UChar) + i]); |
236 | |
237 | return hasher.hash(); |
238 | } |
239 | |
240 | template<size_t length> static unsigned hashMemory(const void* data) |
241 | { |
242 | return hashMemory(data, length); |
243 | } |
244 | |
245 | private: |
246 | ALWAYS_INLINE static constexpr unsigned avalancheBits(unsigned hash) |
247 | { |
248 | unsigned result = hash; |
249 | |
250 | result ^= result << 3; |
251 | result += result >> 5; |
252 | result ^= result << 2; |
253 | result += result >> 15; |
254 | result ^= result << 10; |
255 | |
256 | return result; |
257 | } |
258 | |
259 | static constexpr unsigned finalize(unsigned hash) |
260 | { |
261 | return avoidZero(avalancheBits(hash)); |
262 | } |
263 | |
264 | static constexpr unsigned finalizeAndMaskTop8Bits(unsigned hash) |
265 | { |
266 | // Reserving space from the high bits for flags preserves most of the hash's |
267 | // value, since hash lookup typically masks out the high bits anyway. |
268 | return avoidZero(avalancheBits(hash) & StringHasher::maskHash); |
269 | } |
270 | |
271 | // This avoids ever returning a hash code of 0, since that is used to |
272 | // signal "hash not computed yet". Setting the high bit maintains |
273 | // reasonable fidelity to a hash code of 0 because it is likely to yield |
274 | // exactly 0 when hash lookup masks out the high bits. |
275 | ALWAYS_INLINE static constexpr unsigned avoidZero(unsigned hash) |
276 | { |
277 | if (hash) |
278 | return hash; |
279 | return 0x80000000 >> flagCount; |
280 | } |
281 | |
282 | static constexpr unsigned calculateWithRemainingLastCharacter(unsigned hash, unsigned character) |
283 | { |
284 | unsigned result = hash; |
285 | |
286 | result += character; |
287 | result ^= result << 11; |
288 | result += result >> 17; |
289 | |
290 | return result; |
291 | } |
292 | |
293 | ALWAYS_INLINE static constexpr unsigned calculateWithTwoCharacters(unsigned hash, unsigned firstCharacter, unsigned secondCharacter) |
294 | { |
295 | unsigned result = hash; |
296 | |
297 | result += firstCharacter; |
298 | result = (result << 16) ^ ((secondCharacter << 11) ^ result); |
299 | result += result >> 11; |
300 | |
301 | return result; |
302 | } |
303 | |
304 | template<typename T, typename Converter> |
305 | static constexpr unsigned computeHashImpl(const T* characters, unsigned length) |
306 | { |
307 | unsigned result = stringHashingStartValue; |
308 | bool remainder = length & 1; |
309 | length >>= 1; |
310 | |
311 | while (length--) { |
312 | result = calculateWithTwoCharacters(result, Converter::convert(characters[0]), Converter::convert(characters[1])); |
313 | characters += 2; |
314 | } |
315 | |
316 | if (remainder) |
317 | return calculateWithRemainingLastCharacter(result, Converter::convert(characters[0])); |
318 | return result; |
319 | } |
320 | |
321 | template<typename T, typename Converter> |
322 | static constexpr unsigned computeHashImpl(const T* characters) |
323 | { |
324 | unsigned result = stringHashingStartValue; |
325 | while (T a = *characters++) { |
326 | T b = *characters++; |
327 | if (!b) |
328 | return calculateWithRemainingLastCharacter(result, Converter::convert(a)); |
329 | result = calculateWithTwoCharacters(result, Converter::convert(a), Converter::convert(b)); |
330 | } |
331 | return result; |
332 | } |
333 | |
334 | unsigned processPendingCharacter() const |
335 | { |
336 | unsigned result = m_hash; |
337 | |
338 | // Handle end case. |
339 | if (m_hasPendingCharacter) |
340 | return calculateWithRemainingLastCharacter(result, m_pendingCharacter); |
341 | return result; |
342 | } |
343 | |
344 | unsigned m_hash { stringHashingStartValue }; |
345 | UChar m_pendingCharacter { 0 }; |
346 | bool m_hasPendingCharacter { false }; |
347 | }; |
348 | |
349 | } // namespace WTF |
350 | |
351 | using WTF::StringHasher; |
352 | |