StringHasher.h source code [jsc/Source/WTF/wtf/text/StringHasher.h]

1	/*
2	* Copyright (C) 2005-2017 Apple Inc. All rights reserved.
3	* Copyright (C) 2010 Patrick Gansterer <[email protected]>
4	*
5	* This library is free software; you can redistribute it and/or
6	* modify it under the terms of the GNU Library General Public
7	* License as published by the Free Software Foundation; either
8	* version 2 of the License, or (at your option) any later version.
9	*
10	* This library is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	* Library General Public License for more details.
14	*
15	* You should have received a copy of the GNU Library General Public License
16	* along with this library; see the file COPYING.LIB. If not, write to
17	* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18	* Boston, MA 02110-1301, USA.
19	*
20	*/
21
22	#pragma once
23
24	#include <unicode/utypes.h>
25	#include <wtf/text/LChar.h>
26
27	namespace WTF {
28
29	// Paul Hsieh's SuperFastHash
30	// http://www.azillionmonkeys.com/qed/hash.html
31
32	// LChar data is interpreted as Latin-1-encoded (zero-extended to 16 bits).
33
34	// NOTE: The hash computation here must stay in sync with the create_hash_table script in
35	// JavaScriptCore and the CodeGeneratorJS.pm script in WebCore.
36
37	// Golden ratio. Arbitrary start value to avoid mapping all zeros to a hash value of zero.
38	static constexpr const unsigned stringHashingStartValue = `0x9E3779B9U`;
39
40	class StringHasher {
41	public:
42	static constexpr const unsigned flagCount = `8`; // Save 8 bits for StringImpl to use as flags.
43	static constexpr const unsigned maskHash = (`1U` << (sizeof(unsigned) * `8` - flagCount)) - `1`;
44
45	struct DefaultConverter {
46	template<typename CharType>
47	static constexpr UChar convert(CharType character)
48	{
49	return static_cast<std::make_unsigned_t<CharType>>((character));
50	}
51	};
52
53	StringHasher() = default;
54
55	// The hasher hashes two characters at a time, and thus an "aligned" hasher is one
56	// where an even number of characters have been added. Callers that always add
57	// characters two at a time can use the "assuming aligned" functions.
58	void addCharactersAssumingAligned(UChar a, UChar b)
59	{
60	ASSERT(!m_hasPendingCharacter);
61	m_hash = calculateWithTwoCharacters(m_hash, a, b);
62	}
63
64	void addCharacter(UChar character)
65	{
66	if (m_hasPendingCharacter) {
67	m_hasPendingCharacter = false;
68	addCharactersAssumingAligned(m_pendingCharacter, character);
69	return;
70	}
71
72	m_pendingCharacter = character;
73	m_hasPendingCharacter = true;
74	}
75
76	void addCharacters(UChar a, UChar b)
77	{
78	if (m_hasPendingCharacter) {
79	#if !ASSERT_DISABLED
80	m_hasPendingCharacter = false;
81	#endif
82	addCharactersAssumingAligned(m_pendingCharacter, a);
83	m_pendingCharacter = b;
84	#if !ASSERT_DISABLED
85	m_hasPendingCharacter = true;
86	#endif
87	return;
88	}
89
90	addCharactersAssumingAligned(a, b);
91	}
92
93	template<typename T, typename Converter> void addCharactersAssumingAligned(const T* data, unsigned length)
94	{
95	ASSERT(!m_hasPendingCharacter);
96
97	bool remainder = length & `1`;
98	length >>= `1`;
99
100	while (length--) {
101	addCharactersAssumingAligned(Converter::convert(data[`0`]), Converter::convert(data[`1`]));
102	data += `2`;
103	}
104
105	if (remainder)
106	addCharacter(Converter::convert(*data));
107	}
108
109	template<typename T> void addCharactersAssumingAligned(const T* data, unsigned length)
110	{
111	addCharactersAssumingAligned<T, DefaultConverter>(data, length);
112	}
113
114	template<typename T, typename Converter> void addCharactersAssumingAligned(const T* data)
115	{
116	ASSERT(!m_hasPendingCharacter);
117
118	while (T a = *data++) {
119	T b = *data++;
120	if (!b) {
121	addCharacter(Converter::convert(a));
122	break;
123	}
124	addCharactersAssumingAligned(Converter::convert(a), Converter::convert(b));
125	}
126	}
127
128	template<typename T> void addCharactersAssumingAligned(const T* data)
129	{
130	addCharactersAssumingAligned<T, DefaultConverter>(data);
131	}
132
133	template<typename T, typename Converter> void addCharacters(const T* data, unsigned length)
134	{
135	if (!length)
136	return;
137	if (m_hasPendingCharacter) {
138	m_hasPendingCharacter = false;
139	addCharactersAssumingAligned(m_pendingCharacter, Converter::convert(*data++));
140	--length;
141	}
142	addCharactersAssumingAligned<T, Converter>(data, length);
143	}
144
145	template<typename T> void addCharacters(const T* data, unsigned length)
146	{
147	addCharacters<T, DefaultConverter>(data, length);
148	}
149
150	template<typename T, typename Converter> void addCharacters(const T* data)
151	{
152	if (m_hasPendingCharacter && *data) {
153	m_hasPendingCharacter = false;
154	addCharactersAssumingAligned(m_pendingCharacter, Converter::convert(*data++));
155	}
156	addCharactersAssumingAligned<T, Converter>(data);
157	}
158
159	template<typename T> void addCharacters(const T* data)
160	{
161	addCharacters<T, DefaultConverter>(data);
162	}
163
164	unsigned hashWithTop8BitsMasked() const
165	{
166	return finalizeAndMaskTop8Bits(processPendingCharacter());
167	}
168
169	unsigned hash() const
170	{
171	return finalize(processPendingCharacter());
172	}
173
174	template<typename T, typename Converter> static constexpr unsigned computeHashAndMaskTop8Bits(const T* data, unsigned length)
175	{
176	return finalizeAndMaskTop8Bits(computeHashImpl<T, Converter>(data, length));
177	}
178
179	template<typename T, typename Converter> static constexpr unsigned computeHashAndMaskTop8Bits(const T* data)
180	{
181	return finalizeAndMaskTop8Bits(computeHashImpl<T, Converter>(data));
182	}
183
184	template<typename T> static constexpr unsigned computeHashAndMaskTop8Bits(const T* data, unsigned length)
185	{
186	return computeHashAndMaskTop8Bits<T, DefaultConverter>(data, length);
187	}
188
189	template<typename T> static constexpr unsigned computeHashAndMaskTop8Bits(const T* data)
190	{
191	return computeHashAndMaskTop8Bits<T, DefaultConverter>(data);
192	}
193
194	template<typename T, typename Converter> static constexpr unsigned computeHash(const T* data, unsigned length)
195	{
196	return finalize(computeHashImpl<T, Converter>(data, length));
197	}
198
199	template<typename T, typename Converter> static constexpr unsigned computeHash(const T* data)
200	{
201	return finalize(computeHashImpl<T, Converter>(data));
202	}
203
204	template<typename T> static constexpr unsigned computeHash(const T* data, unsigned length)
205	{
206	return computeHash<T, DefaultConverter>(data, length);
207	}
208
209	template<typename T> static constexpr unsigned computeHash(const T* data)
210	{
211	return computeHash<T, DefaultConverter>(data);
212	}
213
214
215	template<typename T, unsigned charactersCount>
216	static constexpr unsigned computeLiteralHash(const T (&characters)[charactersCount])
217	{
218	return computeHash<T, DefaultConverter>(characters, charactersCount - `1`);
219	}
220
221	template<typename T, unsigned charactersCount>
222	static constexpr unsigned computeLiteralHashAndMaskTop8Bits(const T (&characters)[charactersCount])
223	{
224	return computeHashAndMaskTop8Bits<T, DefaultConverter>(characters, charactersCount - `1`);
225	}
226
227	static unsigned hashMemory(const void* data, unsigned length)
228	{
229	size_t lengthInUChar = length / sizeof(UChar);
230	StringHasher hasher;
231	hasher.addCharactersAssumingAligned(static_cast<const UChar*>(data), lengthInUChar);
232
233	for (size_t i = `0`; i < length % sizeof(UChar); ++i)
234	hasher.addCharacter(static_cast<const char>(data)[lengthInUChar sizeof(UChar) + i]);
235
236	return hasher.hash();
237	}
238
239	template<size_t length> static unsigned hashMemory(const void* data)
240	{
241	return hashMemory(data, length);
242	}
243
244	private:
245	ALWAYS_INLINE static constexpr unsigned avalancheBits(unsigned hash)
246	{
247	unsigned result = hash;
248
249	result ^= result << `3`;
250	result += result >> `5`;
251	result ^= result << `2`;
252	result += result >> `15`;
253	result ^= result << `10`;
254
255	return result;
256	}
257
258	static constexpr unsigned finalize(unsigned hash)
259	{
260	return avoidZero(avalancheBits(hash));
261	}
262
263	static constexpr unsigned finalizeAndMaskTop8Bits(unsigned hash)
264	{
265	// Reserving space from the high bits for flags preserves most of the hash's
266	// value, since hash lookup typically masks out the high bits anyway.
267	return avoidZero(avalancheBits(hash) & StringHasher::maskHash);
268	}
269
270	// This avoids ever returning a hash code of 0, since that is used to
271	// signal "hash not computed yet". Setting the high bit maintains
272	// reasonable fidelity to a hash code of 0 because it is likely to yield
273	// exactly 0 when hash lookup masks out the high bits.
274	ALWAYS_INLINE static constexpr unsigned avoidZero(unsigned hash)
275	{
276	if (hash)
277	return hash;
278	return `0x80000000` >> flagCount;
279	}
280
281	static constexpr unsigned calculateWithRemainingLastCharacter(unsigned hash, unsigned character)
282	{
283	unsigned result = hash;
284
285	result += character;
286	result ^= result << `11`;
287	result += result >> `17`;
288
289	return result;
290	}
291
292	ALWAYS_INLINE static constexpr unsigned calculateWithTwoCharacters(unsigned hash, unsigned firstCharacter, unsigned secondCharacter)
293	{
294	unsigned result = hash;
295
296	result += firstCharacter;
297	result = (result << `16`) ^ ((secondCharacter << `11`) ^ result);
298	result += result >> `11`;
299
300	return result;
301	}
302
303	template<typename T, typename Converter>
304	static constexpr unsigned computeHashImpl(const T* characters, unsigned length)
305	{
306	unsigned result = stringHashingStartValue;
307	bool remainder = length & `1`;
308	length >>= `1`;
309
310	while (length--) {
311	result = calculateWithTwoCharacters(result, Converter::convert(characters[`0`]), Converter::convert(characters[`1`]));
312	characters += `2`;
313	}
314
315	if (remainder)
316	return calculateWithRemainingLastCharacter(result, Converter::convert(characters[`0`]));
317	return result;
318	}
319
320	template<typename T, typename Converter>
321	static constexpr unsigned computeHashImpl(const T* characters)
322	{
323	unsigned result = stringHashingStartValue;
324	while (T a = *characters++) {
325	T b = *characters++;
326	if (!b)
327	return calculateWithRemainingLastCharacter(result, Converter::convert(a));
328	result = calculateWithTwoCharacters(result, Converter::convert(a), Converter::convert(b));
329	}
330	return result;
331	}
332
333	unsigned processPendingCharacter() const
334	{
335	unsigned result = m_hash;
336
337	// Handle end case.
338	if (m_hasPendingCharacter)
339	return calculateWithRemainingLastCharacter(result, m_pendingCharacter);
340	return result;
341	}
342
343	unsigned m_hash { stringHashingStartValue };
344	UChar m_pendingCharacter { `0` };
345	bool m_hasPendingCharacter { false };
346	};
347
348	} // namespace WTF
349
350	using WTF::StringHasher;
351

Browse the source code of jsc/Source/WTF/wtf/text/StringHasher.h