1/*
2 * Copyright (C) 2013-2019 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#include "config.h"
27#include "BinarySwitch.h"
28
29#if ENABLE(JIT)
30
31#include "JSCInlines.h"
32#include <wtf/ListDump.h>
33
34namespace JSC {
35
namespace BinarySwitchInternal {
// When true, dataLogs the original/sorted case lists and every BranchCode
// appended by build(), for debugging the generated branch program.
static constexpr bool verbose = false;
}

static unsigned globalCounter; // We use a different seed every time we are invoked.
41
42BinarySwitch::BinarySwitch(GPRReg value, const Vector<int64_t>& cases, Type type)
43 : m_type(type)
44 , m_value(value)
45 , m_weakRandom(globalCounter++)
46 , m_index(0)
47 , m_caseIndex(UINT_MAX)
48{
49 if (cases.isEmpty())
50 return;
51
52 if (BinarySwitchInternal::verbose)
53 dataLog("Original cases: ", listDump(cases), "\n");
54
55 for (unsigned i = 0; i < cases.size(); ++i)
56 m_cases.append(Case(cases[i], i));
57
58 std::sort(m_cases.begin(), m_cases.end());
59
60 if (BinarySwitchInternal::verbose)
61 dataLog("Sorted cases: ", listDump(m_cases), "\n");
62
63#if !ASSERT_DISABLED
64 for (unsigned i = 1; i < m_cases.size(); ++i)
65 ASSERT(m_cases[i - 1] < m_cases[i], i, m_cases.size(), m_cases[i].value, m_cases[i].index);
66#endif
67
68 build(0, false, m_cases.size());
69}
70
71BinarySwitch::~BinarySwitch()
72{
73}
74
75bool BinarySwitch::advance(MacroAssembler& jit)
76{
77 if (m_cases.isEmpty()) {
78 m_fallThrough.append(jit.jump());
79 return false;
80 }
81
82 if (m_index == m_branches.size()) {
83 RELEASE_ASSERT(m_jumpStack.isEmpty());
84 return false;
85 }
86
87 for (;;) {
88 const BranchCode& code = m_branches[m_index++];
89 switch (code.kind) {
90 case NotEqualToFallThrough:
91 switch (m_type) {
92 case Int32:
93 m_fallThrough.append(jit.branch32(
94 MacroAssembler::NotEqual, m_value,
95 MacroAssembler::Imm32(static_cast<int32_t>(m_cases[code.index].value))));
96 break;
97 case IntPtr:
98 m_fallThrough.append(jit.branchPtr(
99 MacroAssembler::NotEqual, m_value,
100 MacroAssembler::ImmPtr(bitwise_cast<const void*>(static_cast<intptr_t>(m_cases[code.index].value)))));
101 break;
102 }
103 break;
104 case NotEqualToPush:
105 switch (m_type) {
106 case Int32:
107 m_jumpStack.append(jit.branch32(
108 MacroAssembler::NotEqual, m_value,
109 MacroAssembler::Imm32(static_cast<int32_t>(m_cases[code.index].value))));
110 break;
111 case IntPtr:
112 m_jumpStack.append(jit.branchPtr(
113 MacroAssembler::NotEqual, m_value,
114 MacroAssembler::ImmPtr(bitwise_cast<const void*>(static_cast<intptr_t>(m_cases[code.index].value)))));
115 break;
116 }
117 break;
118 case LessThanToPush:
119 switch (m_type) {
120 case Int32:
121 m_jumpStack.append(jit.branch32(
122 MacroAssembler::LessThan, m_value,
123 MacroAssembler::Imm32(static_cast<int32_t>(m_cases[code.index].value))));
124 break;
125 case IntPtr:
126 m_jumpStack.append(jit.branchPtr(
127 MacroAssembler::LessThan, m_value,
128 MacroAssembler::ImmPtr(bitwise_cast<const void*>(static_cast<intptr_t>(m_cases[code.index].value)))));
129 break;
130 }
131 break;
132 case Pop:
133 m_jumpStack.takeLast().link(&jit);
134 break;
135 case ExecuteCase:
136 m_caseIndex = code.index;
137 return true;
138 }
139 }
140}
141
142class RandomNumberGenerator {
143public:
144 using result_type = uint32_t;
145
146 RandomNumberGenerator(WeakRandom& weakRandom)
147 : m_weakRandom(weakRandom)
148 {
149 }
150
151 uint32_t operator()()
152 {
153 return m_weakRandom.getUint32();
154 }
155
156 static constexpr uint32_t min() { return std::numeric_limits<uint32_t>::min(); }
157 static constexpr uint32_t max() { return std::numeric_limits<uint32_t>::max(); }
158
159private:
160 WeakRandom& m_weakRandom;
161};
162
// Recursively appends the BranchCode program that dispatches the sorted
// sublist [start, end) of m_cases. hardStart means control can only reach
// this code when the switched value is already known to be >= the first
// case's value (we arrived via the not-less-than side of a LessThanToPush);
// the leaf logic below uses this to prove a final fall-through test
// unnecessary.
void BinarySwitch::build(unsigned start, bool hardStart, unsigned end)
{
    if (BinarySwitchInternal::verbose)
        dataLog("Building with start = ", start, ", hardStart = ", hardStart, ", end = ", end, "\n");

    auto append = [&] (const BranchCode& code) {
        if (BinarySwitchInternal::verbose)
            dataLog("==> ", code, "\n");
        m_branches.append(code);
    };

    unsigned size = end - start;

    RELEASE_ASSERT(size);

    // This code uses some random numbers to keep things balanced. It's important to keep in mind
    // that this does not improve average-case throughput under the assumption that all cases fire
    // with equal probability. It just ensures that there will not be some switch structure that
    // when combined with some input will always produce pathologically good or pathologically bad
    // performance.

    const unsigned leafThreshold = 3;

    if (size <= leafThreshold) {
        if (BinarySwitchInternal::verbose)
            dataLog("It's a leaf.\n");

        // It turns out that for exactly three cases or less, it's better to just compare each
        // case individually. This saves 1/6 of a branch on average, and up to 1/3 of a branch in
        // extreme cases where the divide-and-conquer bottoms out in a lot of 3-case subswitches.
        //
        // This assumes that we care about the cost of hitting some case more than we care about
        // bottoming out in a default case. I believe that in most places where we use switch
        // statements, we are more likely to hit one of the cases than we are to fall through to
        // default. Intuitively, if we wanted to improve the performance of default, we would
        // reduce the value of leafThreshold to 2 or even to 1. See below for a deeper discussion.

        bool allConsecutive = false;

        // If the neighbors just outside this sublist (or a hardStart on the
        // left) pin its boundaries, and every value inside is consecutive,
        // then after all-but-one equality tests have failed the value must
        // equal the remaining case — so the final NotEqualToFallThrough
        // check can be skipped.
        if ((hardStart || (start && m_cases[start - 1].value == m_cases[start].value - 1))
            && start + size < m_cases.size()
            && m_cases[start + size - 1].value == m_cases[start + size].value - 1) {
            allConsecutive = true;
            for (unsigned i = 0; i < size - 1; ++i) {
                if (m_cases[start + i].value + 1 != m_cases[start + i + 1].value) {
                    allConsecutive = false;
                    break;
                }
            }
        }

        if (BinarySwitchInternal::verbose)
            dataLog("allConsecutive = ", allConsecutive, "\n");

        Vector<unsigned, 3> localCaseIndices;
        for (unsigned i = 0; i < size; ++i)
            localCaseIndices.append(start + i);

        // Test the cases in random order, per the balancing rationale above.
        std::shuffle(
            localCaseIndices.begin(), localCaseIndices.end(),
            RandomNumberGenerator(m_weakRandom));

        for (unsigned i = 0; i < size - 1; ++i) {
            append(BranchCode(NotEqualToPush, localCaseIndices[i]));
            append(BranchCode(ExecuteCase, localCaseIndices[i]));
            append(BranchCode(Pop));
        }

        if (!allConsecutive)
            append(BranchCode(NotEqualToFallThrough, localCaseIndices.last()));

        append(BranchCode(ExecuteCase, localCaseIndices.last()));
        return;
    }

    if (BinarySwitchInternal::verbose)
        dataLog("It's not a leaf.\n");

    // There are two different strategies we could consider here:
    //
    // Isolate median and split: pick a median and check if the comparison value is equal to it;
    // if so, execute the median case. Otherwise check if the value is less than the median, and
    // recurse left or right based on this. This has two subvariants: we could either first test
    // equality for the median and then do the less-than, or we could first do the less-than and
    // then check equality on the not-less-than path.
    //
    // Ignore median and split: do a less-than comparison on a value that splits the cases in two
    // equal-sized halves. Recurse left or right based on the comparison. Do not test for equality
    // against the median (or anything else); let the recursion handle those equality comparisons
    // once we bottom out in a list that case 3 cases or less (see above).
    //
    // I'll refer to these strategies as Isolate and Ignore. I initially believed that Isolate
    // would be faster since it leads to less branching for some lucky cases. It turns out that
    // Isolate is almost a total fail in the average, assuming all cases are equally likely. How
    // bad Isolate is depends on whether you believe that doing two consecutive branches based on
    // the same comparison is cheaper than doing the compare/branches separately. This is
    // difficult to evaluate. For small immediates that aren't blinded, we just care about
    // avoiding a second compare instruction. For large immediates or when blinding is in play, we
    // also care about the instructions used to materialize the immediate a second time. Isolate
    // can help with both costs since it involves first doing a < compare+branch on some value,
    // followed by a == compare+branch on the same exact value (or vice-versa). Ignore will do a <
    // compare+branch on some value, and then the == compare+branch on that same value will happen
    // much later.
    //
    // To evaluate these costs, I wrote the recurrence relation for Isolate and Ignore, assuming
    // that ComparisonCost is the cost of a compare+branch and ChainedComparisonCost is the cost
    // of a compare+branch on some value that you've just done another compare+branch for. These
    // recurrence relations compute the total cost incurred if you executed the switch statement
    // on each matching value. So the average cost of hitting some case can be computed as
    // Isolate[n]/n or Ignore[n]/n, respectively for the two relations.
    //
    // Isolate[1] = ComparisonCost
    // Isolate[2] = (2 + 1) * ComparisonCost
    // Isolate[3] = (3 + 2 + 1) * ComparisonCost
    // Isolate[n_] := With[
    //     {medianIndex = Floor[n/2] + If[EvenQ[n], RandomInteger[], 1]},
    //     ComparisonCost + ChainedComparisonCost +
    //     (ComparisonCost * (medianIndex - 1) + Isolate[medianIndex - 1]) +
    //     (2 * ComparisonCost * (n - medianIndex) + Isolate[n - medianIndex])]
    //
    // Ignore[1] = ComparisonCost
    // Ignore[2] = (2 + 1) * ComparisonCost
    // Ignore[3] = (3 + 2 + 1) * ComparisonCost
    // Ignore[n_] := With[
    //     {medianIndex = If[EvenQ[n], n/2, Floor[n/2] + RandomInteger[]]},
    //     (medianIndex * ComparisonCost + Ignore[medianIndex]) +
    //     ((n - medianIndex) * ComparisonCost + Ignore[n - medianIndex])]
    //
    // This does not account for the average cost of hitting the default case. See further below
    // for a discussion of that.
    //
    // It turns out that for ComparisonCost = 1 and ChainedComparisonCost = 1, Ignore is always
    // better than Isolate. If we assume that ChainedComparisonCost = 0, then Isolate wins for
    // switch statements that have 20 cases or fewer, though the margin of victory is never large
    // - it might sometimes save an average of 0.3 ComparisonCost. For larger switch statements,
    // we see divergence between the two with Ignore winning. This is of course rather
    // unrealistic since the chained comparison is never free. For ChainedComparisonCost = 0.5, we
    // see Isolate winning for 10 cases or fewer, by maybe 0.2 ComparisonCost. Again we see
    // divergence for large switches with Ignore winning, for example if a switch statement has
    // 100 cases then Ignore saves one branch on average.
    //
    // Our current JIT backends don't provide for optimization for chained comparisons, except for
    // reducing the code for materializing the immediate if the immediates are large or blinding
    // comes into play. Probably our JIT backends live somewhere north of
    // ChainedComparisonCost = 0.5.
    //
    // This implies that using the Ignore strategy is likely better. If we wanted to incorporate
    // the Isolate strategy, we'd want to determine the switch size threshold at which the two
    // cross over and then use Isolate for switches that are smaller than that size.
    //
    // The average cost of hitting the default case is similar, but involves a different cost for
    // the base cases: you have to assume that you will always fail each branch. For the Ignore
    // strategy we would get this recurrence relation; the same kind of thing happens to the
    // Isolate strategy:
    //
    // Ignore[1] = ComparisonCost
    // Ignore[2] = (2 + 2) * ComparisonCost
    // Ignore[3] = (3 + 3 + 3) * ComparisonCost
    // Ignore[n_] := With[
    //     {medianIndex = If[EvenQ[n], n/2, Floor[n/2] + RandomInteger[]]},
    //     (medianIndex * ComparisonCost + Ignore[medianIndex]) +
    //     ((n - medianIndex) * ComparisonCost + Ignore[n - medianIndex])]
    //
    // This means that if we cared about the default case more, we would likely reduce
    // leafThreshold. Reducing it to 2 would reduce the average cost of the default case by 1/3
    // in the most extreme cases (num switch cases = 3, 6, 12, 24, ...). But it would also
    // increase the average cost of taking one of the non-default cases by 1/3. Typically the
    // difference is 1/6 in either direction. This makes it a very simple trade-off: if we believe
    // that the default case is more important then we would want leafThreshold to be 2, and the
    // default case would become 1/6 faster on average. But we believe that most switch statements
    // are more likely to take one of the cases than the default, so we use leafThreshold = 3
    // and get a 1/6 speed-up on average for taking an explicit case.

    unsigned medianIndex = (start + end) / 2;

    if (BinarySwitchInternal::verbose)
        dataLog("medianIndex = ", medianIndex, "\n");

    // We want medianIndex to point to the thing we will do a less-than compare against. We want
    // this less-than compare to split the current sublist into equal-sized sublists, or
    // nearly-equal-sized with some randomness if we're in the odd case. With the above
    // calculation, in the odd case we will have medianIndex pointing at either the element we
    // want or the element to the left of the one we want. Consider the case of five elements:
    //
    //     0 1 2 3 4
    //
    // start will be 0, end will be 5. The average is 2.5, which rounds down to 2. If we do
    // value < 2, then we will split the list into 2 elements on the left and three on the right.
    // That's pretty good, but in this odd case we'd like to at random choose 3 instead to ensure
    // that we don't become unbalanced on the right. This does not improve throughput since one
    // side will always get shafted, and that side might still be odd, in which case it will also
    // have two sides and one of them will get shafted - and so on. We just want to avoid
    // deterministic pathologies.
    //
    // In the even case, we will always end up pointing at the element we want:
    //
    //     0 1 2 3
    //
    // start will be 0, end will be 4. So, the average is 2, which is what we'd like.
    if (size & 1) {
        RELEASE_ASSERT(medianIndex - start + 1 == end - medianIndex);
        medianIndex += m_weakRandom.getUint32() & 1;
    } else
        RELEASE_ASSERT(medianIndex - start == end - medianIndex);

    RELEASE_ASSERT(medianIndex > start);
    RELEASE_ASSERT(medianIndex + 1 < end);

    if (BinarySwitchInternal::verbose)
        dataLog("fixed medianIndex = ", medianIndex, "\n");

    // Ignore strategy: less-than splits the list; the not-less-than path
    // (right half) runs first and was pushed, so it gets Popped into place,
    // and the left half continues with this call's own hardStart.
    append(BranchCode(LessThanToPush, medianIndex));
    build(medianIndex, true, end);
    append(BranchCode(Pop));
    build(start, hardStart, medianIndex);
}
379
380void BinarySwitch::Case::dump(PrintStream& out) const
381{
382 out.print("<value: " , value, ", index: ", index, ">");
383}
384
385void BinarySwitch::BranchCode::dump(PrintStream& out) const
386{
387 switch (kind) {
388 case NotEqualToFallThrough:
389 out.print("NotEqualToFallThrough");
390 break;
391 case NotEqualToPush:
392 out.print("NotEqualToPush");
393 break;
394 case LessThanToPush:
395 out.print("LessThanToPush");
396 break;
397 case Pop:
398 out.print("Pop");
399 break;
400 case ExecuteCase:
401 out.print("ExecuteCase");
402 break;
403 }
404
405 if (index != UINT_MAX)
406 out.print("(", index, ")");
407}
408
409} // namespace JSC
410
411#endif // ENABLE(JIT)
412
413