1 | /* |
2 | * Copyright (C) 2013-2019 Apple Inc. All rights reserved. |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions |
6 | * are met: |
7 | * 1. Redistributions of source code must retain the above copyright |
8 | * notice, this list of conditions and the following disclaimer. |
9 | * 2. Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
24 | */ |
25 | |
26 | #include "config.h" |
27 | #include "BinarySwitch.h" |
28 | |
29 | #if ENABLE(JIT) |
30 | |
31 | #include "JSCInlines.h" |
32 | #include <wtf/ListDump.h> |
33 | |
34 | namespace JSC { |
35 | |
namespace BinarySwitchInternal {
// Flip to true locally to trace switch construction and opcode emission via dataLog().
static const bool verbose = false;
}

// Seed source for each BinarySwitch's WeakRandom (see the constructor's
// initializer list): incremented per construction so every switch gets a
// different shuffle/median randomization.
// NOTE(review): the increment is unsynchronized — presumably BinarySwitch
// construction does not race across threads; confirm against the JIT's
// threading model.
static unsigned globalCounter; // We use a different seed every time we are invoked.
41 | |
42 | BinarySwitch::BinarySwitch(GPRReg value, const Vector<int64_t>& cases, Type type) |
43 | : m_type(type) |
44 | , m_value(value) |
45 | , m_weakRandom(globalCounter++) |
46 | , m_index(0) |
47 | , m_caseIndex(UINT_MAX) |
48 | { |
49 | if (cases.isEmpty()) |
50 | return; |
51 | |
52 | if (BinarySwitchInternal::verbose) |
53 | dataLog("Original cases: " , listDump(cases), "\n" ); |
54 | |
55 | for (unsigned i = 0; i < cases.size(); ++i) |
56 | m_cases.append(Case(cases[i], i)); |
57 | |
58 | std::sort(m_cases.begin(), m_cases.end()); |
59 | |
60 | if (BinarySwitchInternal::verbose) |
61 | dataLog("Sorted cases: " , listDump(m_cases), "\n" ); |
62 | |
63 | #if !ASSERT_DISABLED |
64 | for (unsigned i = 1; i < m_cases.size(); ++i) |
65 | ASSERT(m_cases[i - 1] < m_cases[i], i, m_cases.size(), m_cases[i].value, m_cases[i].index); |
66 | #endif |
67 | |
68 | build(0, false, m_cases.size()); |
69 | } |
70 | |
71 | BinarySwitch::~BinarySwitch() |
72 | { |
73 | } |
74 | |
75 | bool BinarySwitch::advance(MacroAssembler& jit) |
76 | { |
77 | if (m_cases.isEmpty()) { |
78 | m_fallThrough.append(jit.jump()); |
79 | return false; |
80 | } |
81 | |
82 | if (m_index == m_branches.size()) { |
83 | RELEASE_ASSERT(m_jumpStack.isEmpty()); |
84 | return false; |
85 | } |
86 | |
87 | for (;;) { |
88 | const BranchCode& code = m_branches[m_index++]; |
89 | switch (code.kind) { |
90 | case NotEqualToFallThrough: |
91 | switch (m_type) { |
92 | case Int32: |
93 | m_fallThrough.append(jit.branch32( |
94 | MacroAssembler::NotEqual, m_value, |
95 | MacroAssembler::Imm32(static_cast<int32_t>(m_cases[code.index].value)))); |
96 | break; |
97 | case IntPtr: |
98 | m_fallThrough.append(jit.branchPtr( |
99 | MacroAssembler::NotEqual, m_value, |
100 | MacroAssembler::ImmPtr(bitwise_cast<const void*>(static_cast<intptr_t>(m_cases[code.index].value))))); |
101 | break; |
102 | } |
103 | break; |
104 | case NotEqualToPush: |
105 | switch (m_type) { |
106 | case Int32: |
107 | m_jumpStack.append(jit.branch32( |
108 | MacroAssembler::NotEqual, m_value, |
109 | MacroAssembler::Imm32(static_cast<int32_t>(m_cases[code.index].value)))); |
110 | break; |
111 | case IntPtr: |
112 | m_jumpStack.append(jit.branchPtr( |
113 | MacroAssembler::NotEqual, m_value, |
114 | MacroAssembler::ImmPtr(bitwise_cast<const void*>(static_cast<intptr_t>(m_cases[code.index].value))))); |
115 | break; |
116 | } |
117 | break; |
118 | case LessThanToPush: |
119 | switch (m_type) { |
120 | case Int32: |
121 | m_jumpStack.append(jit.branch32( |
122 | MacroAssembler::LessThan, m_value, |
123 | MacroAssembler::Imm32(static_cast<int32_t>(m_cases[code.index].value)))); |
124 | break; |
125 | case IntPtr: |
126 | m_jumpStack.append(jit.branchPtr( |
127 | MacroAssembler::LessThan, m_value, |
128 | MacroAssembler::ImmPtr(bitwise_cast<const void*>(static_cast<intptr_t>(m_cases[code.index].value))))); |
129 | break; |
130 | } |
131 | break; |
132 | case Pop: |
133 | m_jumpStack.takeLast().link(&jit); |
134 | break; |
135 | case ExecuteCase: |
136 | m_caseIndex = code.index; |
137 | return true; |
138 | } |
139 | } |
140 | } |
141 | |
142 | class RandomNumberGenerator { |
143 | public: |
144 | using result_type = uint32_t; |
145 | |
146 | RandomNumberGenerator(WeakRandom& weakRandom) |
147 | : m_weakRandom(weakRandom) |
148 | { |
149 | } |
150 | |
151 | uint32_t operator()() |
152 | { |
153 | return m_weakRandom.getUint32(); |
154 | } |
155 | |
156 | static constexpr uint32_t min() { return std::numeric_limits<uint32_t>::min(); } |
157 | static constexpr uint32_t max() { return std::numeric_limits<uint32_t>::max(); } |
158 | |
159 | private: |
160 | WeakRandom& m_weakRandom; |
161 | }; |
162 | |
// Recursively appends the branch program for the sorted sublist
// m_cases[start, end) to m_branches. hardStart == true means an enclosing
// less-than branch has already established value >= m_cases[start].value,
// which lets the leaf code below prove full coverage (see allConsecutive).
void BinarySwitch::build(unsigned start, bool hardStart, unsigned end)
{
    if (BinarySwitchInternal::verbose)
        dataLog("Building with start = " , start, ", hardStart = " , hardStart, ", end = " , end, "\n" );

    // Local helper so every emitted opcode can be traced when verbose.
    auto append = [&] (const BranchCode& code) {
        if (BinarySwitchInternal::verbose)
            dataLog("==> " , code, "\n" );
        m_branches.append(code);
    };

    unsigned size = end - start;

    RELEASE_ASSERT(size);

    // This code uses some random numbers to keep things balanced. It's important to keep in mind
    // that this does not improve average-case throughput under the assumption that all cases fire
    // with equal probability. It just ensures that there will not be some switch structure that
    // when combined with some input will always produce pathologically good or pathologically bad
    // performance.

    const unsigned leafThreshold = 3;

    if (size <= leafThreshold) {
        if (BinarySwitchInternal::verbose)
            dataLog("It's a leaf.\n" );

        // It turns out that for exactly three cases or less, it's better to just compare each
        // case individually. This saves 1/6 of a branch on average, and up to 1/3 of a branch in
        // extreme cases where the divide-and-conquer bottoms out in a lot of 3-case subswitches.
        //
        // This assumes that we care about the cost of hitting some case more than we care about
        // bottoming out in a default case. I believe that in most places where we use switch
        // statements, we are more likely to hit one of the cases than we are to fall through to
        // default. Intuitively, if we wanted to improve the performance of default, we would
        // reduce the value of leafThreshold to 2 or even to 1. See below for a deeper discussion.

        bool allConsecutive = false;

        // allConsecutive holds when the leaf's values are consecutive AND
        // bounded on both sides (below by hardStart or an adjacent
        // predecessor value; above by an adjacent successor value), so the
        // comparison value must equal one of the leaf's cases and the final
        // not-equal test can be elided.
        if ((hardStart || (start && m_cases[start - 1].value == m_cases[start].value - 1))
            && start + size < m_cases.size()
            && m_cases[start + size - 1].value == m_cases[start + size].value - 1) {
            allConsecutive = true;
            for (unsigned i = 0; i < size - 1; ++i) {
                if (m_cases[start + i].value + 1 != m_cases[start + i + 1].value) {
                    allConsecutive = false;
                    break;
                }
            }
        }

        if (BinarySwitchInternal::verbose)
            dataLog("allConsecutive = " , allConsecutive, "\n" );

        // Compare against the leaf's cases in random order, to avoid
        // deterministic pathologies (see the note on randomness above).
        Vector<unsigned, 3> localCaseIndices;
        for (unsigned i = 0; i < size; ++i)
            localCaseIndices.append(start + i);

        std::shuffle(
            localCaseIndices.begin(), localCaseIndices.end(),
            RandomNumberGenerator(m_weakRandom));

        // For each case but the last: if equal, execute it; the Pop links the
        // pushed not-equal jump to the next comparison.
        for (unsigned i = 0; i < size - 1; ++i) {
            append(BranchCode(NotEqualToPush, localCaseIndices[i]));
            append(BranchCode(ExecuteCase, localCaseIndices[i]));
            append(BranchCode(Pop));
        }

        if (!allConsecutive)
            append(BranchCode(NotEqualToFallThrough, localCaseIndices.last()));

        append(BranchCode(ExecuteCase, localCaseIndices.last()));
        return;
    }

    if (BinarySwitchInternal::verbose)
        dataLog("It's not a leaf.\n" );

    // There are two different strategies we could consider here:
    //
    // Isolate median and split: pick a median and check if the comparison value is equal to it;
    // if so, execute the median case. Otherwise check if the value is less than the median, and
    // recurse left or right based on this. This has two subvariants: we could either first test
    // equality for the median and then do the less-than, or we could first do the less-than and
    // then check equality on the not-less-than path.
    //
    // Ignore median and split: do a less-than comparison on a value that splits the cases in two
    // equal-sized halves. Recurse left or right based on the comparison. Do not test for equality
    // against the median (or anything else); let the recursion handle those equality comparisons
    // once we bottom out in a list that case 3 cases or less (see above).
    //
    // I'll refer to these strategies as Isolate and Ignore. I initially believed that Isolate
    // would be faster since it leads to less branching for some lucky cases. It turns out that
    // Isolate is almost a total fail in the average, assuming all cases are equally likely. How
    // bad Isolate is depends on whether you believe that doing two consecutive branches based on
    // the same comparison is cheaper than doing the compare/branches separately. This is
    // difficult to evaluate. For small immediates that aren't blinded, we just care about
    // avoiding a second compare instruction. For large immediates or when blinding is in play, we
    // also care about the instructions used to materialize the immediate a second time. Isolate
    // can help with both costs since it involves first doing a < compare+branch on some value,
    // followed by a == compare+branch on the same exact value (or vice-versa). Ignore will do a <
    // compare+branch on some value, and then the == compare+branch on that same value will happen
    // much later.
    //
    // To evaluate these costs, I wrote the recurrence relation for Isolate and Ignore, assuming
    // that ComparisonCost is the cost of a compare+branch and ChainedComparisonCost is the cost
    // of a compare+branch on some value that you've just done another compare+branch for. These
    // recurrence relations compute the total cost incurred if you executed the switch statement
    // on each matching value. So the average cost of hitting some case can be computed as
    // Isolate[n]/n or Ignore[n]/n, respectively for the two relations.
    //
    // Isolate[1] = ComparisonCost
    // Isolate[2] = (2 + 1) * ComparisonCost
    // Isolate[3] = (3 + 2 + 1) * ComparisonCost
    // Isolate[n_] := With[
    //     {medianIndex = Floor[n/2] + If[EvenQ[n], RandomInteger[], 1]},
    //     ComparisonCost + ChainedComparisonCost +
    //     (ComparisonCost * (medianIndex - 1) + Isolate[medianIndex - 1]) +
    //     (2 * ComparisonCost * (n - medianIndex) + Isolate[n - medianIndex])]
    //
    // Ignore[1] = ComparisonCost
    // Ignore[2] = (2 + 1) * ComparisonCost
    // Ignore[3] = (3 + 2 + 1) * ComparisonCost
    // Ignore[n_] := With[
    //     {medianIndex = If[EvenQ[n], n/2, Floor[n/2] + RandomInteger[]]},
    //     (medianIndex * ComparisonCost + Ignore[medianIndex]) +
    //     ((n - medianIndex) * ComparisonCost + Ignore[n - medianIndex])]
    //
    // This does not account for the average cost of hitting the default case. See further below
    // for a discussion of that.
    //
    // It turns out that for ComparisonCost = 1 and ChainedComparisonCost = 1, Ignore is always
    // better than Isolate. If we assume that ChainedComparisonCost = 0, then Isolate wins for
    // switch statements that have 20 cases or fewer, though the margin of victory is never large
    // - it might sometimes save an average of 0.3 ComparisonCost. For larger switch statements,
    // we see divergence between the two with Ignore winning. This is of course rather
    // unrealistic since the chained comparison is never free. For ChainedComparisonCost = 0.5, we
    // see Isolate winning for 10 cases or fewer, by maybe 0.2 ComparisonCost. Again we see
    // divergence for large switches with Ignore winning, for example if a switch statement has
    // 100 cases then Ignore saves one branch on average.
    //
    // Our current JIT backends don't provide for optimization for chained comparisons, except for
    // reducing the code for materializing the immediate if the immediates are large or blinding
    // comes into play. Probably our JIT backends live somewhere north of
    // ChainedComparisonCost = 0.5.
    //
    // This implies that using the Ignore strategy is likely better. If we wanted to incorporate
    // the Isolate strategy, we'd want to determine the switch size threshold at which the two
    // cross over and then use Isolate for switches that are smaller than that size.
    //
    // The average cost of hitting the default case is similar, but involves a different cost for
    // the base cases: you have to assume that you will always fail each branch. For the Ignore
    // strategy we would get this recurrence relation; the same kind of thing happens to the
    // Isolate strategy:
    //
    // Ignore[1] = ComparisonCost
    // Ignore[2] = (2 + 2) * ComparisonCost
    // Ignore[3] = (3 + 3 + 3) * ComparisonCost
    // Ignore[n_] := With[
    //     {medianIndex = If[EvenQ[n], n/2, Floor[n/2] + RandomInteger[]]},
    //     (medianIndex * ComparisonCost + Ignore[medianIndex]) +
    //     ((n - medianIndex) * ComparisonCost + Ignore[n - medianIndex])]
    //
    // This means that if we cared about the default case more, we would likely reduce
    // leafThreshold. Reducing it to 2 would reduce the average cost of the default case by 1/3
    // in the most extreme cases (num switch cases = 3, 6, 12, 24, ...). But it would also
    // increase the average cost of taking one of the non-default cases by 1/3. Typically the
    // difference is 1/6 in either direction. This makes it a very simple trade-off: if we believe
    // that the default case is more important then we would want leafThreshold to be 2, and the
    // default case would become 1/6 faster on average. But we believe that most switch statements
    // are more likely to take one of the cases than the default, so we use leafThreshold = 3
    // and get a 1/6 speed-up on average for taking an explicit case.

    unsigned medianIndex = (start + end) / 2;

    if (BinarySwitchInternal::verbose)
        dataLog("medianIndex = " , medianIndex, "\n" );

    // We want medianIndex to point to the thing we will do a less-than compare against. We want
    // this less-than compare to split the current sublist into equal-sized sublists, or
    // nearly-equal-sized with some randomness if we're in the odd case. With the above
    // calculation, in the odd case we will have medianIndex pointing at either the element we
    // want or the element to the left of the one we want. Consider the case of five elements:
    //
    //     0 1 2 3 4
    //
    // start will be 0, end will be 5. The average is 2.5, which rounds down to 2. If we do
    // value < 2, then we will split the list into 2 elements on the left and three on the right.
    // That's pretty good, but in this odd case we'd like to at random choose 3 instead to ensure
    // that we don't become unbalanced on the right. This does not improve throughput since one
    // side will always get shafted, and that side might still be odd, in which case it will also
    // have two sides and one of them will get shafted - and so on. We just want to avoid
    // deterministic pathologies.
    //
    // In the even case, we will always end up pointing at the element we want:
    //
    //     0 1 2 3
    //
    // start will be 0, end will be 4. So, the average is 2, which is what we'd like.
    if (size & 1) {
        RELEASE_ASSERT(medianIndex - start + 1 == end - medianIndex);
        medianIndex += m_weakRandom.getUint32() & 1;
    } else
        RELEASE_ASSERT(medianIndex - start == end - medianIndex);

    RELEASE_ASSERT(medianIndex > start);
    RELEASE_ASSERT(medianIndex + 1 < end);

    if (BinarySwitchInternal::verbose)
        dataLog("fixed medianIndex = " , medianIndex, "\n" );

    // Branch layout: the LessThanToPush jump is taken when value < cases[medianIndex].
    // The right half (value >= median) is emitted first; Pop then links the
    // pushed less-than jump to the code for the left half.
    append(BranchCode(LessThanToPush, medianIndex));
    build(medianIndex, true, end); // hardStart = true: value >= cases[medianIndex] is known here.
    append(BranchCode(Pop));
    build(start, hardStart, medianIndex);
}
379 | |
380 | void BinarySwitch::Case::dump(PrintStream& out) const |
381 | { |
382 | out.print("<value: " , value, ", index: " , index, ">" ); |
383 | } |
384 | |
385 | void BinarySwitch::BranchCode::dump(PrintStream& out) const |
386 | { |
387 | switch (kind) { |
388 | case NotEqualToFallThrough: |
389 | out.print("NotEqualToFallThrough" ); |
390 | break; |
391 | case NotEqualToPush: |
392 | out.print("NotEqualToPush" ); |
393 | break; |
394 | case LessThanToPush: |
395 | out.print("LessThanToPush" ); |
396 | break; |
397 | case Pop: |
398 | out.print("Pop" ); |
399 | break; |
400 | case ExecuteCase: |
401 | out.print("ExecuteCase" ); |
402 | break; |
403 | } |
404 | |
405 | if (index != UINT_MAX) |
406 | out.print("(" , index, ")" ); |
407 | } |
408 | |
409 | } // namespace JSC |
410 | |
411 | #endif // ENABLE(JIT) |
412 | |
413 | |