1 | /* |
2 | * Copyright (C) 2019 Apple Inc. All rights reserved. |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions |
6 | * are met: |
7 | * 1. Redistributions of source code must retain the above copyright |
8 | * notice, this list of conditions and the following disclaimer. |
9 | * 2. Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
24 | */ |
25 | |
26 | #include "config.h" |
27 | #include "B3ReduceLoopStrength.h" |
28 | |
29 | #if ENABLE(B3_JIT) |
30 | |
31 | #include "B3BasicBlockInlines.h" |
32 | #include "B3BlockInsertionSet.h" |
33 | #include "B3ConstPtrValue.h" |
34 | #include "B3EnsureLoopPreHeaders.h" |
35 | #include "B3InsertionSet.h" |
36 | #include "B3NaturalLoops.h" |
37 | #include "B3PhaseScope.h" |
38 | #include "B3ProcedureInlines.h" |
39 | #include "B3ValueInlines.h" |
40 | #include "B3Variable.h" |
41 | #include "B3VariableValue.h" |
42 | #include <wtf/GraphNodeWorklist.h> |
43 | #include <wtf/IndexSet.h> |
44 | #include <wtf/SmallPtrSet.h> |
45 | #include <wtf/Vector.h> |
46 | |
47 | namespace JSC { namespace B3 { |
48 | |
49 | #if CPU(X86_64) |
50 | /* |
51 | * The semantics of B3 require that loads and stores are not reduced in granularity. |
52 | * If we would use system memcpy, then it is possible that we would get a byte copy loop, |
53 | * and this could cause GC tearing. |
54 | */ |
/*
 * Copies `size` 32-bit words from src to dst, always in ascending address
 * order and never with less than 32-bit granularity. Because the copy walks
 * forward, it is also correct for overlapping ranges with dst < src (the
 * access pattern of the loops this phase replaces); the SIMD fast path is
 * only taken when the ranges provably do not overlap.
 */
void fastForwardCopy32(uint32_t* dst, const uint32_t* src, size_t size)
{
    uint64_t i;
    int64_t counter;
    uint64_t tmp, tmp2;

    asm volatile(
        "test %q[size], %q[size]\t\n"
        "jz 900f\t\n" // exit
        "xorq %q[i], %q[i]\t\n"
        // if size < 8
        "cmp $8, %q[size]\t\n"
        "jl 100f\t\n" // backporch
        // if (dst + size*4 <= src || src + size*4 <= dst)
        "leaq (%q[src], %q[size], 4), %q[tmp]\t\n"
        "cmp %q[tmp], %q[dst]\t\n"
        "jae 500f\t\n" // simd no overlap
        "leaq (%q[dst], %q[size], 4), %q[tmp]\t\n"
        "cmp %q[tmp], %q[src]\t\n"
        "jae 500f\t\n" // simd no overlap
        "jmp 100f\t\n" // Backporch

        // Backporch (assumes we can write at least one int)
        "100:\t\n"
        // counter = (size - i)%4
        "mov %q[size], %q[counter]\t\n"
        "subq %q[i], %q[counter]\t\n"

        // If we are a multiple of 4, unroll the loop
        // We already know we are nonzero
        "and $3, %q[counter]\t\n"
        "jz 200f\t\n" // unrolled loop

        "negq %q[counter]\t\n"
        ".balign 32\t\n"
        "101:\t\n"
        "mov (%q[src], %q[i], 4), %k[tmp]\t\n"
        "mov %k[tmp], (%q[dst], %q[i], 4)\t\n"
        "incq %q[i]\t\n"
        "incq %q[counter]\t\n"
        "jnz 101b\t\n"
        // Do we still have anything left?
        "cmpq %q[size], %q[i]\t\n"
        "je 900f\t\n" // exit
        // Then go to the unrolled loop
        "jmp 200f\t\n"

        // Unrolled loop (assumes multiple of 4, can do one iteration)
        "200:\t\n"
        "shr $2, %q[counter]\t\n"
        "negq %q[counter]\t\n"
        ".balign 32\t\n"
        "201:\t\n"
        "mov (%q[src], %q[i], 4), %k[tmp]\t\n"
        "mov %k[tmp], (%q[dst], %q[i], 4)\t\n"
        "mov 4(%q[src], %q[i], 4), %k[tmp2]\t\n"
        "mov %k[tmp2], 4(%q[dst], %q[i], 4)\t\n"
        "mov 8(%q[src], %q[i], 4), %k[tmp2]\t\n"
        "mov %k[tmp2], 8(%q[dst], %q[i], 4)\t\n"
        "mov 12(%q[src], %q[i], 4), %k[tmp]\t\n"
        "mov %k[tmp], 12(%q[dst], %q[i], 4)\t\n"
        "addq $4, %q[i]\t\n"
        "cmpq %q[size], %q[i]\t\n"
        "jb 201b\t\n"
        "jmp 900f\t\n" // exit

        // simd no overlap
        // counter = -(size/8);
        "500:\t\n"
        "mov %q[size], %q[counter]\t\n"
        "shrq $3, %q[counter]\t\n"
        "negq %q[counter]\t\n"
        ".balign 32\t\n"
        "501:\t\n"
        "movups (%q[src], %q[i], 4), %%xmm0\t\n"
        "movups 16(%q[src], %q[i], 4), %%xmm1\t\n"
        "movups %%xmm0, (%q[dst], %q[i], 4)\t\n"
        "movups %%xmm1, 16(%q[dst], %q[i], 4)\t\n"
        "addq $8, %q[i]\t\n"
        "incq %q[counter]\t\n"
        "jnz 501b\t\n"
        // if (i == size)
        "cmpq %q[size], %q[i]\t\n"
        "je 900f\t\n" // end
        "jmp 100b\t\n" // backporch
        "900:\t\n"
        : [i] "=&c" (i), [counter] "=&r" (counter), [tmp] "=&a" (tmp), [tmp2] "=&r" (tmp2)
        : [dst] "D" (dst), [src] "S" (src), [size] "r" (size)
        // The asm reads *src and writes *dst, neither of which is an operand,
        // so the compiler must be told memory is touched; it also sets flags.
        : "xmm0", "xmm1", "memory", "cc");
}
145 | #endif |
146 | |
147 | class ReduceLoopStrength { |
148 | static constexpr bool verbose = false; |
149 | |
150 | struct AddrInfo { |
151 | Value* appendAddr(Procedure& proc, BasicBlock* block, Value* addr) |
152 | { |
153 | block->appendNew<Value>(proc, Identity, addr->origin(), addr); |
154 | return addr; |
155 | } |
156 | |
157 | Value* insideLoopAddr; |
158 | |
159 | Value* arrayBase; |
160 | Value* index; |
161 | }; |
162 | |
163 | public: |
164 | ReduceLoopStrength(Procedure& proc) |
165 | : m_proc(proc) |
166 | , m_insertionSet(proc) |
167 | , m_blockInsertionSet(proc) |
168 | , m_root(proc.at(0)) |
169 | , m_phiChildren(proc) |
170 | { |
171 | } |
172 | |
173 | #if CPU(X86_64) |
174 | void reduceByteCopyLoopsToMemcpy() |
175 | { |
176 | HashSet<Value*> patternMatchedValues; |
177 | |
178 | Value* store = m_value; |
179 | if (store->opcode() != Store) |
180 | return; |
181 | patternMatchedValues.add(store); |
182 | |
183 | Value* load = store->child(0); |
184 | uint32_t elemSize = sizeofType(load->type()); |
185 | if (load->opcode() != Load || elemSize != 4) |
186 | return; |
187 | patternMatchedValues.add(load); |
188 | |
189 | if (verbose) |
190 | dataLogLn("Found store/load:" , *store); |
191 | |
192 | auto = [&] (Value* value) -> Optional<AddrInfo> { |
193 | patternMatchedValues.add(value); |
194 | |
195 | Optional<AddrInfo> addr = { AddrInfo() }; |
196 | |
197 | Value* sum = value; |
198 | if (value->opcode() == ZExt32) |
199 | sum = sum->child(0); |
200 | |
201 | if (sum->opcode() != Add || sum->numChildren() != 2) |
202 | return { }; |
203 | |
204 | addr->insideLoopAddr = sum; |
205 | auto = [&] (Value* value) -> Value* { |
206 | // Match arrBase[i*elemSize] |
207 | if (value->opcode() == Shl |
208 | && value->child(0)->opcode() == Phi |
209 | && value->child(1)->isInt(WTF::fastLog2(elemSize))) |
210 | return value->child(0); |
211 | if (value->opcode() == Shl |
212 | && value->child(0)->opcode() == ZExt32 |
213 | && value->child(0)->child(0)->opcode() == Phi |
214 | && value->child(1)->isInt(WTF::fastLog2(elemSize))) |
215 | return value->child(0)->child(0); |
216 | return nullptr; |
217 | }; |
218 | |
219 | // Try to find a known pattern applied to a single phi, which we can assume is our loop index. |
220 | Value* left = extractLoopIndexInSubtree(sum->child(0)); |
221 | Value* right = extractLoopIndexInSubtree(sum->child(1)); |
222 | if (left && !right) { |
223 | addr->index = left; |
224 | addr->arrayBase = sum->child(1); |
225 | } else if (right && !left) { |
226 | addr->index = right; |
227 | addr->arrayBase = sum->child(0); |
228 | } else |
229 | return { }; |
230 | |
231 | patternMatchedValues.add(addr->index); |
232 | patternMatchedValues.add(addr->arrayBase); |
233 | |
234 | return addr; |
235 | }; |
236 | |
237 | auto destination = extractArrayInfo(store->child(1)); |
238 | auto source = extractArrayInfo(load->child(0)); |
239 | |
240 | if (!source || !destination || source->index != destination->index) |
241 | return; |
242 | |
243 | if (destination->arrayBase->type() != Int64 || source->arrayBase->type() != Int64) |
244 | return; |
245 | |
246 | if (verbose) |
247 | dataLogLn("Found array info: " , *source->arrayBase, "[" , *source->index, "] = " , *destination->arrayBase, "[" , *destination->index, "]" ); |
248 | |
249 | // Find the loop header, footer and inner blocks. |
250 | // Verify that there is one entrance to and one exit from the loop. |
251 | auto* loop = m_proc.naturalLoops().innerMostLoopOf(m_block); |
252 | if (!loop) |
253 | return; |
254 | BasicBlock* loopHead = loop->header(); |
255 | BasicBlock* = nullptr; |
256 | BasicBlock* = nullptr; |
257 | BasicBlock* = nullptr; |
258 | SmallPtrSet<BasicBlock*> loopInnerBlocks; |
259 | |
260 | { |
261 | for (unsigned i = 0; i < loop->size(); ++i) |
262 | loopInnerBlocks.add(loop->at(i)); |
263 | |
264 | if (!loopInnerBlocks.contains(load->owner)) |
265 | return; |
266 | |
267 | if (!loopInnerBlocks.contains(store->owner)) |
268 | return; |
269 | |
270 | SmallPtrSet<BasicBlock*> loopEntrances; |
271 | SmallPtrSet<BasicBlock*> loopExits; |
272 | for (auto* block : loopInnerBlocks) { |
273 | for (auto successor : block->successors()) { |
274 | if (!loopInnerBlocks.contains(successor.block())) |
275 | loopExits.add(successor.block()); |
276 | } |
277 | for (auto* predecessor : block->predecessors()) { |
278 | if (!loopInnerBlocks.contains(predecessor)) |
279 | loopEntrances.add(predecessor); |
280 | } |
281 | } |
282 | |
283 | if (loopExits.size() != 1) { |
284 | if (verbose) { |
285 | dataLog("Loop has incorrect number of exits: " ); |
286 | for (BasicBlock* block : loopExits) |
287 | dataLog(*block, " " ); |
288 | dataLogLn(); |
289 | } |
290 | return; |
291 | } |
292 | loopPostfooter = *loopExits.begin(); |
293 | |
294 | if (loopEntrances.size() != 1) { |
295 | if (verbose) { |
296 | dataLog("Loop has incorrect number of entrances: " ); |
297 | for (BasicBlock* block : loopEntrances) |
298 | dataLog(*block, " " ); |
299 | dataLogLn(); |
300 | } |
301 | return; |
302 | } |
303 | loopPreheader = *loopEntrances.begin(); |
304 | |
305 | if (!loopHead->predecessors().contains(loopPreheader)) { |
306 | if (verbose) |
307 | dataLogLn("Loop head does not follow preheader" ); |
308 | return; |
309 | } |
310 | |
311 | for (BasicBlock* predecessor : loopPostfooter->predecessors()) { |
312 | if (loopInnerBlocks.contains(predecessor)) { |
313 | // There is only one loop exit. |
314 | RELEASE_ASSERT(!loopFoot); |
315 | loopFoot = predecessor; |
316 | } |
317 | } |
318 | } |
319 | |
320 | if (verbose) { |
321 | dataLog("Found loop head:" , *loopHead, " foot:" , *loopFoot, " prehead " , *loopPreheader, " postfoot " , *loopPostfooter, " inner blocks: " ); |
322 | for (BasicBlock* block : loopInnerBlocks) |
323 | dataLog(*block, " " ); |
324 | dataLogLn(); |
325 | } |
326 | |
327 | // Verify that the index is a monotonic inductive variable. |
328 | Value* startingIndex = nullptr; |
329 | { |
330 | if (m_phiChildren.at(source->index).size() != 2) |
331 | return; |
332 | |
333 | Value* updateIndex = nullptr; |
334 | for (Value* up : m_phiChildren.at(source->index)) { |
335 | if (m_proc.dominators().dominates(up->owner, loopPreheader)) |
336 | startingIndex = up; |
337 | else |
338 | updateIndex = up; |
339 | } |
340 | |
341 | if (!updateIndex || !startingIndex || !loopInnerBlocks.contains(updateIndex->owner)) |
342 | return; |
343 | patternMatchedValues.add(updateIndex); |
344 | patternMatchedValues.add(startingIndex); |
345 | |
346 | updateIndex = updateIndex->child(0); |
347 | startingIndex = startingIndex->child(0); |
348 | if (updateIndex->opcode() != Add || !updateIndex->child(1)->isInt(1) || updateIndex->child(0) != source->index) |
349 | return; |
350 | |
351 | if (!startingIndex->isInt(0)) |
352 | return; |
353 | } |
354 | |
355 | if (verbose) |
356 | dataLogLn("Found starting index:" , *startingIndex); |
357 | |
358 | // Identify the loop exit condition. |
359 | Value* exitCondition = nullptr; |
360 | // Is this an exit condition or a continuation condition? |
361 | bool conditionInverted = false; |
362 | // Do we check the index before or after using it? |
363 | bool checkBeforeWrite = false; |
364 | { |
365 | Value* branch = loopFoot->last(); |
366 | if (branch->opcode() != Branch) |
367 | return; |
368 | patternMatchedValues.add(branch); |
369 | exitCondition = branch->child(0); |
370 | |
371 | conditionInverted = loopPostfooter != loopFoot->successor(0).block(); |
372 | checkBeforeWrite = m_proc.dominators().strictlyDominates(loopFoot, m_block); |
373 | } |
374 | |
375 | if (verbose) |
376 | dataLogLn("Found exit condition: " , *exitCondition, " inverted: " , conditionInverted, " checks before the first write: " , checkBeforeWrite); |
377 | |
378 | // Idenfity the loop bound by matching loop bound expressions. |
379 | Value* loopBound = nullptr; |
380 | { |
381 | auto = [&] (Value* index, Value* bound) -> Value* { |
382 | // Match i+1 when we do a write first followed by the check for the next iteration |
383 | if (!checkBeforeWrite && index->opcode() == Add && index->child(0) == source->index && index->child(1)->isInt(1)) |
384 | return bound; |
385 | return nullptr; |
386 | }; |
387 | |
388 | if (exitCondition->opcode() == GreaterThan && conditionInverted) { |
389 | // enter loop again if size > i |
390 | loopBound = extractLoopBound(exitCondition->child(1), exitCondition->child(0)); |
391 | } else if (exitCondition->opcode() == LessThan && !conditionInverted) { |
392 | // enter loop again if size < i |
393 | loopBound = extractLoopBound(exitCondition->child(1), exitCondition->child(0)); |
394 | } |
395 | |
396 | if (!loopBound) { |
397 | if (verbose) |
398 | dataLogLn("Unable to extract bound from exit condition " , *exitCondition); |
399 | return; |
400 | } |
401 | } |
402 | |
403 | // Make sure there are no other effectful things happening. |
404 | // This also guarantees that we found the only branch in the loop. This means that our |
405 | // load/store must happen iff our index update and branch do. |
406 | for (BasicBlock* block : loopInnerBlocks) { |
407 | for (unsigned index = 0; index < block->size(); ++index) { |
408 | Value* value = block->at(index); |
409 | if (patternMatchedValues.contains(value) || value->opcode() == Jump) |
410 | continue; |
411 | |
412 | Effects effects = value->effects(); |
413 | effects.readsLocalState = false; |
414 | if (effects != Effects::none()) { |
415 | if (verbose) |
416 | dataLogLn("Not byte copy loop, node " , *value, " has effects" ); |
417 | return; |
418 | } |
419 | } |
420 | } |
421 | |
422 | if (!hoistValue(loopPreheader, source->arrayBase, false) |
423 | || !hoistValue(loopPreheader, destination->arrayBase, false) |
424 | || !hoistValue(loopPreheader, loopBound, false)) |
425 | return; |
426 | |
427 | if (verbose) |
428 | dataLogLn("Found byte copy loop: " , *source->arrayBase, "[" , *source->index, "] = " , *destination->arrayBase, "[" , *destination->index, "] from index " , *startingIndex, " to max index " , *loopBound, " stride: " , elemSize); |
429 | |
430 | bool hoistResult = hoistValue(loopPreheader, source->arrayBase); |
431 | hoistResult &= hoistValue(loopPreheader, destination->arrayBase); |
432 | hoistResult &= hoistValue(loopPreheader, loopBound); |
433 | ASSERT_UNUSED(hoistResult, hoistResult); |
434 | |
435 | auto origin = loopPostfooter->at(0)->origin(); |
436 | |
437 | BasicBlock* memcpy = m_blockInsertionSet.insertBefore(loopPostfooter, loopPostfooter->frequency()); |
438 | memcpy->setSuccessors(FrequentedBlock(loopPostfooter)); |
439 | memcpy->addPredecessor(loopPreheader); |
440 | for (BasicBlock* pred : loopPostfooter->predecessors()) |
441 | loopPostfooter->removePredecessor(pred); |
442 | loopPostfooter->addPredecessor(memcpy); |
443 | loopPreheader->setSuccessors(memcpy); |
444 | |
445 | Effects effects = Effects::forCall(); |
446 | memcpy->appendNew<CCallValue>(m_proc, B3::Void, origin, effects, |
447 | memcpy->appendNew<ConstPtrValue>(m_proc, origin, tagCFunctionPtr<void*>(fastForwardCopy32, B3CCallPtrTag)), |
448 | destination->appendAddr(m_proc, memcpy, destination->arrayBase), |
449 | source->appendAddr(m_proc, memcpy, source->arrayBase), |
450 | loopBound); |
451 | |
452 | memcpy->appendNew<Value>(m_proc, Jump, origin); |
453 | } |
454 | |
455 | bool hoistValue(BasicBlock* into, Value* value, bool canMutate = true) |
456 | { |
457 | if (m_proc.dominators().dominates(value->owner, into)) |
458 | return true; |
459 | |
460 | Effects effects = value->effects(); |
461 | if (effects != Effects::none()) { |
462 | if (verbose) |
463 | dataLogLn("Cannot hoist " , *value, " into " , *into, " since it has effects" ); |
464 | return false; |
465 | } |
466 | |
467 | for (Value* child : value->children()) { |
468 | if (!hoistValue(into, child, false)) |
469 | return false; |
470 | } |
471 | |
472 | if (canMutate) { |
473 | for (Value* child : value->children()) |
474 | hoistValue(into, child); |
475 | |
476 | value->owner->at(value->index()) = m_proc.add<Value>(Nop, Void, value->origin()); |
477 | into->appendNonTerminal(value); |
478 | } |
479 | |
480 | return true; |
481 | } |
482 | |
483 | bool run() |
484 | { |
485 | if (verbose) |
486 | dataLogLn("ReduceLoopStrength examining proc: " , m_proc); |
487 | bool result = false; |
488 | |
489 | do { |
490 | m_changed = false; |
491 | |
492 | for (BasicBlock* block : m_proc.blocksInPreOrder()) { |
493 | m_block = block; |
494 | |
495 | for (m_index = 0; m_index < block->size(); ++m_index) { |
496 | m_value = m_block->at(m_index); |
497 | m_value->performSubstitution(); |
498 | reduceByteCopyLoopsToMemcpy(); |
499 | m_insertionSet.execute(m_block); |
500 | m_changed |= m_blockInsertionSet.execute(); |
501 | |
502 | if (m_changed) { |
503 | m_proc.resetReachability(); |
504 | m_proc.invalidateCFG(); |
505 | m_proc.resetValueOwners(); |
506 | result = true; |
507 | break; |
508 | } |
509 | } |
510 | |
511 | if (m_changed) |
512 | break; |
513 | } |
514 | } while (m_changed && m_proc.optLevel() >= 2); |
515 | |
516 | if (result && verbose) |
517 | dataLogLn("ReduceLoopStrength produced proc: " , m_proc); |
518 | |
519 | return result; |
520 | } |
521 | #else |
522 | bool run() |
523 | { |
524 | return false; |
525 | } |
526 | #endif |
527 | |
528 | Procedure& m_proc; |
529 | InsertionSet m_insertionSet; |
530 | BlockInsertionSet m_blockInsertionSet; |
531 | BasicBlock* m_root { nullptr }; |
532 | BasicBlock* m_block { nullptr }; |
533 | unsigned m_index { 0 }; |
534 | Value* m_value { nullptr }; |
535 | bool m_changed { false }; |
536 | PhiChildren m_phiChildren; |
537 | }; |
538 | |
539 | bool reduceLoopStrength(Procedure& proc) |
540 | { |
541 | PhaseScope phaseScope(proc, "reduceLoopStrength" ); |
542 | return ReduceLoopStrength(proc).run(); |
543 | } |
544 | |
545 | } } // namespace JSC::B3 |
546 | |
547 | #endif // ENABLE(B3_JIT) |
548 | |