1/*
2 * Copyright (C) 2009, 2013-2017 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga ([email protected]), University of Szeged
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "YarrInterpreter.h"
29
30#include "Options.h"
31#include "SuperSampler.h"
32#include "Yarr.h"
33#include "YarrCanonicalize.h"
34#include <wtf/BumpPointerAllocator.h>
35#include <wtf/CheckedArithmetic.h>
36#include <wtf/DataLog.h>
37#include <wtf/text/CString.h>
38#include <wtf/text/WTFString.h>
39
40namespace JSC { namespace Yarr {
41
42template<typename CharType>
43class Interpreter {
44public:
45 struct ParenthesesDisjunctionContext;
46
47 struct BackTrackInfoParentheses {
48 uintptr_t matchAmount;
49 ParenthesesDisjunctionContext* lastContext;
50 };
51
52 static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
53 {
54 context->next = backTrack->lastContext;
55 backTrack->lastContext = context;
56 ++backTrack->matchAmount;
57 }
58
59 static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
60 {
61 RELEASE_ASSERT(backTrack->matchAmount);
62 RELEASE_ASSERT(backTrack->lastContext);
63 backTrack->lastContext = backTrack->lastContext->next;
64 --backTrack->matchAmount;
65 }
66
67 struct DisjunctionContext
68 {
69 DisjunctionContext() = default;
70
71 void* operator new(size_t, void* where)
72 {
73 return where;
74 }
75
76 static size_t allocationSize(unsigned numberOfFrames)
77 {
78 static_assert(alignof(DisjunctionContext) <= sizeof(void*), "");
79 size_t rawSize = (sizeof(DisjunctionContext) - sizeof(uintptr_t) + Checked<size_t>(numberOfFrames) * sizeof(uintptr_t)).unsafeGet();
80 size_t roundedSize = roundUpToMultipleOf<sizeof(void*)>(rawSize);
81 RELEASE_ASSERT(roundedSize >= rawSize);
82 return roundedSize;
83 }
84
85 int term { 0 };
86 unsigned matchBegin;
87 unsigned matchEnd;
88 uintptr_t frame[1];
89 };
90
91 DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
92 {
93 size_t size = DisjunctionContext::allocationSize(disjunction->m_frameSize);
94 allocatorPool = allocatorPool->ensureCapacity(size);
95 RELEASE_ASSERT(allocatorPool);
96 return new (allocatorPool->alloc(size)) DisjunctionContext();
97 }
98
99 void freeDisjunctionContext(DisjunctionContext* context)
100 {
101 allocatorPool = allocatorPool->dealloc(context);
102 }
103
104 struct ParenthesesDisjunctionContext
105 {
106 ParenthesesDisjunctionContext(unsigned* output, ByteTerm& term)
107 {
108 unsigned firstSubpatternId = term.atom.subpatternId;
109 unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
110
111 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
112 subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
113 output[(firstSubpatternId << 1) + i] = offsetNoMatch;
114 }
115
116 new (getDisjunctionContext(term)) DisjunctionContext();
117 }
118
119 void* operator new(size_t, void* where)
120 {
121 return where;
122 }
123
124 void restoreOutput(unsigned* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
125 {
126 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
127 output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
128 }
129
130 DisjunctionContext* getDisjunctionContext(ByteTerm& term)
131 {
132 return bitwise_cast<DisjunctionContext*>(bitwise_cast<uintptr_t>(this) + allocationSize(term.atom.parenthesesDisjunction->m_numSubpatterns));
133 }
134
135 static size_t allocationSize(unsigned numberOfSubpatterns)
136 {
137 static_assert(alignof(ParenthesesDisjunctionContext) <= sizeof(void*), "");
138 size_t rawSize = (sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (Checked<size_t>(numberOfSubpatterns) * 2U) * sizeof(unsigned)).unsafeGet();
139 size_t roundedSize = roundUpToMultipleOf<sizeof(void*)>(rawSize);
140 RELEASE_ASSERT(roundedSize >= rawSize);
141 return roundedSize;
142 }
143
144 ParenthesesDisjunctionContext* next { nullptr };
145 unsigned subpatternBackup[1];
146 };
147
148 ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, unsigned* output, ByteTerm& term)
149 {
150 size_t size = (Checked<size_t>(ParenthesesDisjunctionContext::allocationSize(term.atom.parenthesesDisjunction->m_numSubpatterns)) + DisjunctionContext::allocationSize(disjunction->m_frameSize)).unsafeGet();
151 allocatorPool = allocatorPool->ensureCapacity(size);
152 RELEASE_ASSERT(allocatorPool);
153 return new (allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term);
154 }
155
156 void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
157 {
158 allocatorPool = allocatorPool->dealloc(context);
159 }
160
161 class InputStream {
162 public:
163 InputStream(const CharType* input, unsigned start, unsigned length, bool decodeSurrogatePairs)
164 : input(input)
165 , pos(start)
166 , length(length)
167 , decodeSurrogatePairs(decodeSurrogatePairs)
168 {
169 }
170
171 void next()
172 {
173 ++pos;
174 }
175
176 void rewind(unsigned amount)
177 {
178 ASSERT(pos >= amount);
179 pos -= amount;
180 }
181
182 int read()
183 {
184 ASSERT(pos < length);
185 if (pos < length)
186 return input[pos];
187 return -1;
188 }
189
190 int readPair()
191 {
192 ASSERT(pos + 1 < length);
193 return input[pos] | input[pos + 1] << 16;
194 }
195
196 int readChecked(unsigned negativePositionOffest)
197 {
198 RELEASE_ASSERT(pos >= negativePositionOffest);
199 unsigned p = pos - negativePositionOffest;
200 ASSERT(p < length);
201 int result = input[p];
202 if (U16_IS_LEAD(result) && decodeSurrogatePairs && p + 1 < length && U16_IS_TRAIL(input[p + 1])) {
203 if (atEnd())
204 return -1;
205
206 result = U16_GET_SUPPLEMENTARY(result, input[p + 1]);
207 next();
208 }
209 return result;
210 }
211
212 int readSurrogatePairChecked(unsigned negativePositionOffset)
213 {
214 RELEASE_ASSERT(pos >= negativePositionOffset);
215 unsigned p = pos - negativePositionOffset;
216 ASSERT(p < length);
217 if (p + 1 >= length)
218 return -1;
219
220 int first = input[p];
221 int second = input[p + 1];
222 if (U16_IS_LEAD(first) && U16_IS_TRAIL(second))
223 return U16_GET_SUPPLEMENTARY(first, second);
224
225 return -1;
226 }
227
228 int reread(unsigned from)
229 {
230 ASSERT(from < length);
231 int result = input[from];
232 if (U16_IS_LEAD(result) && decodeSurrogatePairs && from + 1 < length && U16_IS_TRAIL(input[from + 1]))
233 result = U16_GET_SUPPLEMENTARY(result, input[from + 1]);
234 return result;
235 }
236
237 int prev()
238 {
239 ASSERT(!(pos > length));
240 if (pos && length)
241 return input[pos - 1];
242 return -1;
243 }
244
245 unsigned getPos()
246 {
247 return pos;
248 }
249
250 void setPos(unsigned p)
251 {
252 pos = p;
253 }
254
255 bool atStart()
256 {
257 return pos == 0;
258 }
259
260 bool atEnd()
261 {
262 return pos == length;
263 }
264
265 unsigned end()
266 {
267 return length;
268 }
269
270 bool checkInput(unsigned count)
271 {
272 if (((pos + count) <= length) && ((pos + count) >= pos)) {
273 pos += count;
274 return true;
275 }
276 return false;
277 }
278
279 void uncheckInput(unsigned count)
280 {
281 RELEASE_ASSERT(pos >= count);
282 pos -= count;
283 }
284
285 bool atStart(unsigned negativePositionOffset)
286 {
287 return pos == negativePositionOffset;
288 }
289
290 bool atEnd(unsigned negativePositionOffest)
291 {
292 RELEASE_ASSERT(pos >= negativePositionOffest);
293 return (pos - negativePositionOffest) == length;
294 }
295
296 bool isAvailableInput(unsigned offset)
297 {
298 return (((pos + offset) <= length) && ((pos + offset) >= pos));
299 }
300
301 private:
302 const CharType* input;
303 unsigned pos;
304 unsigned length;
305 bool decodeSurrogatePairs;
306 };
307
308 bool testCharacterClass(CharacterClass* characterClass, int ch)
309 {
310 auto linearSearchMatches = [&ch](const Vector<UChar32>& matches) {
311 for (unsigned i = 0; i < matches.size(); ++i) {
312 if (ch == matches[i])
313 return true;
314 }
315
316 return false;
317 };
318
319 auto binarySearchMatches = [&ch](const Vector<UChar32>& matches) {
320 size_t low = 0;
321 size_t high = matches.size() - 1;
322
323 while (low <= high) {
324 size_t mid = low + (high - low) / 2;
325 int diff = ch - matches[mid];
326 if (!diff)
327 return true;
328
329 if (diff < 0) {
330 if (mid == low)
331 return false;
332 high = mid - 1;
333 } else
334 low = mid + 1;
335 }
336 return false;
337 };
338
339 auto linearSearchRanges = [&ch](const Vector<CharacterRange>& ranges) {
340 for (unsigned i = 0; i < ranges.size(); ++i) {
341 if ((ch >= ranges[i].begin) && (ch <= ranges[i].end))
342 return true;
343 }
344
345 return false;
346 };
347
348 auto binarySearchRanges = [&ch](const Vector<CharacterRange>& ranges) {
349 size_t low = 0;
350 size_t high = ranges.size() - 1;
351
352 while (low <= high) {
353 size_t mid = low + (high - low) / 2;
354 int rangeBeginDiff = ch - ranges[mid].begin;
355 if (rangeBeginDiff >= 0 && ch <= ranges[mid].end)
356 return true;
357
358 if (rangeBeginDiff < 0) {
359 if (mid == low)
360 return false;
361 high = mid - 1;
362 } else
363 low = mid + 1;
364 }
365 return false;
366 };
367
368 if (characterClass->m_anyCharacter)
369 return true;
370
371 const size_t thresholdForBinarySearch = 6;
372
373 if (!isASCII(ch)) {
374 if (characterClass->m_matchesUnicode.size()) {
375 if (characterClass->m_matchesUnicode.size() > thresholdForBinarySearch) {
376 if (binarySearchMatches(characterClass->m_matchesUnicode))
377 return true;
378 } else if (linearSearchMatches(characterClass->m_matchesUnicode))
379 return true;
380 }
381
382 if (characterClass->m_rangesUnicode.size()) {
383 if (characterClass->m_rangesUnicode.size() > thresholdForBinarySearch) {
384 if (binarySearchRanges(characterClass->m_rangesUnicode))
385 return true;
386 } else if (linearSearchRanges(characterClass->m_rangesUnicode))
387 return true;
388 }
389 } else {
390 if (characterClass->m_matches.size()) {
391 if (characterClass->m_matches.size() > thresholdForBinarySearch) {
392 if (binarySearchMatches(characterClass->m_matches))
393 return true;
394 } else if (linearSearchMatches(characterClass->m_matches))
395 return true;
396 }
397
398 if (characterClass->m_ranges.size()) {
399 if (characterClass->m_ranges.size() > thresholdForBinarySearch) {
400 if (binarySearchRanges(characterClass->m_ranges))
401 return true;
402 } else if (linearSearchRanges(characterClass->m_ranges))
403 return true;
404 }
405 }
406
407 return false;
408 }
409
410 bool checkCharacter(int testChar, unsigned negativeInputOffset)
411 {
412 return testChar == input.readChecked(negativeInputOffset);
413 }
414
415 bool checkSurrogatePair(int testUnicodeChar, unsigned negativeInputOffset)
416 {
417 return testUnicodeChar == input.readSurrogatePairChecked(negativeInputOffset);
418 }
419
420 bool checkCasedCharacter(int loChar, int hiChar, unsigned negativeInputOffset)
421 {
422 int ch = input.readChecked(negativeInputOffset);
423 return (loChar == ch) || (hiChar == ch);
424 }
425
426 bool checkCharacterClass(CharacterClass* characterClass, bool invert, unsigned negativeInputOffset)
427 {
428 bool match = testCharacterClass(characterClass, input.readChecked(negativeInputOffset));
429 return invert ? !match : match;
430 }
431
432 bool checkCharacterClassDontAdvanceInputForNonBMP(CharacterClass* characterClass, unsigned negativeInputOffset)
433 {
434 int readCharacter = characterClass->hasOnlyNonBMPCharacters() ? input.readSurrogatePairChecked(negativeInputOffset) : input.readChecked(negativeInputOffset);
435 return testCharacterClass(characterClass, readCharacter);
436 }
437
438 bool tryConsumeBackReference(int matchBegin, int matchEnd, unsigned negativeInputOffset)
439 {
440 unsigned matchSize = (unsigned)(matchEnd - matchBegin);
441
442 if (!input.checkInput(matchSize))
443 return false;
444
445 for (unsigned i = 0; i < matchSize; ++i) {
446 int oldCh = input.reread(matchBegin + i);
447 int ch;
448 if (!U_IS_BMP(oldCh)) {
449 ch = input.readSurrogatePairChecked(negativeInputOffset + matchSize - i);
450 ++i;
451 } else
452 ch = input.readChecked(negativeInputOffset + matchSize - i);
453
454 if (oldCh == ch)
455 continue;
456
457 if (pattern->ignoreCase()) {
458 // See ES 6.0, 21.2.2.8.2 for the definition of Canonicalize(). For non-Unicode
459 // patterns, Unicode values are never allowed to match against ASCII ones.
460 // For Unicode, we need to check all canonical equivalents of a character.
461 if (!unicode && (isASCII(oldCh) || isASCII(ch))) {
462 if (toASCIIUpper(oldCh) == toASCIIUpper(ch))
463 continue;
464 } else if (areCanonicallyEquivalent(oldCh, ch, unicode ? CanonicalMode::Unicode : CanonicalMode::UCS2))
465 continue;
466 }
467
468 input.uncheckInput(matchSize);
469 return false;
470 }
471
472 return true;
473 }
474
475 bool matchAssertionBOL(ByteTerm& term)
476 {
477 return (input.atStart(term.inputPosition)) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition + 1)));
478 }
479
480 bool matchAssertionEOL(ByteTerm& term)
481 {
482 if (term.inputPosition)
483 return (input.atEnd(term.inputPosition)) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition)));
484
485 return (input.atEnd()) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.read()));
486 }
487
488 bool matchAssertionWordBoundary(ByteTerm& term)
489 {
490 bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition + 1));
491 bool readIsWordchar;
492 if (term.inputPosition)
493 readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
494 else
495 readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
496
497 bool wordBoundary = prevIsWordchar != readIsWordchar;
498 return term.invert() ? !wordBoundary : wordBoundary;
499 }
500
501 bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
502 {
503 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
504
505 switch (term.atom.quantityType) {
506 case QuantifierFixedCount:
507 break;
508
509 case QuantifierGreedy:
510 if (backTrack->matchAmount) {
511 --backTrack->matchAmount;
512 input.uncheckInput(U16_LENGTH(term.atom.patternCharacter));
513 return true;
514 }
515 break;
516
517 case QuantifierNonGreedy:
518 if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
519 ++backTrack->matchAmount;
520 if (checkCharacter(term.atom.patternCharacter, term.inputPosition + 1))
521 return true;
522 }
523 input.setPos(backTrack->begin);
524 break;
525 }
526
527 return false;
528 }
529
530 bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
531 {
532 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
533
534 switch (term.atom.quantityType) {
535 case QuantifierFixedCount:
536 break;
537
538 case QuantifierGreedy:
539 if (backTrack->matchAmount) {
540 --backTrack->matchAmount;
541 input.uncheckInput(1);
542 return true;
543 }
544 break;
545
546 case QuantifierNonGreedy:
547 if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
548 ++backTrack->matchAmount;
549 if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition + 1))
550 return true;
551 }
552 input.uncheckInput(backTrack->matchAmount);
553 break;
554 }
555
556 return false;
557 }
558
559 bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
560 {
561 ASSERT(term.type == ByteTerm::TypeCharacterClass);
562 BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation);
563
564 switch (term.atom.quantityType) {
565 case QuantifierFixedCount: {
566 if (unicode) {
567 CharacterClass* charClass = term.atom.characterClass;
568 backTrack->begin = input.getPos();
569 unsigned matchAmount = 0;
570 for (matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) {
571 if (term.invert()) {
572 if (!checkCharacterClass(charClass, term.invert(), term.inputPosition - matchAmount)) {
573 input.setPos(backTrack->begin);
574 return false;
575 }
576 } else {
577 unsigned matchOffset = matchAmount * (charClass->hasOnlyNonBMPCharacters() ? 2 : 1);
578 if (!checkCharacterClassDontAdvanceInputForNonBMP(charClass, term.inputPosition - matchOffset)) {
579 input.setPos(backTrack->begin);
580 return false;
581 }
582 }
583 }
584
585 return true;
586 }
587
588 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) {
589 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount))
590 return false;
591 }
592 return true;
593 }
594
595 case QuantifierGreedy: {
596 unsigned position = input.getPos();
597 backTrack->begin = position;
598 unsigned matchAmount = 0;
599 while ((matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
600 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) {
601 input.setPos(position);
602 break;
603 }
604 ++matchAmount;
605 position = input.getPos();
606 }
607 backTrack->matchAmount = matchAmount;
608
609 return true;
610 }
611
612 case QuantifierNonGreedy:
613 backTrack->begin = input.getPos();
614 backTrack->matchAmount = 0;
615 return true;
616 }
617
618 RELEASE_ASSERT_NOT_REACHED();
619 return false;
620 }
621
622 bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
623 {
624 ASSERT(term.type == ByteTerm::TypeCharacterClass);
625 BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation);
626
627 switch (term.atom.quantityType) {
628 case QuantifierFixedCount:
629 if (unicode)
630 input.setPos(backTrack->begin);
631 break;
632
633 case QuantifierGreedy:
634 if (backTrack->matchAmount) {
635 if (unicode) {
636 // Rematch one less match
637 input.setPos(backTrack->begin);
638 --backTrack->matchAmount;
639 for (unsigned matchAmount = 0; (matchAmount < backTrack->matchAmount) && input.checkInput(1); ++matchAmount) {
640 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) {
641 input.uncheckInput(1);
642 break;
643 }
644 }
645 return true;
646 }
647 --backTrack->matchAmount;
648 input.uncheckInput(1);
649 return true;
650 }
651 break;
652
653 case QuantifierNonGreedy:
654 if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
655 ++backTrack->matchAmount;
656 if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1))
657 return true;
658 }
659 input.setPos(backTrack->begin);
660 break;
661 }
662
663 return false;
664 }
665
666 bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
667 {
668 ASSERT(term.type == ByteTerm::TypeBackReference);
669 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
670
671 unsigned matchBegin = output[(term.atom.subpatternId << 1)];
672 unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
673
674 // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that.
675 // In this case the result of match is empty string like when it references to a parentheses with zero-width match.
676 // Eg.: /(a\1)/
677 if (matchEnd == offsetNoMatch)
678 return true;
679
680 if (matchBegin == offsetNoMatch)
681 return true;
682
683 ASSERT(matchBegin <= matchEnd);
684
685 if (matchBegin == matchEnd)
686 return true;
687
688 switch (term.atom.quantityType) {
689 case QuantifierFixedCount: {
690 backTrack->begin = input.getPos();
691 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) {
692 if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
693 input.setPos(backTrack->begin);
694 return false;
695 }
696 }
697 return true;
698 }
699
700 case QuantifierGreedy: {
701 unsigned matchAmount = 0;
702 while ((matchAmount < term.atom.quantityMaxCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
703 ++matchAmount;
704 backTrack->matchAmount = matchAmount;
705 return true;
706 }
707
708 case QuantifierNonGreedy:
709 backTrack->begin = input.getPos();
710 backTrack->matchAmount = 0;
711 return true;
712 }
713
714 RELEASE_ASSERT_NOT_REACHED();
715 return false;
716 }
717
718 bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
719 {
720 ASSERT(term.type == ByteTerm::TypeBackReference);
721 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
722
723 unsigned matchBegin = output[(term.atom.subpatternId << 1)];
724 unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
725
726 if (matchBegin == offsetNoMatch)
727 return false;
728
729 ASSERT(matchBegin <= matchEnd);
730
731 if (matchBegin == matchEnd)
732 return false;
733
734 switch (term.atom.quantityType) {
735 case QuantifierFixedCount:
736 // for quantityMaxCount == 1, could rewind.
737 input.setPos(backTrack->begin);
738 break;
739
740 case QuantifierGreedy:
741 if (backTrack->matchAmount) {
742 --backTrack->matchAmount;
743 input.rewind(matchEnd - matchBegin);
744 return true;
745 }
746 break;
747
748 case QuantifierNonGreedy:
749 if ((backTrack->matchAmount < term.atom.quantityMaxCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
750 ++backTrack->matchAmount;
751 return true;
752 }
753 input.setPos(backTrack->begin);
754 break;
755 }
756
757 return false;
758 }
759
760 void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
761 {
762 if (term.capture()) {
763 unsigned subpatternId = term.atom.subpatternId;
764 output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin - term.inputPosition;
765 output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd - term.inputPosition;
766 }
767 }
768 void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
769 {
770 unsigned firstSubpatternId = term.atom.subpatternId;
771 unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
772 context->restoreOutput(output, firstSubpatternId, count);
773 }
774 JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
775 {
776 while (backTrack->matchAmount) {
777 ParenthesesDisjunctionContext* context = backTrack->lastContext;
778
779 JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true);
780 if (result == JSRegExpMatch)
781 return JSRegExpMatch;
782
783 resetMatches(term, context);
784 popParenthesesDisjunctionContext(backTrack);
785 freeParenthesesDisjunctionContext(context);
786
787 if (result != JSRegExpNoMatch)
788 return result;
789 }
790
791 return JSRegExpNoMatch;
792 }
793
794 bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
795 {
796 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
797 ASSERT(term.atom.quantityMaxCount == 1);
798
799 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
800
801 switch (term.atom.quantityType) {
802 case QuantifierGreedy: {
803 // set this speculatively; if we get to the parens end this will be true.
804 backTrack->begin = input.getPos();
805 break;
806 }
807 case QuantifierNonGreedy: {
808 backTrack->begin = notFound;
809 context->term += term.atom.parenthesesWidth;
810 return true;
811 }
812 case QuantifierFixedCount:
813 break;
814 }
815
816 if (term.capture()) {
817 unsigned subpatternId = term.atom.subpatternId;
818 output[(subpatternId << 1)] = input.getPos() - term.inputPosition;
819 }
820
821 return true;
822 }
823
824 bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
825 {
826 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
827 ASSERT(term.atom.quantityMaxCount == 1);
828
829 if (term.capture()) {
830 unsigned subpatternId = term.atom.subpatternId;
831 output[(subpatternId << 1) + 1] = input.getPos() - term.inputPosition;
832 }
833
834 if (term.atom.quantityType == QuantifierFixedCount)
835 return true;
836
837 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
838 return backTrack->begin != input.getPos();
839 }
840
841 bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
842 {
843 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
844 ASSERT(term.atom.quantityMaxCount == 1);
845
846 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
847
848 if (term.capture()) {
849 unsigned subpatternId = term.atom.subpatternId;
850 output[(subpatternId << 1)] = offsetNoMatch;
851 output[(subpatternId << 1) + 1] = offsetNoMatch;
852 }
853
854 switch (term.atom.quantityType) {
855 case QuantifierGreedy:
856 // if we backtrack to this point, there is another chance - try matching nothing.
857 ASSERT(backTrack->begin != notFound);
858 backTrack->begin = notFound;
859 context->term += term.atom.parenthesesWidth;
860 return true;
861 case QuantifierNonGreedy:
862 ASSERT(backTrack->begin != notFound);
863 FALLTHROUGH;
864 case QuantifierFixedCount:
865 break;
866 }
867
868 return false;
869 }
870
871 bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
872 {
873 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
874 ASSERT(term.atom.quantityMaxCount == 1);
875
876 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
877
878 switch (term.atom.quantityType) {
879 case QuantifierGreedy:
880 if (backTrack->begin == notFound) {
881 context->term -= term.atom.parenthesesWidth;
882 return false;
883 }
884 FALLTHROUGH;
885 case QuantifierNonGreedy:
886 if (backTrack->begin == notFound) {
887 backTrack->begin = input.getPos();
888 if (term.capture()) {
889 // Technically this access to inputPosition should be accessing the begin term's
890 // inputPosition, but for repeats other than fixed these values should be
891 // the same anyway! (We don't pre-check for greedy or non-greedy matches.)
892 ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
893 ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition);
894 unsigned subpatternId = term.atom.subpatternId;
895 output[subpatternId << 1] = input.getPos() - term.inputPosition;
896 }
897 context->term -= term.atom.parenthesesWidth;
898 return true;
899 }
900 FALLTHROUGH;
901 case QuantifierFixedCount:
902 break;
903 }
904
905 return false;
906 }
907
908 bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
909 {
910 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
911 ASSERT(term.atom.quantityType == QuantifierGreedy);
912 ASSERT(term.atom.quantityMaxCount == quantifyInfinite);
913 ASSERT(!term.capture());
914
915 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
916 backTrack->begin = input.getPos();
917 return true;
918 }
919
920 bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context)
921 {
922 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd);
923
924 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
925 // Empty match is a failed match.
926 if (backTrack->begin == input.getPos())
927 return false;
928
929 // Successful match! Okay, what's next? - loop around and try to match more!
930 context->term -= (term.atom.parenthesesWidth + 1);
931 return true;
932 }
933
934 bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
935 {
936 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
937 ASSERT(term.atom.quantityType == QuantifierGreedy);
938 ASSERT(term.atom.quantityMaxCount == quantifyInfinite);
939 ASSERT(!term.capture());
940
941 // If we backtrack to this point, we have failed to match this iteration of the parens.
942 // Since this is greedy / zero minimum a failed is also accepted as a match!
943 context->term += term.atom.parenthesesWidth;
944 return true;
945 }
946
947 bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*)
948 {
949 // 'Terminal' parentheses are at the end of the regex, and as such a match past end
950 // should always be returned as a successful match - we should never backtrack to here.
951 RELEASE_ASSERT_NOT_REACHED();
952 return false;
953 }
954
955 bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
956 {
957 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
958 ASSERT(term.atom.quantityMaxCount == 1);
959
960 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
961
962 backTrack->begin = input.getPos();
963 return true;
964 }
965
966 bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
967 {
968 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
969 ASSERT(term.atom.quantityMaxCount == 1);
970
971 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
972
973 input.setPos(backTrack->begin);
974
975 // We've reached the end of the parens; if they are inverted, this is failure.
976 if (term.invert()) {
977 context->term -= term.atom.parenthesesWidth;
978 return false;
979 }
980
981 return true;
982 }
983
984 bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
985 {
986 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
987 ASSERT(term.atom.quantityMaxCount == 1);
988
989 // We've failed to match parens; if they are inverted, this is win!
990 if (term.invert()) {
991 context->term += term.atom.parenthesesWidth;
992 return true;
993 }
994
995 return false;
996 }
997
998 bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
999 {
1000 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
1001 ASSERT(term.atom.quantityMaxCount == 1);
1002
1003 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
1004
1005 input.setPos(backTrack->begin);
1006
1007 context->term -= term.atom.parenthesesWidth;
1008 return false;
1009 }
1010
1011 JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context)
1012 {
1013 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
1014
1015 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
1016 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
1017
1018 backTrack->matchAmount = 0;
1019 backTrack->lastContext = 0;
1020
1021 ASSERT(term.atom.quantityType != QuantifierFixedCount || term.atom.quantityMinCount == term.atom.quantityMaxCount);
1022
1023 unsigned minimumMatchCount = term.atom.quantityMinCount;
1024 JSRegExpResult fixedMatchResult;
1025
1026 // Handle fixed matches and the minimum part of a variable length match.
1027 if (minimumMatchCount) {
1028 // While we haven't yet reached our fixed limit,
1029 while (backTrack->matchAmount < minimumMatchCount) {
1030 // Try to do a match, and it it succeeds, add it to the list.
1031 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1032 fixedMatchResult = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1033 if (fixedMatchResult == JSRegExpMatch)
1034 appendParenthesesDisjunctionContext(backTrack, context);
1035 else {
1036 // The match failed; try to find an alternate point to carry on from.
1037 resetMatches(term, context);
1038 freeParenthesesDisjunctionContext(context);
1039
1040 if (fixedMatchResult != JSRegExpNoMatch)
1041 return fixedMatchResult;
1042 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
1043 if (backtrackResult != JSRegExpMatch)
1044 return backtrackResult;
1045 }
1046 }
1047
1048 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1049 recordParenthesesMatch(term, context);
1050 }
1051
1052 switch (term.atom.quantityType) {
1053 case QuantifierFixedCount: {
1054 ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount);
1055 return JSRegExpMatch;
1056 }
1057
1058 case QuantifierGreedy: {
1059 while (backTrack->matchAmount < term.atom.quantityMaxCount) {
1060 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1061 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1062 if (result == JSRegExpMatch)
1063 appendParenthesesDisjunctionContext(backTrack, context);
1064 else {
1065 resetMatches(term, context);
1066 freeParenthesesDisjunctionContext(context);
1067
1068 if (result != JSRegExpNoMatch)
1069 return result;
1070
1071 break;
1072 }
1073 }
1074
1075 if (backTrack->matchAmount) {
1076 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1077 recordParenthesesMatch(term, context);
1078 }
1079 return JSRegExpMatch;
1080 }
1081
1082 case QuantifierNonGreedy:
1083 return JSRegExpMatch;
1084 }
1085
1086 RELEASE_ASSERT_NOT_REACHED();
1087 return JSRegExpErrorNoMatch;
1088 }
1089
1090 // Rules for backtracking differ depending on whether this is greedy or non-greedy.
1091 //
1092 // Greedy matches never should try just adding more - you should already have done
1093 // the 'more' cases. Always backtrack, at least a leetle bit. However cases where
1094 // you backtrack an item off the list needs checking, since we'll never have matched
1095 // the one less case. Tracking forwards, still add as much as possible.
1096 //
1097 // Non-greedy, we've already done the one less case, so don't match on popping.
1098 // We haven't done the one more case, so always try to add that.
1099 //
1100 JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
1101 {
1102 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
1103
1104 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
1105 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
1106
1107 switch (term.atom.quantityType) {
1108 case QuantifierFixedCount: {
1109 ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount);
1110
1111 ParenthesesDisjunctionContext* context = 0;
1112 JSRegExpResult result = parenthesesDoBacktrack(term, backTrack);
1113
1114 if (result != JSRegExpMatch)
1115 return result;
1116
1117 // While we haven't yet reached our fixed limit,
1118 while (backTrack->matchAmount < term.atom.quantityMaxCount) {
1119 // Try to do a match, and it it succeeds, add it to the list.
1120 context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1121 result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1122
1123 if (result == JSRegExpMatch)
1124 appendParenthesesDisjunctionContext(backTrack, context);
1125 else {
1126 // The match failed; try to find an alternate point to carry on from.
1127 resetMatches(term, context);
1128 freeParenthesesDisjunctionContext(context);
1129
1130 if (result != JSRegExpNoMatch)
1131 return result;
1132 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
1133 if (backtrackResult != JSRegExpMatch)
1134 return backtrackResult;
1135 }
1136 }
1137
1138 ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount);
1139 context = backTrack->lastContext;
1140 recordParenthesesMatch(term, context);
1141 return JSRegExpMatch;
1142 }
1143
1144 case QuantifierGreedy: {
1145 if (!backTrack->matchAmount)
1146 return JSRegExpNoMatch;
1147
1148 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1149 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
1150 if (result == JSRegExpMatch) {
1151 while (backTrack->matchAmount < term.atom.quantityMaxCount) {
1152 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1153 JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1154 if (parenthesesResult == JSRegExpMatch)
1155 appendParenthesesDisjunctionContext(backTrack, context);
1156 else {
1157 resetMatches(term, context);
1158 freeParenthesesDisjunctionContext(context);
1159
1160 if (parenthesesResult != JSRegExpNoMatch)
1161 return parenthesesResult;
1162
1163 break;
1164 }
1165 }
1166 } else {
1167 resetMatches(term, context);
1168 popParenthesesDisjunctionContext(backTrack);
1169 freeParenthesesDisjunctionContext(context);
1170
1171 if (result != JSRegExpNoMatch || backTrack->matchAmount < term.atom.quantityMinCount)
1172 return result;
1173 }
1174
1175 if (backTrack->matchAmount) {
1176 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1177 recordParenthesesMatch(term, context);
1178 }
1179 return JSRegExpMatch;
1180 }
1181
1182 case QuantifierNonGreedy: {
1183 // If we've not reached the limit, try to add one more match.
1184 if (backTrack->matchAmount < term.atom.quantityMaxCount) {
1185 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1186 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1187 if (result == JSRegExpMatch) {
1188 appendParenthesesDisjunctionContext(backTrack, context);
1189 recordParenthesesMatch(term, context);
1190 return JSRegExpMatch;
1191 }
1192
1193 resetMatches(term, context);
1194 freeParenthesesDisjunctionContext(context);
1195
1196 if (result != JSRegExpNoMatch)
1197 return result;
1198 }
1199
1200 // Nope - okay backtrack looking for an alternative.
1201 while (backTrack->matchAmount) {
1202 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1203 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
1204 if (result == JSRegExpMatch) {
1205 // successful backtrack! we're back in the game!
1206 if (backTrack->matchAmount) {
1207 context = backTrack->lastContext;
1208 recordParenthesesMatch(term, context);
1209 }
1210 return JSRegExpMatch;
1211 }
1212
1213 // pop a match off the stack
1214 resetMatches(term, context);
1215 popParenthesesDisjunctionContext(backTrack);
1216 freeParenthesesDisjunctionContext(context);
1217
1218 if (result != JSRegExpNoMatch)
1219 return result;
1220 }
1221
1222 return JSRegExpNoMatch;
1223 }
1224 }
1225
1226 RELEASE_ASSERT_NOT_REACHED();
1227 return JSRegExpErrorNoMatch;
1228 }
1229
1230 bool matchDotStarEnclosure(ByteTerm& term, DisjunctionContext* context)
1231 {
1232 UNUSED_PARAM(term);
1233
1234 if (pattern->dotAll()) {
1235 context->matchBegin = startOffset;
1236 context->matchEnd = input.end();
1237 return true;
1238 }
1239
1240 unsigned matchBegin = context->matchBegin;
1241
1242 if (matchBegin > startOffset) {
1243 for (matchBegin--; true; matchBegin--) {
1244 if (testCharacterClass(pattern->newlineCharacterClass, input.reread(matchBegin))) {
1245 ++matchBegin;
1246 break;
1247 }
1248
1249 if (matchBegin == startOffset)
1250 break;
1251 }
1252 }
1253
1254 unsigned matchEnd = input.getPos();
1255
1256 for (; (matchEnd != input.end())
1257 && (!testCharacterClass(pattern->newlineCharacterClass, input.reread(matchEnd))); matchEnd++) { }
1258
1259 if (((matchBegin && term.anchors.m_bol)
1260 || ((matchEnd != input.end()) && term.anchors.m_eol))
1261 && !pattern->multiline())
1262 return false;
1263
1264 context->matchBegin = matchBegin;
1265 context->matchEnd = matchEnd;
1266 return true;
1267 }
1268
1269#define MATCH_NEXT() { ++context->term; goto matchAgain; }
1270#define BACKTRACK() { --context->term; goto backtrack; }
1271#define currentTerm() (disjunction->terms[context->term])
1272 JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1273 {
1274 if (!--remainingMatchCount)
1275 return JSRegExpErrorHitLimit;
1276
1277 if (btrack)
1278 BACKTRACK();
1279
1280 context->matchBegin = input.getPos();
1281 context->term = 0;
1282
1283 matchAgain:
1284 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1285
1286 switch (currentTerm().type) {
1287 case ByteTerm::TypeSubpatternBegin:
1288 MATCH_NEXT();
1289 case ByteTerm::TypeSubpatternEnd:
1290 context->matchEnd = input.getPos();
1291 return JSRegExpMatch;
1292
1293 case ByteTerm::TypeBodyAlternativeBegin:
1294 MATCH_NEXT();
1295 case ByteTerm::TypeBodyAlternativeDisjunction:
1296 case ByteTerm::TypeBodyAlternativeEnd:
1297 context->matchEnd = input.getPos();
1298 return JSRegExpMatch;
1299
1300 case ByteTerm::TypeAlternativeBegin:
1301 MATCH_NEXT();
1302 case ByteTerm::TypeAlternativeDisjunction:
1303 case ByteTerm::TypeAlternativeEnd: {
1304 int offset = currentTerm().alternative.end;
1305 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1306 backTrack->offset = offset;
1307 context->term += offset;
1308 MATCH_NEXT();
1309 }
1310
1311 case ByteTerm::TypeAssertionBOL:
1312 if (matchAssertionBOL(currentTerm()))
1313 MATCH_NEXT();
1314 BACKTRACK();
1315 case ByteTerm::TypeAssertionEOL:
1316 if (matchAssertionEOL(currentTerm()))
1317 MATCH_NEXT();
1318 BACKTRACK();
1319 case ByteTerm::TypeAssertionWordBoundary:
1320 if (matchAssertionWordBoundary(currentTerm()))
1321 MATCH_NEXT();
1322 BACKTRACK();
1323
1324 case ByteTerm::TypePatternCharacterOnce:
1325 case ByteTerm::TypePatternCharacterFixed: {
1326 if (unicode) {
1327 if (!U_IS_BMP(currentTerm().atom.patternCharacter)) {
1328 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
1329 if (!checkSurrogatePair(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 2 * matchAmount)) {
1330 BACKTRACK();
1331 }
1332 }
1333 MATCH_NEXT();
1334 }
1335 }
1336 unsigned position = input.getPos(); // May need to back out reading a surrogate pair.
1337
1338 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
1339 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount)) {
1340 input.setPos(position);
1341 BACKTRACK();
1342 }
1343 }
1344 MATCH_NEXT();
1345 }
1346 case ByteTerm::TypePatternCharacterGreedy: {
1347 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1348 unsigned matchAmount = 0;
1349 unsigned position = input.getPos(); // May need to back out reading a surrogate pair.
1350 while ((matchAmount < currentTerm().atom.quantityMaxCount) && input.checkInput(1)) {
1351 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + 1)) {
1352 input.setPos(position);
1353 break;
1354 }
1355 ++matchAmount;
1356 position = input.getPos();
1357 }
1358 backTrack->matchAmount = matchAmount;
1359
1360 MATCH_NEXT();
1361 }
1362 case ByteTerm::TypePatternCharacterNonGreedy: {
1363 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1364 backTrack->begin = input.getPos();
1365 backTrack->matchAmount = 0;
1366 MATCH_NEXT();
1367 }
1368
1369 case ByteTerm::TypePatternCasedCharacterOnce:
1370 case ByteTerm::TypePatternCasedCharacterFixed: {
1371 if (unicode) {
1372 // Case insensitive matching of unicode characters is handled as TypeCharacterClass.
1373 ASSERT(U_IS_BMP(currentTerm().atom.patternCharacter));
1374
1375 unsigned position = input.getPos(); // May need to back out reading a surrogate pair.
1376
1377 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
1378 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount)) {
1379 input.setPos(position);
1380 BACKTRACK();
1381 }
1382 }
1383 MATCH_NEXT();
1384 }
1385
1386 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
1387 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount))
1388 BACKTRACK();
1389 }
1390 MATCH_NEXT();
1391 }
1392 case ByteTerm::TypePatternCasedCharacterGreedy: {
1393 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1394
1395 // Case insensitive matching of unicode characters is handled as TypeCharacterClass.
1396 ASSERT(!unicode || U_IS_BMP(currentTerm().atom.patternCharacter));
1397
1398 unsigned matchAmount = 0;
1399 while ((matchAmount < currentTerm().atom.quantityMaxCount) && input.checkInput(1)) {
1400 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + 1)) {
1401 input.uncheckInput(1);
1402 break;
1403 }
1404 ++matchAmount;
1405 }
1406 backTrack->matchAmount = matchAmount;
1407
1408 MATCH_NEXT();
1409 }
1410 case ByteTerm::TypePatternCasedCharacterNonGreedy: {
1411 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1412
1413 // Case insensitive matching of unicode characters is handled as TypeCharacterClass.
1414 ASSERT(!unicode || U_IS_BMP(currentTerm().atom.patternCharacter));
1415
1416 backTrack->matchAmount = 0;
1417 MATCH_NEXT();
1418 }
1419
1420 case ByteTerm::TypeCharacterClass:
1421 if (matchCharacterClass(currentTerm(), context))
1422 MATCH_NEXT();
1423 BACKTRACK();
1424 case ByteTerm::TypeBackReference:
1425 if (matchBackReference(currentTerm(), context))
1426 MATCH_NEXT();
1427 BACKTRACK();
1428 case ByteTerm::TypeParenthesesSubpattern: {
1429 JSRegExpResult result = matchParentheses(currentTerm(), context);
1430
1431 if (result == JSRegExpMatch) {
1432 MATCH_NEXT();
1433 } else if (result != JSRegExpNoMatch)
1434 return result;
1435
1436 BACKTRACK();
1437 }
1438 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1439 if (matchParenthesesOnceBegin(currentTerm(), context))
1440 MATCH_NEXT();
1441 BACKTRACK();
1442 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1443 if (matchParenthesesOnceEnd(currentTerm(), context))
1444 MATCH_NEXT();
1445 BACKTRACK();
1446 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
1447 if (matchParenthesesTerminalBegin(currentTerm(), context))
1448 MATCH_NEXT();
1449 BACKTRACK();
1450 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
1451 if (matchParenthesesTerminalEnd(currentTerm(), context))
1452 MATCH_NEXT();
1453 BACKTRACK();
1454 case ByteTerm::TypeParentheticalAssertionBegin:
1455 if (matchParentheticalAssertionBegin(currentTerm(), context))
1456 MATCH_NEXT();
1457 BACKTRACK();
1458 case ByteTerm::TypeParentheticalAssertionEnd:
1459 if (matchParentheticalAssertionEnd(currentTerm(), context))
1460 MATCH_NEXT();
1461 BACKTRACK();
1462
1463 case ByteTerm::TypeCheckInput:
1464 if (input.checkInput(currentTerm().checkInputCount))
1465 MATCH_NEXT();
1466 BACKTRACK();
1467
1468 case ByteTerm::TypeUncheckInput:
1469 input.uncheckInput(currentTerm().checkInputCount);
1470 MATCH_NEXT();
1471
1472 case ByteTerm::TypeDotStarEnclosure:
1473 if (matchDotStarEnclosure(currentTerm(), context))
1474 return JSRegExpMatch;
1475 BACKTRACK();
1476 }
1477
1478 // We should never fall-through to here.
1479 RELEASE_ASSERT_NOT_REACHED();
1480
1481 backtrack:
1482 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1483
1484 switch (currentTerm().type) {
1485 case ByteTerm::TypeSubpatternBegin:
1486 return JSRegExpNoMatch;
1487 case ByteTerm::TypeSubpatternEnd:
1488 RELEASE_ASSERT_NOT_REACHED();
1489
1490 case ByteTerm::TypeBodyAlternativeBegin:
1491 case ByteTerm::TypeBodyAlternativeDisjunction: {
1492 int offset = currentTerm().alternative.next;
1493 context->term += offset;
1494 if (offset > 0)
1495 MATCH_NEXT();
1496
1497 if (input.atEnd() || pattern->sticky())
1498 return JSRegExpNoMatch;
1499
1500 input.next();
1501
1502 context->matchBegin = input.getPos();
1503
1504 if (currentTerm().alternative.onceThrough)
1505 context->term += currentTerm().alternative.next;
1506
1507 MATCH_NEXT();
1508 }
1509 case ByteTerm::TypeBodyAlternativeEnd:
1510 RELEASE_ASSERT_NOT_REACHED();
1511
1512 case ByteTerm::TypeAlternativeBegin:
1513 case ByteTerm::TypeAlternativeDisjunction: {
1514 int offset = currentTerm().alternative.next;
1515 context->term += offset;
1516 if (offset > 0)
1517 MATCH_NEXT();
1518 BACKTRACK();
1519 }
1520 case ByteTerm::TypeAlternativeEnd: {
1521 // We should never backtrack back into an alternative of the main body of the regex.
1522 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1523 unsigned offset = backTrack->offset;
1524 context->term -= offset;
1525 BACKTRACK();
1526 }
1527
1528 case ByteTerm::TypeAssertionBOL:
1529 case ByteTerm::TypeAssertionEOL:
1530 case ByteTerm::TypeAssertionWordBoundary:
1531 BACKTRACK();
1532
1533 case ByteTerm::TypePatternCharacterOnce:
1534 case ByteTerm::TypePatternCharacterFixed:
1535 case ByteTerm::TypePatternCharacterGreedy:
1536 case ByteTerm::TypePatternCharacterNonGreedy:
1537 if (backtrackPatternCharacter(currentTerm(), context))
1538 MATCH_NEXT();
1539 BACKTRACK();
1540 case ByteTerm::TypePatternCasedCharacterOnce:
1541 case ByteTerm::TypePatternCasedCharacterFixed:
1542 case ByteTerm::TypePatternCasedCharacterGreedy:
1543 case ByteTerm::TypePatternCasedCharacterNonGreedy:
1544 if (backtrackPatternCasedCharacter(currentTerm(), context))
1545 MATCH_NEXT();
1546 BACKTRACK();
1547 case ByteTerm::TypeCharacterClass:
1548 if (backtrackCharacterClass(currentTerm(), context))
1549 MATCH_NEXT();
1550 BACKTRACK();
1551 case ByteTerm::TypeBackReference:
1552 if (backtrackBackReference(currentTerm(), context))
1553 MATCH_NEXT();
1554 BACKTRACK();
1555 case ByteTerm::TypeParenthesesSubpattern: {
1556 JSRegExpResult result = backtrackParentheses(currentTerm(), context);
1557
1558 if (result == JSRegExpMatch) {
1559 MATCH_NEXT();
1560 } else if (result != JSRegExpNoMatch)
1561 return result;
1562
1563 BACKTRACK();
1564 }
1565 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1566 if (backtrackParenthesesOnceBegin(currentTerm(), context))
1567 MATCH_NEXT();
1568 BACKTRACK();
1569 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1570 if (backtrackParenthesesOnceEnd(currentTerm(), context))
1571 MATCH_NEXT();
1572 BACKTRACK();
1573 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
1574 if (backtrackParenthesesTerminalBegin(currentTerm(), context))
1575 MATCH_NEXT();
1576 BACKTRACK();
1577 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
1578 if (backtrackParenthesesTerminalEnd(currentTerm(), context))
1579 MATCH_NEXT();
1580 BACKTRACK();
1581 case ByteTerm::TypeParentheticalAssertionBegin:
1582 if (backtrackParentheticalAssertionBegin(currentTerm(), context))
1583 MATCH_NEXT();
1584 BACKTRACK();
1585 case ByteTerm::TypeParentheticalAssertionEnd:
1586 if (backtrackParentheticalAssertionEnd(currentTerm(), context))
1587 MATCH_NEXT();
1588 BACKTRACK();
1589
1590 case ByteTerm::TypeCheckInput:
1591 input.uncheckInput(currentTerm().checkInputCount);
1592 BACKTRACK();
1593
1594 case ByteTerm::TypeUncheckInput:
1595 input.checkInput(currentTerm().checkInputCount);
1596 BACKTRACK();
1597
1598 case ByteTerm::TypeDotStarEnclosure:
1599 RELEASE_ASSERT_NOT_REACHED();
1600 }
1601
1602 RELEASE_ASSERT_NOT_REACHED();
1603 return JSRegExpErrorNoMatch;
1604 }
1605
1606 JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1607 {
1608 JSRegExpResult result = matchDisjunction(disjunction, context, btrack);
1609
1610 if (result == JSRegExpMatch) {
1611 while (context->matchBegin == context->matchEnd) {
1612 result = matchDisjunction(disjunction, context, true);
1613 if (result != JSRegExpMatch)
1614 return result;
1615 }
1616 return JSRegExpMatch;
1617 }
1618
1619 return result;
1620 }
1621
1622 unsigned interpret()
1623 {
1624 // FIXME: https://bugs.webkit.org/show_bug.cgi?id=195970
1625 // [Yarr Interpreter] The interpreter doesn't have checks for stack overflow due to deep recursion
1626 if (!input.isAvailableInput(0))
1627 return offsetNoMatch;
1628
1629 if (pattern->m_lock)
1630 pattern->m_lock->lock();
1631
1632 for (unsigned i = 0; i < pattern->m_body->m_numSubpatterns + 1; ++i)
1633 output[i << 1] = offsetNoMatch;
1634
1635 allocatorPool = pattern->m_allocator->startAllocator();
1636 RELEASE_ASSERT(allocatorPool);
1637
1638 DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
1639
1640 JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false);
1641 if (result == JSRegExpMatch) {
1642 output[0] = context->matchBegin;
1643 output[1] = context->matchEnd;
1644 }
1645
1646 freeDisjunctionContext(context);
1647
1648 pattern->m_allocator->stopAllocator();
1649
1650 ASSERT((result == JSRegExpMatch) == (output[0] != offsetNoMatch));
1651
1652 if (pattern->m_lock)
1653 pattern->m_lock->unlock();
1654
1655 return output[0];
1656 }
1657
1658 Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start)
1659 : pattern(pattern)
1660 , unicode(pattern->unicode())
1661 , output(output)
1662 , input(input, start, length, pattern->unicode())
1663 , startOffset(start)
1664 , remainingMatchCount(matchLimit)
1665 {
1666 }
1667
1668private:
1669 BytecodePattern* pattern;
1670 bool unicode;
1671 unsigned* output;
1672 InputStream input;
1673 WTF::BumpPointerPool* allocatorPool { nullptr };
1674 unsigned startOffset;
1675 unsigned remainingMatchCount;
1676};
1677
1678class ByteCompiler {
1679 struct ParenthesesStackEntry {
1680 unsigned beginTerm;
1681 unsigned savedAlternativeIndex;
1682 ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
1683 : beginTerm(beginTerm)
1684 , savedAlternativeIndex(savedAlternativeIndex)
1685 {
1686 }
1687 };
1688
1689public:
1690 ByteCompiler(YarrPattern& pattern)
1691 : m_pattern(pattern)
1692 {
1693 }
1694
1695 std::unique_ptr<BytecodePattern> compile(BumpPointerAllocator* allocator, ConcurrentJSLock* lock, ErrorCode& errorCode)
1696 {
1697 regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough());
1698 if (auto error = emitDisjunction(m_pattern.m_body, 0, 0)) {
1699 errorCode = error.value();
1700 return nullptr;
1701 }
1702 regexEnd();
1703
1704#ifndef NDEBUG
1705 if (Options::dumpCompiledRegExpPatterns())
1706 dumpDisjunction(m_bodyDisjunction.get());
1707#endif
1708
1709 return std::make_unique<BytecodePattern>(WTFMove(m_bodyDisjunction), m_allParenthesesInfo, m_pattern, allocator, lock);
1710 }
1711
1712 void checkInput(unsigned count)
1713 {
1714 m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count));
1715 }
1716
1717 void uncheckInput(unsigned count)
1718 {
1719 m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count));
1720 }
1721
1722 void assertionBOL(unsigned inputPosition)
1723 {
1724 m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
1725 }
1726
1727 void assertionEOL(unsigned inputPosition)
1728 {
1729 m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
1730 }
1731
1732 void assertionWordBoundary(bool invert, unsigned inputPosition)
1733 {
1734 m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
1735 }
1736
1737 void atomPatternCharacter(UChar32 ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1738 {
1739 if (m_pattern.ignoreCase()) {
1740 UChar32 lo = u_tolower(ch);
1741 UChar32 hi = u_toupper(ch);
1742
1743 if (lo != hi) {
1744 m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityMaxCount, quantityType));
1745 return;
1746 }
1747 }
1748
1749 m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityMaxCount, quantityType));
1750 }
1751
1752 void atomCharacterClass(CharacterClass* characterClass, bool invert, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1753 {
1754 m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
1755
1756 m_bodyDisjunction->terms.last().atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1757 m_bodyDisjunction->terms.last().atom.quantityType = quantityType;
1758 m_bodyDisjunction->terms.last().frameLocation = frameLocation;
1759 }
1760
1761 void atomBackReference(unsigned subpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1762 {
1763 ASSERT(subpatternId);
1764
1765 m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
1766
1767 m_bodyDisjunction->terms.last().atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1768 m_bodyDisjunction->terms.last().atom.quantityType = quantityType;
1769 m_bodyDisjunction->terms.last().frameLocation = frameLocation;
1770 }
1771
1772 void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1773 {
1774 unsigned beginTerm = m_bodyDisjunction->terms.size();
1775
1776 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
1777 m_bodyDisjunction->terms.last().frameLocation = frameLocation;
1778 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1779 m_bodyDisjunction->terms.last().frameLocation = alternativeFrameLocation;
1780
1781 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1782 m_currentAlternativeIndex = beginTerm + 1;
1783 }
1784
1785 void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1786 {
1787 unsigned beginTerm = m_bodyDisjunction->terms.size();
1788
1789 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition));
1790 m_bodyDisjunction->terms.last().frameLocation = frameLocation;
1791 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1792 m_bodyDisjunction->terms.last().frameLocation = alternativeFrameLocation;
1793
1794 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1795 m_currentAlternativeIndex = beginTerm + 1;
1796 }
1797
1798 void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1799 {
1800 // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin,
1801 // then fix this up at the end! - simplifying this should make it much clearer.
1802 // https://bugs.webkit.org/show_bug.cgi?id=50136
1803
1804 unsigned beginTerm = m_bodyDisjunction->terms.size();
1805
1806 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
1807 m_bodyDisjunction->terms.last().frameLocation = frameLocation;
1808 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1809 m_bodyDisjunction->terms.last().frameLocation = alternativeFrameLocation;
1810
1811 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1812 m_currentAlternativeIndex = beginTerm + 1;
1813 }
1814
1815 void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
1816 {
1817 unsigned beginTerm = m_bodyDisjunction->terms.size();
1818
1819 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0));
1820 m_bodyDisjunction->terms.last().frameLocation = frameLocation;
1821 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1822 m_bodyDisjunction->terms.last().frameLocation = alternativeFrameLocation;
1823
1824 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1825 m_currentAlternativeIndex = beginTerm + 1;
1826 }
1827
1828 void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1829 {
1830 unsigned beginTerm = popParenthesesStack();
1831 closeAlternative(beginTerm + 1);
1832 unsigned endTerm = m_bodyDisjunction->terms.size();
1833
1834 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin);
1835
1836 bool invert = m_bodyDisjunction->terms[beginTerm].invert();
1837 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1838
1839 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition));
1840 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1841 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1842 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1843
1844 m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1845 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1846 m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1847 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1848 }
1849
1850 void assertionDotStarEnclosure(bool bolAnchored, bool eolAnchored)
1851 {
1852 m_bodyDisjunction->terms.append(ByteTerm::DotStarEnclosure(bolAnchored, eolAnchored));
1853 }
1854
1855 unsigned popParenthesesStack()
1856 {
1857 ASSERT(m_parenthesesStack.size());
1858 unsigned beginTerm = m_parenthesesStack.last().beginTerm;
1859 m_currentAlternativeIndex = m_parenthesesStack.last().savedAlternativeIndex;
1860 m_parenthesesStack.removeLast();
1861
1862 ASSERT(beginTerm < m_bodyDisjunction->terms.size());
1863 ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
1864
1865 return beginTerm;
1866 }
1867
1868 void closeAlternative(unsigned beginTerm)
1869 {
1870 unsigned origBeginTerm = beginTerm;
1871 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
1872 unsigned endIndex = m_bodyDisjunction->terms.size();
1873
1874 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1875
1876 if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
1877 m_bodyDisjunction->terms.remove(beginTerm);
1878 else {
1879 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1880 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1881 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
1882 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1883 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1884 }
1885
1886 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1887
1888 m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
1889 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1890 }
1891 }
1892
1893 void closeBodyAlternative()
1894 {
1895 unsigned beginTerm = 0;
1896 unsigned origBeginTerm = 0;
1897 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
1898 unsigned endIndex = m_bodyDisjunction->terms.size();
1899
1900 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1901
1902 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1903 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1904 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
1905 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1906 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1907 }
1908
1909 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1910
1911 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
1912 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1913 }
1914
1915 void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType, unsigned callFrameSize = 0)
1916 {
1917 unsigned beginTerm = popParenthesesStack();
1918 closeAlternative(beginTerm + 1);
1919 unsigned endTerm = m_bodyDisjunction->terms.size();
1920
1921 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1922
1923 ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
1924
1925 bool capture = parenthesesBegin.capture();
1926 unsigned subpatternId = parenthesesBegin.atom.subpatternId;
1927
1928 unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
1929 auto parenthesesDisjunction = std::make_unique<ByteDisjunction>(numSubpatterns, callFrameSize);
1930
1931 unsigned firstTermInParentheses = beginTerm + 1;
1932 parenthesesDisjunction->terms.reserveInitialCapacity(endTerm - firstTermInParentheses + 2);
1933
1934 parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
1935 for (unsigned termInParentheses = firstTermInParentheses; termInParentheses < endTerm; ++termInParentheses)
1936 parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
1937 parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
1938
1939 m_bodyDisjunction->terms.shrink(beginTerm);
1940
1941 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, inputPosition));
1942 m_allParenthesesInfo.append(WTFMove(parenthesesDisjunction));
1943
1944 m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1945 m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1946 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1947 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1948 }
1949
1950 void atomParenthesesOnceEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1951 {
1952 unsigned beginTerm = popParenthesesStack();
1953 closeAlternative(beginTerm + 1);
1954 unsigned endTerm = m_bodyDisjunction->terms.size();
1955
1956 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1957
1958 bool capture = m_bodyDisjunction->terms[beginTerm].capture();
1959 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1960
1961 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition));
1962 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1963 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1964 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1965
1966 m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1967 m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1968 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1969 m_bodyDisjunction->terms[endTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1970 m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1971 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1972 }
1973
1974 void atomParenthesesTerminalEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1975 {
1976 unsigned beginTerm = popParenthesesStack();
1977 closeAlternative(beginTerm + 1);
1978 unsigned endTerm = m_bodyDisjunction->terms.size();
1979
1980 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
1981
1982 bool capture = m_bodyDisjunction->terms[beginTerm].capture();
1983 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1984
1985 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition));
1986 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1987 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1988 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1989
1990 m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1991 m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1992 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1993 m_bodyDisjunction->terms[endTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1994 m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1995 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1996 }
1997
1998 void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough)
1999 {
2000 m_bodyDisjunction = std::make_unique<ByteDisjunction>(numSubpatterns, callFrameSize);
2001 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough));
2002 m_bodyDisjunction->terms[0].frameLocation = 0;
2003 m_currentAlternativeIndex = 0;
2004 }
2005
2006 void regexEnd()
2007 {
2008 closeBodyAlternative();
2009 }
2010
2011 void alternativeBodyDisjunction(bool onceThrough)
2012 {
2013 unsigned newAlternativeIndex = m_bodyDisjunction->terms.size();
2014 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
2015 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough));
2016
2017 m_currentAlternativeIndex = newAlternativeIndex;
2018 }
2019
2020 void alternativeDisjunction()
2021 {
2022 unsigned newAlternativeIndex = m_bodyDisjunction->terms.size();
2023 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
2024 m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
2025
2026 m_currentAlternativeIndex = newAlternativeIndex;
2027 }
2028
2029 Optional<ErrorCode> WARN_UNUSED_RETURN emitDisjunction(PatternDisjunction* disjunction, Checked<unsigned, RecordOverflow> inputCountAlreadyChecked, unsigned parenthesesInputCountAlreadyChecked)
2030 {
2031 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
2032 auto currentCountAlreadyChecked = inputCountAlreadyChecked;
2033
2034 PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
2035
2036 if (alt) {
2037 if (disjunction == m_pattern.m_body)
2038 alternativeBodyDisjunction(alternative->onceThrough());
2039 else
2040 alternativeDisjunction();
2041 }
2042
2043 unsigned minimumSize = alternative->m_minimumSize;
2044 ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked);
2045 unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
2046
2047 if (countToCheck) {
2048 checkInput(countToCheck);
2049 currentCountAlreadyChecked += countToCheck;
2050 if (currentCountAlreadyChecked.hasOverflowed())
2051 return ErrorCode::OffsetTooLarge;
2052 }
2053
2054 for (auto& term : alternative->m_terms) {
2055 switch (term.type) {
2056 case PatternTerm::TypeAssertionBOL:
2057 assertionBOL((currentCountAlreadyChecked - term.inputPosition).unsafeGet());
2058 break;
2059
2060 case PatternTerm::TypeAssertionEOL:
2061 assertionEOL((currentCountAlreadyChecked - term.inputPosition).unsafeGet());
2062 break;
2063
2064 case PatternTerm::TypeAssertionWordBoundary:
2065 assertionWordBoundary(term.invert(), (currentCountAlreadyChecked - term.inputPosition).unsafeGet());
2066 break;
2067
2068 case PatternTerm::TypePatternCharacter:
2069 atomPatternCharacter(term.patternCharacter, (currentCountAlreadyChecked - term.inputPosition).unsafeGet(), term.frameLocation, term.quantityMaxCount, term.quantityType);
2070 break;
2071
2072 case PatternTerm::TypeCharacterClass:
2073 atomCharacterClass(term.characterClass, term.invert(), (currentCountAlreadyChecked - term.inputPosition).unsafeGet(), term.frameLocation, term.quantityMaxCount, term.quantityType);
2074 break;
2075
2076 case PatternTerm::TypeBackReference:
2077 atomBackReference(term.backReferenceSubpatternId, (currentCountAlreadyChecked - term.inputPosition).unsafeGet(), term.frameLocation, term.quantityMaxCount, term.quantityType);
2078 break;
2079
2080 case PatternTerm::TypeForwardReference:
2081 break;
2082
2083 case PatternTerm::TypeParenthesesSubpattern: {
2084 unsigned disjunctionAlreadyCheckedCount = 0;
2085 if (term.quantityMaxCount == 1 && !term.parentheses.isCopy) {
2086 unsigned alternativeFrameLocation = term.frameLocation;
2087 // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame.
2088 if (term.quantityType == QuantifierFixedCount)
2089 disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
2090 else
2091 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
2092 unsigned delegateEndInputOffset = (currentCountAlreadyChecked - term.inputPosition).unsafeGet();
2093 atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, alternativeFrameLocation);
2094 if (auto error = emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount))
2095 return error;
2096 atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType);
2097 } else if (term.parentheses.isTerminal) {
2098 unsigned delegateEndInputOffset = (currentCountAlreadyChecked - term.inputPosition).unsafeGet();
2099 atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesTerminal);
2100 if (auto error = emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount))
2101 return error;
2102 atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType);
2103 } else {
2104 unsigned delegateEndInputOffset = (currentCountAlreadyChecked - term.inputPosition).unsafeGet();
2105 atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, 0);
2106 if (auto error = emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0))
2107 return error;
2108 atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
2109 }
2110 break;
2111 }
2112
2113 case PatternTerm::TypeParentheticalAssertion: {
2114 unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion;
2115 unsigned positiveInputOffset = (currentCountAlreadyChecked - term.inputPosition).unsafeGet();
2116 unsigned uncheckAmount = 0;
2117 if (positiveInputOffset > term.parentheses.disjunction->m_minimumSize) {
2118 uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize;
2119 uncheckInput(uncheckAmount);
2120 currentCountAlreadyChecked -= uncheckAmount;
2121 if (currentCountAlreadyChecked.hasOverflowed())
2122 return ErrorCode::OffsetTooLarge;
2123 }
2124
2125 atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation);
2126 if (auto error = emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset - uncheckAmount))
2127 return error;
2128 atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityMaxCount, term.quantityType);
2129 if (uncheckAmount) {
2130 checkInput(uncheckAmount);
2131 currentCountAlreadyChecked += uncheckAmount;
2132 if (currentCountAlreadyChecked.hasOverflowed())
2133 return ErrorCode::OffsetTooLarge;
2134 }
2135 break;
2136 }
2137
2138 case PatternTerm::TypeDotStarEnclosure:
2139 assertionDotStarEnclosure(term.anchors.bolAnchor, term.anchors.eolAnchor);
2140 break;
2141 }
2142 }
2143 }
2144 return WTF::nullopt;
2145 }
2146#ifndef NDEBUG
2147 void dumpDisjunction(ByteDisjunction* disjunction, unsigned nesting = 0)
2148 {
2149 PrintStream& out = WTF::dataFile();
2150
2151 unsigned termIndexNest = 0;
2152
2153 if (!nesting) {
2154 out.printf("ByteDisjunction(%p):\n", disjunction);
2155 nesting = 1;
2156 } else {
2157 termIndexNest = nesting - 1;
2158 nesting = 2;
2159 }
2160
2161 auto outputTermIndexAndNest = [&](size_t index, unsigned termNesting) {
2162 for (unsigned nestingDepth = 0; nestingDepth < termIndexNest; nestingDepth++)
2163 out.print(" ");
2164 out.printf("%4zu", index);
2165 for (unsigned nestingDepth = 0; nestingDepth < termNesting; nestingDepth++)
2166 out.print(" ");
2167 };
2168
2169 auto dumpQuantity = [&](ByteTerm& term) {
2170 if (term.atom.quantityType == QuantifierFixedCount && term.atom.quantityMinCount == 1 && term.atom.quantityMaxCount == 1)
2171 return;
2172
2173 out.print(" {", term.atom.quantityMinCount);
2174 if (term.atom.quantityMinCount != term.atom.quantityMaxCount) {
2175 if (term.atom.quantityMaxCount == UINT_MAX)
2176 out.print(",inf");
2177 else
2178 out.print(",", term.atom.quantityMaxCount);
2179 }
2180 out.print("}");
2181 if (term.atom.quantityType == QuantifierGreedy)
2182 out.print(" greedy");
2183 else if (term.atom.quantityType == QuantifierNonGreedy)
2184 out.print(" non-greedy");
2185 };
2186
2187 auto dumpCaptured = [&](ByteTerm& term) {
2188 if (term.capture())
2189 out.print(" captured (#", term.atom.subpatternId, ")");
2190 };
2191
2192 auto dumpInverted = [&](ByteTerm& term) {
2193 if (term.invert())
2194 out.print(" inverted");
2195 };
2196
2197 auto dumpInputPosition = [&](ByteTerm& term) {
2198 out.printf(" inputPosition %u", term.inputPosition);
2199 };
2200
2201 auto dumpFrameLocation = [&](ByteTerm& term) {
2202 out.printf(" frameLocation %u", term.frameLocation);
2203 };
2204
2205 auto dumpCharacter = [&](ByteTerm& term) {
2206 out.print(" ");
2207 dumpUChar32(out, term.atom.patternCharacter);
2208 };
2209
2210 auto dumpCharClass = [&](ByteTerm& term) {
2211 out.print(" ");
2212 dumpCharacterClass(out, &m_pattern, term.atom.characterClass);
2213 };
2214
2215 for (size_t idx = 0; idx < disjunction->terms.size(); ++idx) {
2216 ByteTerm term = disjunction->terms[idx];
2217
2218 bool outputNewline = true;
2219
2220 switch (term.type) {
2221 case ByteTerm::TypeBodyAlternativeBegin:
2222 outputTermIndexAndNest(idx, nesting++);
2223 out.print("BodyAlternativeBegin");
2224 if (term.alternative.onceThrough)
2225 out.print(" onceThrough");
2226 dumpFrameLocation(term);
2227 break;
2228 case ByteTerm::TypeBodyAlternativeDisjunction:
2229 outputTermIndexAndNest(idx, nesting - 1);
2230 out.print("BodyAlternativeDisjunction");
2231 dumpFrameLocation(term);
2232 break;
2233 case ByteTerm::TypeBodyAlternativeEnd:
2234 outputTermIndexAndNest(idx, --nesting);
2235 out.print("BodyAlternativeEnd");
2236 dumpFrameLocation(term);
2237 break;
2238 case ByteTerm::TypeAlternativeBegin:
2239 outputTermIndexAndNest(idx, nesting++);
2240 out.print("AlternativeBegin");
2241 dumpFrameLocation(term);
2242 break;
2243 case ByteTerm::TypeAlternativeDisjunction:
2244 outputTermIndexAndNest(idx, nesting - 1);
2245 out.print("AlternativeDisjunction");
2246 dumpFrameLocation(term);
2247 break;
2248 case ByteTerm::TypeAlternativeEnd:
2249 outputTermIndexAndNest(idx, --nesting);
2250 out.print("AlternativeEnd");
2251 dumpFrameLocation(term);
2252 break;
2253 case ByteTerm::TypeSubpatternBegin:
2254 outputTermIndexAndNest(idx, nesting++);
2255 out.print("SubpatternBegin");
2256 break;
2257 case ByteTerm::TypeSubpatternEnd:
2258 outputTermIndexAndNest(idx, --nesting);
2259 out.print("SubpatternEnd");
2260 break;
2261 case ByteTerm::TypeAssertionBOL:
2262 outputTermIndexAndNest(idx, nesting);
2263 out.print("AssertionBOL");
2264 break;
2265 case ByteTerm::TypeAssertionEOL:
2266 outputTermIndexAndNest(idx, nesting);
2267 out.print("AssertionEOL");
2268 break;
2269 case ByteTerm::TypeAssertionWordBoundary:
2270 outputTermIndexAndNest(idx, nesting);
2271 out.print("AssertionWordBoundary");
2272 break;
2273 case ByteTerm::TypePatternCharacterOnce:
2274 outputTermIndexAndNest(idx, nesting);
2275 out.print("PatternCharacterOnce");
2276 dumpInverted(term);
2277 dumpInputPosition(term);
2278 dumpFrameLocation(term);
2279 dumpCharacter(term);
2280 dumpQuantity(term);
2281 break;
2282 case ByteTerm::TypePatternCharacterFixed:
2283 outputTermIndexAndNest(idx, nesting);
2284 out.print("PatternCharacterFixed");
2285 dumpInverted(term);
2286 dumpInputPosition(term);
2287 dumpFrameLocation(term);
2288 dumpCharacter(term);
2289 out.print(" {", term.atom.quantityMinCount, "}");
2290 break;
2291 case ByteTerm::TypePatternCharacterGreedy:
2292 outputTermIndexAndNest(idx, nesting);
2293 out.print("PatternCharacterGreedy");
2294 dumpInverted(term);
2295 dumpInputPosition(term);
2296 dumpFrameLocation(term);
2297 dumpCharacter(term);
2298 dumpQuantity(term);
2299 break;
2300 case ByteTerm::TypePatternCharacterNonGreedy:
2301 outputTermIndexAndNest(idx, nesting);
2302 out.print("PatternCharacterNonGreedy");
2303 dumpInverted(term);
2304 dumpInputPosition(term);
2305 dumpFrameLocation(term);
2306 dumpCharacter(term);
2307 dumpQuantity(term);
2308 break;
2309 case ByteTerm::TypePatternCasedCharacterOnce:
2310 outputTermIndexAndNest(idx, nesting);
2311 out.print("PatternCasedCharacterOnce");
2312 break;
2313 case ByteTerm::TypePatternCasedCharacterFixed:
2314 outputTermIndexAndNest(idx, nesting);
2315 out.print("PatternCasedCharacterFixed");
2316 break;
2317 case ByteTerm::TypePatternCasedCharacterGreedy:
2318 outputTermIndexAndNest(idx, nesting);
2319 out.print("PatternCasedCharacterGreedy");
2320 break;
2321 case ByteTerm::TypePatternCasedCharacterNonGreedy:
2322 outputTermIndexAndNest(idx, nesting);
2323 out.print("PatternCasedCharacterNonGreedy");
2324 break;
2325 case ByteTerm::TypeCharacterClass:
2326 outputTermIndexAndNest(idx, nesting);
2327 out.print("CharacterClass");
2328 dumpInverted(term);
2329 dumpInputPosition(term);
2330 dumpFrameLocation(term);
2331 dumpCharClass(term);
2332 dumpQuantity(term);
2333 break;
2334 case ByteTerm::TypeBackReference:
2335 outputTermIndexAndNest(idx, nesting);
2336 out.print("BackReference #", term.atom.subpatternId);
2337 dumpQuantity(term);
2338 break;
2339 case ByteTerm::TypeParenthesesSubpattern:
2340 outputTermIndexAndNest(idx, nesting);
2341 out.print("ParenthesesSubpattern");
2342 dumpCaptured(term);
2343 dumpInverted(term);
2344 dumpInputPosition(term);
2345 dumpFrameLocation(term);
2346 dumpQuantity(term);
2347 out.print("\n");
2348 outputNewline = false;
2349 dumpDisjunction(term.atom.parenthesesDisjunction, nesting);
2350 break;
2351 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
2352 outputTermIndexAndNest(idx, nesting++);
2353 out.print("ParenthesesSubpatternOnceBegin");
2354 dumpCaptured(term);
2355 dumpInverted(term);
2356 dumpInputPosition(term);
2357 dumpFrameLocation(term);
2358 break;
2359 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
2360 outputTermIndexAndNest(idx, --nesting);
2361 out.print("ParenthesesSubpatternOnceEnd");
2362 dumpFrameLocation(term);
2363 break;
2364 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
2365 outputTermIndexAndNest(idx, nesting++);
2366 out.print("ParenthesesSubpatternTerminalBegin");
2367 dumpInverted(term);
2368 dumpInputPosition(term);
2369 dumpFrameLocation(term);
2370 break;
2371 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
2372 outputTermIndexAndNest(idx, --nesting);
2373 out.print("ParenthesesSubpatternTerminalEnd");
2374 dumpFrameLocation(term);
2375 break;
2376 case ByteTerm::TypeParentheticalAssertionBegin:
2377 outputTermIndexAndNest(idx, nesting++);
2378 out.print("ParentheticalAssertionBegin");
2379 dumpInverted(term);
2380 dumpInputPosition(term);
2381 dumpFrameLocation(term);
2382 break;
2383 case ByteTerm::TypeParentheticalAssertionEnd:
2384 outputTermIndexAndNest(idx, --nesting);
2385 out.print("ParentheticalAssertionEnd");
2386 dumpFrameLocation(term);
2387 break;
2388 case ByteTerm::TypeCheckInput:
2389 outputTermIndexAndNest(idx, nesting);
2390 out.print("CheckInput ", term.checkInputCount);
2391 break;
2392 case ByteTerm::TypeUncheckInput:
2393 outputTermIndexAndNest(idx, nesting);
2394 out.print("UncheckInput ", term.checkInputCount);
2395 break;
2396 case ByteTerm::TypeDotStarEnclosure:
2397 outputTermIndexAndNest(idx, nesting);
2398 out.print("DotStarEnclosure");
2399 break;
2400 }
2401 if (outputNewline)
2402 out.print("\n");
2403 }
2404 }
2405#endif
2406
2407private:
2408 YarrPattern& m_pattern;
2409 std::unique_ptr<ByteDisjunction> m_bodyDisjunction;
2410 unsigned m_currentAlternativeIndex { 0 };
2411 Vector<ParenthesesStackEntry> m_parenthesesStack;
2412 Vector<std::unique_ptr<ByteDisjunction>> m_allParenthesesInfo;
2413};
2414
2415std::unique_ptr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator, ErrorCode& errorCode, ConcurrentJSLock* lock)
2416{
2417 return ByteCompiler(pattern).compile(allocator, lock, errorCode);
2418}
2419
2420unsigned interpret(BytecodePattern* bytecode, const String& input, unsigned start, unsigned* output)
2421{
2422 SuperSamplerScope superSamplerScope(false);
2423 if (input.is8Bit())
2424 return Interpreter<LChar>(bytecode, output, input.characters8(), input.length(), start).interpret();
2425 return Interpreter<UChar>(bytecode, output, input.characters16(), input.length(), start).interpret();
2426}
2427
2428unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output)
2429{
2430 SuperSamplerScope superSamplerScope(false);
2431 return Interpreter<LChar>(bytecode, output, input, length, start).interpret();
2432}
2433
2434unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output)
2435{
2436 SuperSamplerScope superSamplerScope(false);
2437 return Interpreter<UChar>(bytecode, output, input, length, start).interpret();
2438}
2439
2440// These should be the same for both UChar & LChar.
2441COMPILE_ASSERT(sizeof(BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter);
2442COMPILE_ASSERT(sizeof(BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass);
2443COMPILE_ASSERT(sizeof(BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference);
2444COMPILE_ASSERT(sizeof(BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative);
2445COMPILE_ASSERT(sizeof(BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion);
2446COMPILE_ASSERT(sizeof(BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce);
2447COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) <= (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses);
2448
2449
2450} }
2451