1/*
2 * Copyright (C) 2009, 2013-2016 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga ([email protected]), University of Szeged
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "YarrPattern.h"
29
30#include "Options.h"
31#include "Yarr.h"
32#include "YarrCanonicalize.h"
33#include "YarrParser.h"
34#include <wtf/DataLog.h>
35#include <wtf/Optional.h>
36#include <wtf/StackPointer.h>
37#include <wtf/Threading.h>
38#include <wtf/Vector.h>
39
40namespace JSC { namespace Yarr {
41
42#include "RegExpJitTables.h"
43
44class CharacterClassConstructor {
45public:
46 CharacterClassConstructor(bool isCaseInsensitive, CanonicalMode canonicalMode)
47 : m_isCaseInsensitive(isCaseInsensitive)
48 , m_anyCharacter(false)
49 , m_characterWidths(CharacterClassWidths::Unknown)
50 , m_canonicalMode(canonicalMode)
51 {
52 }
53
54 void reset()
55 {
56 m_matches.clear();
57 m_ranges.clear();
58 m_matchesUnicode.clear();
59 m_rangesUnicode.clear();
60 m_anyCharacter = false;
61 m_characterWidths = CharacterClassWidths::Unknown;
62 }
63
64 void append(const CharacterClass* other)
65 {
66 for (size_t i = 0; i < other->m_matches.size(); ++i)
67 addSorted(m_matches, other->m_matches[i]);
68 for (size_t i = 0; i < other->m_ranges.size(); ++i)
69 addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end);
70 for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i)
71 addSorted(m_matchesUnicode, other->m_matchesUnicode[i]);
72 for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i)
73 addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
74 }
75
76 void appendInverted(const CharacterClass* other)
77 {
78 auto addSortedInverted = [&](UChar32 min, UChar32 max,
79 const Vector<UChar32>& srcMatches, const Vector<CharacterRange>& srcRanges,
80 Vector<UChar32>& destMatches, Vector<CharacterRange>& destRanges) {
81
82 auto addSortedMatchOrRange = [&](UChar32 lo, UChar32 hiPlusOne) {
83 if (lo < hiPlusOne) {
84 if (lo + 1 == hiPlusOne)
85 addSorted(destMatches, lo);
86 else
87 addSortedRange(destRanges, lo, hiPlusOne - 1);
88 }
89 };
90
91 UChar32 lo = min;
92 size_t matchesIndex = 0;
93 size_t rangesIndex = 0;
94 bool matchesRemaining = matchesIndex < srcMatches.size();
95 bool rangesRemaining = rangesIndex < srcRanges.size();
96
97 if (!matchesRemaining && !rangesRemaining) {
98 addSortedMatchOrRange(min, max + 1);
99 return;
100 }
101
102 while (matchesRemaining || rangesRemaining) {
103 UChar32 hiPlusOne;
104 UChar32 nextLo;
105
106 if (matchesRemaining
107 && (!rangesRemaining || srcMatches[matchesIndex] < srcRanges[rangesIndex].begin)) {
108 hiPlusOne = srcMatches[matchesIndex];
109 nextLo = hiPlusOne + 1;
110 ++matchesIndex;
111 matchesRemaining = matchesIndex < srcMatches.size();
112 } else {
113 hiPlusOne = srcRanges[rangesIndex].begin;
114 nextLo = srcRanges[rangesIndex].end + 1;
115 ++rangesIndex;
116 rangesRemaining = rangesIndex < srcRanges.size();
117 }
118
119 addSortedMatchOrRange(lo, hiPlusOne);
120
121 lo = nextLo;
122 }
123
124 addSortedMatchOrRange(lo, max + 1);
125 };
126
127 addSortedInverted(0, 0x7f, other->m_matches, other->m_ranges, m_matches, m_ranges);
128 addSortedInverted(0x80, 0x10ffff, other->m_matchesUnicode, other->m_rangesUnicode, m_matchesUnicode, m_rangesUnicode);
129 }
130
131 void putChar(UChar32 ch)
132 {
133 if (!m_isCaseInsensitive) {
134 addSorted(ch);
135 return;
136 }
137
138 if (m_canonicalMode == CanonicalMode::UCS2 && isASCII(ch)) {
139 // Handle ASCII cases.
140 if (isASCIIAlpha(ch)) {
141 addSorted(m_matches, toASCIIUpper(ch));
142 addSorted(m_matches, toASCIILower(ch));
143 } else
144 addSorted(m_matches, ch);
145 return;
146 }
147
148 // Add multiple matches, if necessary.
149 const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_canonicalMode);
150 if (info->type == CanonicalizeUnique)
151 addSorted(ch);
152 else
153 putUnicodeIgnoreCase(ch, info);
154 }
155
156 void putUnicodeIgnoreCase(UChar32 ch, const CanonicalizationRange* info)
157 {
158 ASSERT(m_isCaseInsensitive);
159 ASSERT(ch >= info->begin && ch <= info->end);
160 ASSERT(info->type != CanonicalizeUnique);
161 if (info->type == CanonicalizeSet) {
162 for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set)
163 addSorted(ch);
164 } else {
165 addSorted(ch);
166 addSorted(getCanonicalPair(info, ch));
167 }
168 }
169
170 void putRange(UChar32 lo, UChar32 hi)
171 {
172 if (isASCII(lo)) {
173 char asciiLo = lo;
174 char asciiHi = std::min(hi, (UChar32)0x7f);
175 addSortedRange(m_ranges, lo, asciiHi);
176
177 if (m_isCaseInsensitive) {
178 if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
179 addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
180 if ((asciiLo <= 'z') && (asciiHi >= 'a'))
181 addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
182 }
183 }
184 if (isASCII(hi))
185 return;
186
187 lo = std::max(lo, (UChar32)0x80);
188 addSortedRange(m_rangesUnicode, lo, hi);
189
190 if (!m_isCaseInsensitive)
191 return;
192
193 const CanonicalizationRange* info = canonicalRangeInfoFor(lo, m_canonicalMode);
194 while (true) {
195 // Handle the range [lo .. end]
196 UChar32 end = std::min<UChar32>(info->end, hi);
197
198 switch (info->type) {
199 case CanonicalizeUnique:
200 // Nothing to do - no canonical equivalents.
201 break;
202 case CanonicalizeSet: {
203 UChar ch;
204 for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set)
205 addSorted(m_matchesUnicode, ch);
206 break;
207 }
208 case CanonicalizeRangeLo:
209 addSortedRange(m_rangesUnicode, lo + info->value, end + info->value);
210 break;
211 case CanonicalizeRangeHi:
212 addSortedRange(m_rangesUnicode, lo - info->value, end - info->value);
213 break;
214 case CanonicalizeAlternatingAligned:
215 // Use addSortedRange since there is likely an abutting range to combine with.
216 if (lo & 1)
217 addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
218 if (!(end & 1))
219 addSortedRange(m_rangesUnicode, end + 1, end + 1);
220 break;
221 case CanonicalizeAlternatingUnaligned:
222 // Use addSortedRange since there is likely an abutting range to combine with.
223 if (!(lo & 1))
224 addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
225 if (end & 1)
226 addSortedRange(m_rangesUnicode, end + 1, end + 1);
227 break;
228 }
229
230 if (hi == end)
231 return;
232
233 ++info;
234 lo = info->begin;
235 };
236
237 }
238
239 std::unique_ptr<CharacterClass> charClass()
240 {
241 coalesceTables();
242
243 auto characterClass = makeUnique<CharacterClass>();
244
245 characterClass->m_matches.swap(m_matches);
246 characterClass->m_ranges.swap(m_ranges);
247 characterClass->m_matchesUnicode.swap(m_matchesUnicode);
248 characterClass->m_rangesUnicode.swap(m_rangesUnicode);
249 characterClass->m_anyCharacter = anyCharacter();
250 characterClass->m_characterWidths = characterWidths();
251
252 m_anyCharacter = false;
253 m_characterWidths = CharacterClassWidths::Unknown;
254
255 return characterClass;
256 }
257
258private:
259 void addSorted(UChar32 ch)
260 {
261 addSorted(isASCII(ch) ? m_matches : m_matchesUnicode, ch);
262 }
263
264 void addSorted(Vector<UChar32>& matches, UChar32 ch)
265 {
266 unsigned pos = 0;
267 unsigned range = matches.size();
268
269 m_characterWidths |= (U_IS_BMP(ch) ? CharacterClassWidths::HasBMPChars : CharacterClassWidths::HasNonBMPChars);
270
271 // binary chop, find position to insert char.
272 while (range) {
273 unsigned index = range >> 1;
274
275 int val = matches[pos+index] - ch;
276 if (!val)
277 return;
278 else if (val > 0) {
279 if (val == 1) {
280 UChar32 lo = ch;
281 UChar32 hi = ch + 1;
282 matches.remove(pos + index);
283 if (pos + index > 0 && matches[pos + index - 1] == ch - 1) {
284 lo = ch - 1;
285 matches.remove(pos + index - 1);
286 }
287 addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
288 return;
289 }
290 range = index;
291 } else {
292 if (val == -1) {
293 UChar32 lo = ch - 1;
294 UChar32 hi = ch;
295 matches.remove(pos + index);
296 if (pos + index + 1 < matches.size() && matches[pos + index + 1] == ch + 1) {
297 hi = ch + 1;
298 matches.remove(pos + index + 1);
299 }
300 addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
301 return;
302 }
303 pos += (index+1);
304 range -= (index+1);
305 }
306 }
307
308 if (pos == matches.size())
309 matches.append(ch);
310 else
311 matches.insert(pos, ch);
312 }
313
314 void addSortedRange(Vector<CharacterRange>& ranges, UChar32 lo, UChar32 hi)
315 {
316 size_t end = ranges.size();
317
318 if (U_IS_BMP(lo))
319 m_characterWidths |= CharacterClassWidths::HasBMPChars;
320 if (!U_IS_BMP(hi))
321 m_characterWidths |= CharacterClassWidths::HasNonBMPChars;
322
323 // Simple linear scan - I doubt there are that many ranges anyway...
324 // feel free to fix this with something faster (eg binary chop).
325 for (size_t i = 0; i < end; ++i) {
326 // does the new range fall before the current position in the array
327 if (hi < ranges[i].begin) {
328 // Concatenate appending ranges.
329 if (hi == (ranges[i].begin - 1)) {
330 ranges[i].begin = lo;
331 return;
332 }
333 ranges.insert(i, CharacterRange(lo, hi));
334 return;
335 }
336 // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
337 // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
338 // end of the last range they concatenate, which is just as good.
339 if (lo <= (ranges[i].end + 1)) {
340 // found an intersect! we'll replace this entry in the array.
341 ranges[i].begin = std::min(ranges[i].begin, lo);
342 ranges[i].end = std::max(ranges[i].end, hi);
343
344 mergeRangesFrom(ranges, i);
345 return;
346 }
347 }
348
349 // CharacterRange comes after all existing ranges.
350 ranges.append(CharacterRange(lo, hi));
351 }
352
353 void mergeRangesFrom(Vector<CharacterRange>& ranges, size_t index)
354 {
355 unsigned next = index + 1;
356
357 // each iteration of the loop we will either remove something from the list, or break out of the loop.
358 while (next < ranges.size()) {
359 if (ranges[next].begin <= (ranges[index].end + 1)) {
360 // the next entry now overlaps / concatenates with this one.
361 ranges[index].end = std::max(ranges[index].end, ranges[next].end);
362 ranges.remove(next);
363 } else
364 break;
365 }
366
367 }
368
369 void coalesceTables()
370 {
371 auto coalesceMatchesAndRanges = [&](Vector<UChar32>& matches, Vector<CharacterRange>& ranges) {
372
373 size_t matchesIndex = 0;
374 size_t rangesIndex = 0;
375
376 while (matchesIndex < matches.size() && rangesIndex < ranges.size()) {
377 while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].begin - 1)
378 matchesIndex++;
379
380 if (matchesIndex < matches.size() && matches[matchesIndex] == ranges[rangesIndex].begin - 1) {
381 ranges[rangesIndex].begin = matches[matchesIndex];
382 matches.remove(matchesIndex);
383 }
384
385 while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].end + 1)
386 matchesIndex++;
387
388 if (matchesIndex < matches.size()) {
389 if (matches[matchesIndex] == ranges[rangesIndex].end + 1) {
390 ranges[rangesIndex].end = matches[matchesIndex];
391 matches.remove(matchesIndex);
392
393 mergeRangesFrom(ranges, rangesIndex);
394 } else
395 matchesIndex++;
396 }
397 }
398 };
399
400 coalesceMatchesAndRanges(m_matches, m_ranges);
401 coalesceMatchesAndRanges(m_matchesUnicode, m_rangesUnicode);
402
403 if (!m_matches.size() && !m_matchesUnicode.size()
404 && m_ranges.size() == 1 && m_rangesUnicode.size() == 1
405 && m_ranges[0].begin == 0 && m_ranges[0].end == 0x7f
406 && m_rangesUnicode[0].begin == 0x80 && m_rangesUnicode[0].end == 0x10ffff)
407 m_anyCharacter = true;
408 }
409
410 bool hasNonBMPCharacters()
411 {
412 return m_characterWidths & CharacterClassWidths::HasNonBMPChars;
413 }
414
415 CharacterClassWidths characterWidths()
416 {
417 return m_characterWidths;
418 }
419
420 bool anyCharacter()
421 {
422 return m_anyCharacter;
423 }
424
425 bool m_isCaseInsensitive : 1;
426 bool m_anyCharacter : 1;
427 CharacterClassWidths m_characterWidths;
428
429 CanonicalMode m_canonicalMode;
430
431 Vector<UChar32> m_matches;
432 Vector<CharacterRange> m_ranges;
433 Vector<UChar32> m_matchesUnicode;
434 Vector<CharacterRange> m_rangesUnicode;
435};
436
437class YarrPatternConstructor {
438public:
439 YarrPatternConstructor(YarrPattern& pattern, void* stackLimit)
440 : m_pattern(pattern)
441 , m_characterClassConstructor(pattern.ignoreCase(), pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2)
442 , m_stackLimit(stackLimit)
443 {
444 auto body = makeUnique<PatternDisjunction>();
445 m_pattern.m_body = body.get();
446 m_alternative = body->addNewAlternative();
447 m_pattern.m_disjunctions.append(WTFMove(body));
448 }
449
450 ~YarrPatternConstructor()
451 {
452 }
453
454 void resetForReparsing()
455 {
456 m_pattern.resetForReparsing();
457 m_characterClassConstructor.reset();
458
459 auto body = makeUnique<PatternDisjunction>();
460 m_pattern.m_body = body.get();
461 m_alternative = body->addNewAlternative();
462 m_pattern.m_disjunctions.append(WTFMove(body));
463 }
464
465 void saveUnmatchedNamedForwardReferences()
466 {
467 m_unmatchedNamedForwardReferences.shrink(0);
468
469 for (auto& entry : m_pattern.m_namedForwardReferences) {
470 if (!m_pattern.m_captureGroupNames.contains(entry))
471 m_unmatchedNamedForwardReferences.append(entry);
472 }
473 }
474
475 void assertionBOL()
476 {
477 if (!m_alternative->m_terms.size() && !m_invertParentheticalAssertion) {
478 m_alternative->m_startsWithBOL = true;
479 m_alternative->m_containsBOL = true;
480 m_pattern.m_containsBOL = true;
481 }
482 m_alternative->m_terms.append(PatternTerm::BOL());
483 }
484 void assertionEOL()
485 {
486 m_alternative->m_terms.append(PatternTerm::EOL());
487 }
488 void assertionWordBoundary(bool invert)
489 {
490 m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
491 }
492
493 void atomPatternCharacter(UChar32 ch)
494 {
495 // We handle case-insensitive checking of unicode characters which do have both
496 // cases by handling them as if they were defined using a CharacterClass.
497 if (!m_pattern.ignoreCase() || (isASCII(ch) && !m_pattern.unicode())) {
498 m_alternative->m_terms.append(PatternTerm(ch));
499 return;
500 }
501
502 const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2);
503 if (info->type == CanonicalizeUnique) {
504 m_alternative->m_terms.append(PatternTerm(ch));
505 return;
506 }
507
508 m_characterClassConstructor.putUnicodeIgnoreCase(ch, info);
509 auto newCharacterClass = m_characterClassConstructor.charClass();
510 m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), false));
511 m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
512 }
513
514 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
515 {
516 switch (classID) {
517 case BuiltInCharacterClassID::DigitClassID:
518 m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert));
519 break;
520 case BuiltInCharacterClassID::SpaceClassID:
521 m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
522 break;
523 case BuiltInCharacterClassID::WordClassID:
524 if (m_pattern.unicode() && m_pattern.ignoreCase())
525 m_alternative->m_terms.append(PatternTerm(m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), invert));
526 else
527 m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
528 break;
529 case BuiltInCharacterClassID::DotClassID:
530 ASSERT(!invert);
531 if (m_pattern.dotAll())
532 m_alternative->m_terms.append(PatternTerm(m_pattern.anyCharacterClass(), false));
533 else
534 m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), true));
535 break;
536 default:
537 m_alternative->m_terms.append(PatternTerm(m_pattern.unicodeCharacterClassFor(classID), invert));
538 break;
539 }
540 }
541
542 void atomCharacterClassBegin(bool invert = false)
543 {
544 m_invertCharacterClass = invert;
545 }
546
547 void atomCharacterClassAtom(UChar32 ch)
548 {
549 m_characterClassConstructor.putChar(ch);
550 }
551
552 void atomCharacterClassRange(UChar32 begin, UChar32 end)
553 {
554 m_characterClassConstructor.putRange(begin, end);
555 }
556
557 void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
558 {
559 ASSERT(classID != BuiltInCharacterClassID::DotClassID);
560
561 switch (classID) {
562 case BuiltInCharacterClassID::DigitClassID:
563 m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
564 break;
565
566 case BuiltInCharacterClassID::SpaceClassID:
567 m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
568 break;
569
570 case BuiltInCharacterClassID::WordClassID:
571 if (m_pattern.unicode() && m_pattern.ignoreCase())
572 m_characterClassConstructor.append(invert ? m_pattern.nonwordUnicodeIgnoreCaseCharCharacterClass() : m_pattern.wordUnicodeIgnoreCaseCharCharacterClass());
573 else
574 m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
575 break;
576
577 default:
578 if (!invert)
579 m_characterClassConstructor.append(m_pattern.unicodeCharacterClassFor(classID));
580 else
581 m_characterClassConstructor.appendInverted(m_pattern.unicodeCharacterClassFor(classID));
582 }
583 }
584
585 void atomCharacterClassEnd()
586 {
587 auto newCharacterClass = m_characterClassConstructor.charClass();
588
589 if (!m_invertCharacterClass && newCharacterClass.get()->m_anyCharacter) {
590 m_alternative->m_terms.append(PatternTerm(m_pattern.anyCharacterClass(), false));
591 return;
592 }
593 m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), m_invertCharacterClass));
594 m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
595 }
596
597 void atomParenthesesSubpatternBegin(bool capture = true, Optional<String> optGroupName = WTF::nullopt)
598 {
599 unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
600 if (capture) {
601 m_pattern.m_numSubpatterns++;
602 if (optGroupName) {
603 while (m_pattern.m_captureGroupNames.size() < subpatternId)
604 m_pattern.m_captureGroupNames.append(String());
605 m_pattern.m_captureGroupNames.append(optGroupName.value());
606 m_pattern.m_namedGroupToParenIndex.add(optGroupName.value(), subpatternId);
607 }
608 } else
609 ASSERT(!optGroupName);
610
611 auto parenthesesDisjunction = makeUnique<PatternDisjunction>(m_alternative);
612 m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, false));
613 m_alternative = parenthesesDisjunction->addNewAlternative();
614 m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction));
615 }
616
617 void atomParentheticalAssertionBegin(bool invert = false)
618 {
619 auto parenthesesDisjunction = makeUnique<PatternDisjunction>(m_alternative);
620 m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction.get(), false, invert));
621 m_alternative = parenthesesDisjunction->addNewAlternative();
622 m_invertParentheticalAssertion = invert;
623 m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction));
624 }
625
626 void atomParenthesesEnd()
627 {
628 ASSERT(m_alternative->m_parent);
629 ASSERT(m_alternative->m_parent->m_parent);
630
631 PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent;
632 m_alternative = m_alternative->m_parent->m_parent;
633
634 PatternTerm& lastTerm = m_alternative->lastTerm();
635
636 unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size();
637 unsigned numBOLAnchoredAlts = 0;
638
639 for (unsigned i = 0; i < numParenAlternatives; i++) {
640 // Bubble up BOL flags
641 if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL)
642 numBOLAnchoredAlts++;
643 }
644
645 if (numBOLAnchoredAlts) {
646 m_alternative->m_containsBOL = true;
647 // If all the alternatives in parens start with BOL, then so does this one
648 if (numBOLAnchoredAlts == numParenAlternatives)
649 m_alternative->m_startsWithBOL = true;
650 }
651
652 lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
653 m_invertParentheticalAssertion = false;
654 }
655
656 void atomBackReference(unsigned subpatternId)
657 {
658 ASSERT(subpatternId);
659 m_pattern.m_containsBackreferences = true;
660 m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
661
662 if (subpatternId > m_pattern.m_numSubpatterns) {
663 m_alternative->m_terms.append(PatternTerm::ForwardReference());
664 return;
665 }
666
667 PatternAlternative* currentAlternative = m_alternative;
668 ASSERT(currentAlternative);
669
670 // Note to self: if we waited until the AST was baked, we could also remove forwards refs
671 while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
672 PatternTerm& term = currentAlternative->lastTerm();
673 ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion));
674
675 if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) {
676 m_alternative->m_terms.append(PatternTerm::ForwardReference());
677 return;
678 }
679 }
680
681 m_alternative->m_terms.append(PatternTerm(subpatternId));
682 }
683
684 void atomNamedBackReference(const String& subpatternName)
685 {
686 ASSERT(m_pattern.m_namedGroupToParenIndex.find(subpatternName) != m_pattern.m_namedGroupToParenIndex.end());
687 atomBackReference(m_pattern.m_namedGroupToParenIndex.get(subpatternName));
688 }
689
690 bool isValidNamedForwardReference(const String& subpatternName)
691 {
692 return !m_unmatchedNamedForwardReferences.contains(subpatternName);
693 }
694
695 void atomNamedForwardReference(const String& subpatternName)
696 {
697 m_pattern.m_namedForwardReferences.appendIfNotContains(subpatternName);
698 m_alternative->m_terms.append(PatternTerm::ForwardReference());
699 }
700
701 // deep copy the argument disjunction. If filterStartsWithBOL is true,
702 // skip alternatives with m_startsWithBOL set true.
703 PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
704 {
705 if (UNLIKELY(!isSafeToRecurse())) {
706 m_error = ErrorCode::PatternTooLarge;
707 return 0;
708 }
709
710 std::unique_ptr<PatternDisjunction> newDisjunction;
711 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
712 PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
713 if (!filterStartsWithBOL || !alternative->m_startsWithBOL) {
714 if (!newDisjunction) {
715 newDisjunction = makeUnique<PatternDisjunction>();
716 newDisjunction->m_parent = disjunction->m_parent;
717 }
718 PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
719 newAlternative->m_terms.reserveInitialCapacity(alternative->m_terms.size());
720 for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
721 newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL));
722 }
723 }
724
725 if (hasError(error())) {
726 newDisjunction = 0;
727 return 0;
728 }
729
730 if (!newDisjunction)
731 return 0;
732
733 PatternDisjunction* copiedDisjunction = newDisjunction.get();
734 m_pattern.m_disjunctions.append(WTFMove(newDisjunction));
735 return copiedDisjunction;
736 }
737
738 PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false)
739 {
740 if (UNLIKELY(!isSafeToRecurse())) {
741 m_error = ErrorCode::PatternTooLarge;
742 return PatternTerm(term);
743 }
744
745 if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
746 return PatternTerm(term);
747
748 PatternTerm termCopy = term;
749 termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL);
750 m_pattern.m_hasCopiedParenSubexpressions = true;
751 return termCopy;
752 }
753
754 void quantifyAtom(unsigned min, unsigned max, bool greedy)
755 {
756 ASSERT(min <= max);
757 ASSERT(m_alternative->m_terms.size());
758
759 if (!max) {
760 m_alternative->removeLastTerm();
761 return;
762 }
763
764 PatternTerm& term = m_alternative->lastTerm();
765 ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
766 ASSERT(term.quantityMinCount == 1 && term.quantityMaxCount == 1 && term.quantityType == QuantifierFixedCount);
767
768 if (term.type == PatternTerm::TypeParentheticalAssertion) {
769 // If an assertion is quantified with a minimum count of zero, it can simply be removed.
770 // This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never
771 // results in any input being consumed, however the continuation passed to the assertion
772 // (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will
773 // reject all zero length matches (see step 2.1). A match from the continuation of the
774 // expression will still be accepted regardless (via steps 8a and 11) - the upshot of all
775 // this is that matches from the assertion are not required, and won't be accepted anyway,
776 // so no need to ever run it.
777 if (!min)
778 m_alternative->removeLastTerm();
779 // We never need to run an assertion more than once. Subsequent interations will be run
780 // with the same start index (since assertions are non-capturing) and the same captures
781 // (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the
782 // same result and captures. If the first match succeeds then the subsequent (min - 1)
783 // matches will too. Any additional optional matches will fail (on the same basis as the
784 // minimum zero quantified assertions, above), but this will still result in a match.
785 return;
786 }
787
788 if (min == max)
789 term.quantify(min, max, QuantifierFixedCount);
790 else if (!min || (term.type == PatternTerm::TypeParenthesesSubpattern && m_pattern.m_hasCopiedParenSubexpressions))
791 term.quantify(min, max, greedy ? QuantifierGreedy : QuantifierNonGreedy);
792 else {
793 term.quantify(min, min, QuantifierFixedCount);
794 m_alternative->m_terms.append(copyTerm(term));
795 // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
796 m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
797 if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
798 m_alternative->lastTerm().parentheses.isCopy = true;
799 }
800 }
801
802 void disjunction()
803 {
804 m_alternative = m_alternative->m_parent->addNewAlternative();
805 }
806
807 ErrorCode setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition, unsigned& newCallFrameSize) WARN_UNUSED_RETURN
808 {
809 if (UNLIKELY(!isSafeToRecurse()))
810 return ErrorCode::TooManyDisjunctions;
811
812 ErrorCode error = ErrorCode::NoError;
813 alternative->m_hasFixedSize = true;
814 Checked<unsigned, RecordOverflow> currentInputPosition = initialInputPosition;
815
816 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
817 PatternTerm& term = alternative->m_terms[i];
818
819 switch (term.type) {
820 case PatternTerm::TypeAssertionBOL:
821 case PatternTerm::TypeAssertionEOL:
822 case PatternTerm::TypeAssertionWordBoundary:
823 term.inputPosition = currentInputPosition.unsafeGet();
824 break;
825
826 case PatternTerm::TypeBackReference:
827 term.inputPosition = currentInputPosition.unsafeGet();
828 term.frameLocation = currentCallFrameSize;
829 currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference;
830 alternative->m_hasFixedSize = false;
831 break;
832
833 case PatternTerm::TypeForwardReference:
834 break;
835
836 case PatternTerm::TypePatternCharacter:
837 term.inputPosition = currentInputPosition.unsafeGet();
838 if (term.quantityType != QuantifierFixedCount) {
839 term.frameLocation = currentCallFrameSize;
840 currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
841 alternative->m_hasFixedSize = false;
842 } else if (m_pattern.unicode()) {
843 Checked<unsigned, RecordOverflow> tempCount = term.quantityMaxCount;
844 tempCount *= U16_LENGTH(term.patternCharacter);
845 if (tempCount.hasOverflowed())
846 return ErrorCode::OffsetTooLarge;
847 currentInputPosition += tempCount;
848 } else
849 currentInputPosition += term.quantityMaxCount;
850 break;
851
852 case PatternTerm::TypeCharacterClass:
853 term.inputPosition = currentInputPosition.unsafeGet();
854 if (term.quantityType != QuantifierFixedCount) {
855 term.frameLocation = currentCallFrameSize;
856 currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
857 alternative->m_hasFixedSize = false;
858 } else if (m_pattern.unicode()) {
859 term.frameLocation = currentCallFrameSize;
860 currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
861 if (term.characterClass->hasOneCharacterSize() && !term.invert()) {
862 Checked<unsigned, RecordOverflow> tempCount = term.quantityMaxCount;
863 tempCount *= term.characterClass->hasNonBMPCharacters() ? 2 : 1;
864 if (tempCount.hasOverflowed())
865 return ErrorCode::OffsetTooLarge;
866 currentInputPosition += tempCount;
867 } else {
868 currentInputPosition += term.quantityMaxCount;
869 alternative->m_hasFixedSize = false;
870 }
871 } else
872 currentInputPosition += term.quantityMaxCount;
873 break;
874
875 case PatternTerm::TypeParenthesesSubpattern:
876 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
877 term.frameLocation = currentCallFrameSize;
878 if (term.quantityMaxCount == 1 && !term.parentheses.isCopy) {
879 currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
880 error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
881 if (hasError(error))
882 return error;
883 // If quantity is fixed, then pre-check its minimum size.
884 if (term.quantityType == QuantifierFixedCount)
885 currentInputPosition += term.parentheses.disjunction->m_minimumSize;
886 term.inputPosition = currentInputPosition.unsafeGet();
887 } else if (term.parentheses.isTerminal) {
888 currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
889 error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
890 if (hasError(error))
891 return error;
892 term.inputPosition = currentInputPosition.unsafeGet();
893 } else {
894 term.inputPosition = currentInputPosition.unsafeGet();
895 currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
896 error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
897 if (hasError(error))
898 return error;
899 }
900 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
901 alternative->m_hasFixedSize = false;
902 break;
903
904 case PatternTerm::TypeParentheticalAssertion:
905 term.inputPosition = currentInputPosition.unsafeGet();
906 term.frameLocation = currentCallFrameSize;
907 error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet(), currentCallFrameSize);
908 if (hasError(error))
909 return error;
910 break;
911
912 case PatternTerm::TypeDotStarEnclosure:
913 ASSERT(!m_pattern.m_saveInitialStartValue);
914 alternative->m_hasFixedSize = false;
915 term.inputPosition = initialInputPosition;
916 m_pattern.m_initialStartValueFrameLocation = currentCallFrameSize;
917 currentCallFrameSize += YarrStackSpaceForDotStarEnclosure;
918 m_pattern.m_saveInitialStartValue = true;
919 break;
920 }
921 if (currentInputPosition.hasOverflowed())
922 return ErrorCode::OffsetTooLarge;
923 }
924
925 alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet();
926 newCallFrameSize = currentCallFrameSize;
927 return error;
928 }
929
930 ErrorCode setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition, unsigned& callFrameSize)
931 {
932 if (UNLIKELY(!isSafeToRecurse()))
933 return ErrorCode::TooManyDisjunctions;
934
935 if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1))
936 initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative;
937
938 unsigned minimumInputSize = UINT_MAX;
939 unsigned maximumCallFrameSize = 0;
940 bool hasFixedSize = true;
941 ErrorCode error = ErrorCode::NoError;
942
943 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
944 PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
945 unsigned currentAlternativeCallFrameSize;
946 error = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition, currentAlternativeCallFrameSize);
947 if (hasError(error))
948 return error;
949 minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize);
950 maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize);
951 hasFixedSize &= alternative->m_hasFixedSize;
952 if (alternative->m_minimumSize > INT_MAX)
953 m_pattern.m_containsUnsignedLengthPattern = true;
954 }
955
956 ASSERT(minimumInputSize != UINT_MAX);
957 ASSERT(maximumCallFrameSize >= initialCallFrameSize);
958
959 disjunction->m_hasFixedSize = hasFixedSize;
960 disjunction->m_minimumSize = minimumInputSize;
961 disjunction->m_callFrameSize = maximumCallFrameSize;
962 callFrameSize = maximumCallFrameSize;
963 return error;
964 }
965
966 ErrorCode setupOffsets()
967 {
968 // FIXME: Yarr should not use the stack to handle subpatterns (rdar://problem/26436314).
969 unsigned ignoredCallFrameSize;
970 return setupDisjunctionOffsets(m_pattern.m_body, 0, 0, ignoredCallFrameSize);
971 }
972
973 // This optimization identifies sets of parentheses that we will never need to backtrack.
974 // In these cases we do not need to store state from prior iterations.
975 // We can presently avoid backtracking for:
976 // * where the parens are at the end of the regular expression (last term in any of the
977 // alternatives of the main body disjunction).
978 // * where the parens are non-capturing, and quantified unbounded greedy (*).
979 // * where the parens do not contain any capturing subpatterns.
980 void checkForTerminalParentheses()
981 {
982 // This check is much too crude; should be just checking whether the candidate
983 // node contains nested capturing subpatterns, not the whole expression!
984 if (m_pattern.m_numSubpatterns)
985 return;
986
987 Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
988 for (size_t i = 0; i < alternatives.size(); ++i) {
989 Vector<PatternTerm>& terms = alternatives[i]->m_terms;
990 if (terms.size()) {
991 PatternTerm& term = terms.last();
992 if (term.type == PatternTerm::TypeParenthesesSubpattern
993 && term.quantityType == QuantifierGreedy
994 && term.quantityMinCount == 0
995 && term.quantityMaxCount == quantifyInfinite
996 && !term.capture())
997 term.parentheses.isTerminal = true;
998 }
999 }
1000 }
1001
1002 void optimizeBOL()
1003 {
1004 // Look for expressions containing beginning of line (^) anchoring and unroll them.
1005 // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops
1006 // This code relies on the parsing code tagging alternatives with m_containsBOL and
1007 // m_startsWithBOL and rolling those up to containing alternatives.
1008 // At this point, this is only valid for non-multiline expressions.
1009 PatternDisjunction* disjunction = m_pattern.m_body;
1010
1011 if (!m_pattern.m_containsBOL || m_pattern.multiline())
1012 return;
1013
1014 PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true);
1015
1016 // Set alternatives in disjunction to "onceThrough"
1017 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt)
1018 disjunction->m_alternatives[alt]->setOnceThrough();
1019
1020 if (loopDisjunction) {
1021 // Move alternatives from loopDisjunction to disjunction
1022 for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt)
1023 disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt].release());
1024
1025 loopDisjunction->m_alternatives.clear();
1026 }
1027 }
1028
1029 bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t endIndex)
1030 {
1031 Vector<PatternTerm>& terms = alternative->m_terms;
1032
1033 ASSERT(endIndex <= terms.size());
1034 for (size_t termIndex = firstTermIndex; termIndex < endIndex; ++termIndex) {
1035 PatternTerm& term = terms[termIndex];
1036
1037 if (term.m_capture)
1038 return true;
1039
1040 if (term.type == PatternTerm::TypeParenthesesSubpattern) {
1041 PatternDisjunction* nestedDisjunction = term.parentheses.disjunction;
1042 for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) {
1043 if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt].get(), 0, nestedDisjunction->m_alternatives[alt]->m_terms.size()))
1044 return true;
1045 }
1046 }
1047 }
1048
1049 return false;
1050 }
1051
1052 // This optimization identifies alternatives in the form of
1053 // [^].*[?]<expression>.*[$] for expressions that don't have any
1054 // capturing terms. The alternative is changed to <expression>
1055 // followed by processing of the dot stars to find and adjust the
1056 // beginning and the end of the match.
1057 void optimizeDotStarWrappedExpressions()
1058 {
1059 Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
1060 if (alternatives.size() != 1)
1061 return;
1062
1063 CharacterClass* dotCharacterClass = m_pattern.dotAll() ? m_pattern.anyCharacterClass() : m_pattern.newlineCharacterClass();
1064 PatternAlternative* alternative = alternatives[0].get();
1065 Vector<PatternTerm>& terms = alternative->m_terms;
1066 if (terms.size() >= 3) {
1067 bool startsWithBOL = false;
1068 bool endsWithEOL = false;
1069 size_t termIndex, firstExpressionTerm;
1070
1071 termIndex = 0;
1072 if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) {
1073 startsWithBOL = true;
1074 ++termIndex;
1075 }
1076
1077 PatternTerm& firstNonAnchorTerm = terms[termIndex];
1078 if (firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass
1079 || firstNonAnchorTerm.characterClass != dotCharacterClass
1080 || firstNonAnchorTerm.quantityMinCount
1081 || firstNonAnchorTerm.quantityMaxCount != quantifyInfinite)
1082 return;
1083
1084 firstExpressionTerm = termIndex + 1;
1085
1086 termIndex = terms.size() - 1;
1087 if (terms[termIndex].type == PatternTerm::TypeAssertionEOL) {
1088 endsWithEOL = true;
1089 --termIndex;
1090 }
1091
1092 PatternTerm& lastNonAnchorTerm = terms[termIndex];
1093 if (lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass
1094 || lastNonAnchorTerm.characterClass != dotCharacterClass
1095 || lastNonAnchorTerm.quantityType != QuantifierGreedy
1096 || lastNonAnchorTerm.quantityMinCount
1097 || lastNonAnchorTerm.quantityMaxCount != quantifyInfinite)
1098 return;
1099
1100 size_t endIndex = termIndex;
1101 if (firstExpressionTerm >= endIndex)
1102 return;
1103
1104 if (!containsCapturingTerms(alternative, firstExpressionTerm, endIndex)) {
1105 for (termIndex = terms.size() - 1; termIndex >= endIndex; --termIndex)
1106 terms.remove(termIndex);
1107
1108 for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex)
1109 terms.remove(termIndex - 1);
1110
1111 terms.append(PatternTerm(startsWithBOL, endsWithEOL));
1112
1113 m_pattern.m_containsBOL = false;
1114 }
1115 }
1116 }
1117
1118 ErrorCode error() { return m_error; }
1119
1120private:
1121 bool isSafeToRecurse() const
1122 {
1123 if (!m_stackLimit)
1124 return true;
1125 int8_t* curr = reinterpret_cast<int8_t*>(currentStackPointer());
1126 int8_t* limit = reinterpret_cast<int8_t*>(m_stackLimit);
1127 return curr >= limit;
1128 }
1129
1130 YarrPattern& m_pattern;
1131 PatternAlternative* m_alternative;
1132 CharacterClassConstructor m_characterClassConstructor;
1133 Vector<String> m_unmatchedNamedForwardReferences;
1134 void* m_stackLimit;
1135 ErrorCode m_error { ErrorCode::NoError };
1136 bool m_invertCharacterClass;
1137 bool m_invertParentheticalAssertion { false };
1138};
1139
1140ErrorCode YarrPattern::compile(const String& patternString, void* stackLimit)
1141{
1142 YarrPatternConstructor constructor(*this, stackLimit);
1143
1144 {
1145 ErrorCode error = parse(constructor, patternString, unicode());
1146 if (hasError(error))
1147 return error;
1148 }
1149
1150 // If the pattern contains illegal backreferences reset & reparse.
1151 // Quoting Netscape's "What's new in JavaScript 1.2",
1152 // "Note: if the number of left parentheses is less than the number specified
1153 // in \#, the \# is taken as an octal escape as described in the next row."
1154 if (containsIllegalBackReference() || containsIllegalNamedForwardReferences()) {
1155 if (unicode())
1156 return ErrorCode::InvalidBackreference;
1157
1158 unsigned numSubpatterns = m_numSubpatterns;
1159
1160 constructor.saveUnmatchedNamedForwardReferences();
1161 constructor.resetForReparsing();
1162 ErrorCode error = parse(constructor, patternString, unicode(), numSubpatterns);
1163 ASSERT_UNUSED(error, !hasError(error));
1164 ASSERT(numSubpatterns == m_numSubpatterns);
1165 }
1166
1167 constructor.checkForTerminalParentheses();
1168 constructor.optimizeDotStarWrappedExpressions();
1169 constructor.optimizeBOL();
1170
1171 if (hasError(constructor.error()))
1172 return constructor.error();
1173
1174 {
1175 ErrorCode error = constructor.setupOffsets();
1176 if (hasError(error))
1177 return error;
1178 }
1179
1180 if (Options::dumpCompiledRegExpPatterns())
1181 dumpPattern(patternString);
1182
1183 return ErrorCode::NoError;
1184}
1185
1186YarrPattern::YarrPattern(const String& pattern, OptionSet<Flags> flags, ErrorCode& error, void* stackLimit)
1187 : m_containsBackreferences(false)
1188 , m_containsBOL(false)
1189 , m_containsUnsignedLengthPattern(false)
1190 , m_hasCopiedParenSubexpressions(false)
1191 , m_saveInitialStartValue(false)
1192 , m_flags(flags)
1193{
1194 ASSERT(m_flags != Flags::DeletedValue);
1195 error = compile(pattern, stackLimit);
1196}
1197
1198void indentForNestingLevel(PrintStream& out, unsigned nestingDepth)
1199{
1200 out.print(" ");
1201 for (; nestingDepth; --nestingDepth)
1202 out.print(" ");
1203}
1204
1205void dumpUChar32(PrintStream& out, UChar32 c)
1206{
1207 if (c >= ' '&& c <= 0xff)
1208 out.printf("'%c'", static_cast<char>(c));
1209 else
1210 out.printf("0x%04x", c);
1211}
1212
1213void dumpCharacterClass(PrintStream& out, YarrPattern* pattern, CharacterClass* characterClass)
1214{
1215 if (characterClass == pattern->anyCharacterClass())
1216 out.print("<any character>");
1217 else if (characterClass == pattern->newlineCharacterClass())
1218 out.print("<newline>");
1219 else if (characterClass == pattern->digitsCharacterClass())
1220 out.print("<digits>");
1221 else if (characterClass == pattern->spacesCharacterClass())
1222 out.print("<whitespace>");
1223 else if (characterClass == pattern->wordcharCharacterClass())
1224 out.print("<word>");
1225 else if (characterClass == pattern->wordUnicodeIgnoreCaseCharCharacterClass())
1226 out.print("<unicode word ignore case>");
1227 else if (characterClass == pattern->nondigitsCharacterClass())
1228 out.print("<non-digits>");
1229 else if (characterClass == pattern->nonspacesCharacterClass())
1230 out.print("<non-whitespace>");
1231 else if (characterClass == pattern->nonwordcharCharacterClass())
1232 out.print("<non-word>");
1233 else if (characterClass == pattern->nonwordUnicodeIgnoreCaseCharCharacterClass())
1234 out.print("<unicode non-word ignore case>");
1235 else {
1236 bool needMatchesRangesSeperator = false;
1237
1238 auto dumpMatches = [&] (const char* prefix, Vector<UChar32> matches) {
1239 size_t matchesSize = matches.size();
1240 if (matchesSize) {
1241 if (needMatchesRangesSeperator)
1242 out.print(",");
1243 needMatchesRangesSeperator = true;
1244
1245 out.print(prefix, ":(");
1246 for (size_t i = 0; i < matchesSize; ++i) {
1247 if (i)
1248 out.print(",");
1249 dumpUChar32(out, matches[i]);
1250 }
1251 out.print(")");
1252 }
1253 };
1254
1255 auto dumpRanges = [&] (const char* prefix, Vector<CharacterRange> ranges) {
1256 size_t rangeSize = ranges.size();
1257 if (rangeSize) {
1258 if (needMatchesRangesSeperator)
1259 out.print(",");
1260 needMatchesRangesSeperator = true;
1261
1262 out.print(prefix, " ranges:(");
1263 for (size_t i = 0; i < rangeSize; ++i) {
1264 if (i)
1265 out.print(",");
1266 CharacterRange range = ranges[i];
1267 out.print("(");
1268 dumpUChar32(out, range.begin);
1269 out.print("..");
1270 dumpUChar32(out, range.end);
1271 out.print(")");
1272 }
1273 out.print(")");
1274 }
1275 };
1276
1277 out.print("[");
1278 dumpMatches("ASCII", characterClass->m_matches);
1279 dumpRanges("ASCII", characterClass->m_ranges);
1280 dumpMatches("Unicode", characterClass->m_matchesUnicode);
1281 dumpRanges("Unicode", characterClass->m_rangesUnicode);
1282 out.print("]");
1283 }
1284}
1285
1286void PatternAlternative::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth)
1287{
1288 out.print("minimum size: ", m_minimumSize);
1289 if (m_hasFixedSize)
1290 out.print(",fixed size");
1291 if (m_onceThrough)
1292 out.print(",once through");
1293 if (m_startsWithBOL)
1294 out.print(",starts with ^");
1295 if (m_containsBOL)
1296 out.print(",contains ^");
1297 out.print("\n");
1298
1299 for (size_t i = 0; i < m_terms.size(); ++i)
1300 m_terms[i].dump(out, thisPattern, nestingDepth);
1301}
1302
1303void PatternTerm::dumpQuantifier(PrintStream& out)
1304{
1305 if (quantityType == QuantifierFixedCount && quantityMinCount == 1 && quantityMaxCount == 1)
1306 return;
1307 out.print(" {", quantityMinCount.unsafeGet());
1308 if (quantityMinCount != quantityMaxCount) {
1309 if (quantityMaxCount == UINT_MAX)
1310 out.print(",...");
1311 else
1312 out.print(",", quantityMaxCount.unsafeGet());
1313 }
1314 out.print("}");
1315 if (quantityType == QuantifierGreedy)
1316 out.print(" greedy");
1317 else if (quantityType == QuantifierNonGreedy)
1318 out.print(" non-greedy");
1319}
1320
1321void PatternTerm::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth)
1322{
1323 indentForNestingLevel(out, nestingDepth);
1324
1325 if (type != TypeParenthesesSubpattern && type != TypeParentheticalAssertion) {
1326 if (invert())
1327 out.print("not ");
1328 }
1329
1330 switch (type) {
1331 case TypeAssertionBOL:
1332 out.println("BOL");
1333 break;
1334 case TypeAssertionEOL:
1335 out.println("EOL");
1336 break;
1337 case TypeAssertionWordBoundary:
1338 out.println("word boundary");
1339 break;
1340 case TypePatternCharacter:
1341 out.printf("character ");
1342 out.printf("inputPosition %u ", inputPosition);
1343 if (thisPattern->ignoreCase() && isASCIIAlpha(patternCharacter)) {
1344 dumpUChar32(out, toASCIIUpper(patternCharacter));
1345 out.print("/");
1346 dumpUChar32(out, toASCIILower(patternCharacter));
1347 } else
1348 dumpUChar32(out, patternCharacter);
1349 dumpQuantifier(out);
1350 if (quantityType != QuantifierFixedCount)
1351 out.print(",frame location ", frameLocation);
1352 out.println();
1353 break;
1354 case TypeCharacterClass:
1355 out.print("character class ");
1356 out.printf("inputPosition %u ", inputPosition);
1357 dumpCharacterClass(out, thisPattern, characterClass);
1358 dumpQuantifier(out);
1359 if (quantityType != QuantifierFixedCount || thisPattern->unicode())
1360 out.print(",frame location ", frameLocation);
1361 out.println();
1362 break;
1363 case TypeBackReference:
1364 out.print("back reference to subpattern #", backReferenceSubpatternId);
1365 out.println(",frame location ", frameLocation);
1366 break;
1367 case TypeForwardReference:
1368 out.println("forward reference");
1369 break;
1370 case TypeParenthesesSubpattern:
1371 if (m_capture)
1372 out.print("captured ");
1373 else
1374 out.print("non-captured ");
1375
1376 FALLTHROUGH;
1377 case TypeParentheticalAssertion:
1378 if (m_invert)
1379 out.print("inverted ");
1380
1381 if (type == TypeParenthesesSubpattern)
1382 out.print("subpattern");
1383 else if (type == TypeParentheticalAssertion)
1384 out.print("assertion");
1385
1386 if (m_capture)
1387 out.print(" #", parentheses.subpatternId);
1388
1389 dumpQuantifier(out);
1390
1391 if (parentheses.isCopy)
1392 out.print(",copy");
1393
1394 if (parentheses.isTerminal)
1395 out.print(",terminal");
1396
1397 out.println(",frame location ", frameLocation);
1398
1399 if (parentheses.disjunction->m_alternatives.size() > 1) {
1400 indentForNestingLevel(out, nestingDepth + 1);
1401 unsigned alternativeFrameLocation = frameLocation;
1402 if (quantityMaxCount == 1 && !parentheses.isCopy)
1403 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
1404 else if (parentheses.isTerminal)
1405 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
1406 else
1407 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParentheses;
1408 out.println("alternative list,frame location ", alternativeFrameLocation);
1409 }
1410
1411 parentheses.disjunction->dump(out, thisPattern, nestingDepth + 1);
1412 break;
1413 case TypeDotStarEnclosure:
1414 out.println(".* enclosure,frame location ", thisPattern->m_initialStartValueFrameLocation);
1415 break;
1416 }
1417}
1418
1419void PatternDisjunction::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth = 0)
1420{
1421 unsigned alternativeCount = m_alternatives.size();
1422 for (unsigned i = 0; i < alternativeCount; ++i) {
1423 indentForNestingLevel(out, nestingDepth);
1424 if (alternativeCount > 1)
1425 out.print("alternative #", i, ": ");
1426 m_alternatives[i].get()->dump(out, thisPattern, nestingDepth + (alternativeCount > 1));
1427 }
1428}
1429
1430void YarrPattern::dumpPatternString(PrintStream& out, const String& patternString)
1431{
1432 out.print("/", patternString, "/");
1433
1434 if (global())
1435 out.print("g");
1436 if (ignoreCase())
1437 out.print("i");
1438 if (multiline())
1439 out.print("m");
1440 if (unicode())
1441 out.print("u");
1442 if (sticky())
1443 out.print("y");
1444}
1445
1446void YarrPattern::dumpPattern(const String& patternString)
1447{
1448 dumpPattern(WTF::dataFile(), patternString);
1449}
1450
1451void YarrPattern::dumpPattern(PrintStream& out, const String& patternString)
1452{
1453 out.print("RegExp pattern for ");
1454 dumpPatternString(out, patternString);
1455
1456 if (m_flags) {
1457 bool printSeparator = false;
1458 out.print(" (");
1459 if (global()) {
1460 out.print("global");
1461 printSeparator = true;
1462 }
1463 if (ignoreCase()) {
1464 if (printSeparator)
1465 out.print("|");
1466 out.print("ignore case");
1467 printSeparator = true;
1468 }
1469 if (multiline()) {
1470 if (printSeparator)
1471 out.print("|");
1472 out.print("multiline");
1473 printSeparator = true;
1474 }
1475 if (unicode()) {
1476 if (printSeparator)
1477 out.print("|");
1478 out.print("unicode");
1479 printSeparator = true;
1480 }
1481 if (sticky()) {
1482 if (printSeparator)
1483 out.print("|");
1484 out.print("sticky");
1485 }
1486 out.print(")");
1487 }
1488 out.print(":\n");
1489 if (m_body->m_callFrameSize)
1490 out.print(" callframe size: ", m_body->m_callFrameSize, "\n");
1491 m_body->dump(out, this);
1492}
1493
1494std::unique_ptr<CharacterClass> anycharCreate()
1495{
1496 auto characterClass = makeUnique<CharacterClass>();
1497 characterClass->m_ranges.append(CharacterRange(0x00, 0x7f));
1498 characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x10ffff));
1499 characterClass->m_characterWidths = CharacterClassWidths::HasBothBMPAndNonBMP;
1500 characterClass->m_anyCharacter = true;
1501 return characterClass;
1502}
1503
1504} } // namespace JSC::Yarr
1505