1/*
2 * Copyright (C) 2009, 2013-2016 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga ([email protected]), University of Szeged
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "YarrPattern.h"
29
30#include "Options.h"
31#include "Yarr.h"
32#include "YarrCanonicalize.h"
33#include "YarrParser.h"
34#include <wtf/DataLog.h>
35#include <wtf/Optional.h>
36#include <wtf/StackPointer.h>
37#include <wtf/Threading.h>
38#include <wtf/Vector.h>
39
40namespace JSC { namespace Yarr {
41
42#include "RegExpJitTables.h"
43
44class CharacterClassConstructor {
45public:
46 CharacterClassConstructor(bool isCaseInsensitive, CanonicalMode canonicalMode)
47 : m_isCaseInsensitive(isCaseInsensitive)
48 , m_anyCharacter(false)
49 , m_characterWidths(CharacterClassWidths::Unknown)
50 , m_canonicalMode(canonicalMode)
51 {
52 }
53
54 void reset()
55 {
56 m_matches.clear();
57 m_ranges.clear();
58 m_matchesUnicode.clear();
59 m_rangesUnicode.clear();
60 m_anyCharacter = false;
61 m_characterWidths = CharacterClassWidths::Unknown;
62 }
63
64 void append(const CharacterClass* other)
65 {
66 for (size_t i = 0; i < other->m_matches.size(); ++i)
67 addSorted(m_matches, other->m_matches[i]);
68 for (size_t i = 0; i < other->m_ranges.size(); ++i)
69 addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end);
70 for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i)
71 addSorted(m_matchesUnicode, other->m_matchesUnicode[i]);
72 for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i)
73 addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
74 }
75
76 void appendInverted(const CharacterClass* other)
77 {
78 auto addSortedInverted = [&](UChar32 min, UChar32 max,
79 const Vector<UChar32>& srcMatches, const Vector<CharacterRange>& srcRanges,
80 Vector<UChar32>& destMatches, Vector<CharacterRange>& destRanges) {
81
82 auto addSortedMatchOrRange = [&](UChar32 lo, UChar32 hiPlusOne) {
83 if (lo < hiPlusOne) {
84 if (lo + 1 == hiPlusOne)
85 addSorted(destMatches, lo);
86 else
87 addSortedRange(destRanges, lo, hiPlusOne - 1);
88 }
89 };
90
91 UChar32 lo = min;
92 size_t matchesIndex = 0;
93 size_t rangesIndex = 0;
94 bool matchesRemaining = matchesIndex < srcMatches.size();
95 bool rangesRemaining = rangesIndex < srcRanges.size();
96
97 if (!matchesRemaining && !rangesRemaining) {
98 addSortedMatchOrRange(min, max + 1);
99 return;
100 }
101
102 while (matchesRemaining || rangesRemaining) {
103 UChar32 hiPlusOne;
104 UChar32 nextLo;
105
106 if (matchesRemaining
107 && (!rangesRemaining || srcMatches[matchesIndex] < srcRanges[rangesIndex].begin)) {
108 hiPlusOne = srcMatches[matchesIndex];
109 nextLo = hiPlusOne + 1;
110 ++matchesIndex;
111 matchesRemaining = matchesIndex < srcMatches.size();
112 } else {
113 hiPlusOne = srcRanges[rangesIndex].begin;
114 nextLo = srcRanges[rangesIndex].end + 1;
115 ++rangesIndex;
116 rangesRemaining = rangesIndex < srcRanges.size();
117 }
118
119 addSortedMatchOrRange(lo, hiPlusOne);
120
121 lo = nextLo;
122 }
123
124 addSortedMatchOrRange(lo, max + 1);
125 };
126
127 addSortedInverted(0, 0x7f, other->m_matches, other->m_ranges, m_matches, m_ranges);
128 addSortedInverted(0x80, 0x10ffff, other->m_matchesUnicode, other->m_rangesUnicode, m_matchesUnicode, m_rangesUnicode);
129 }
130
131 void putChar(UChar32 ch)
132 {
133 if (!m_isCaseInsensitive) {
134 addSorted(ch);
135 return;
136 }
137
138 if (m_canonicalMode == CanonicalMode::UCS2 && isASCII(ch)) {
139 // Handle ASCII cases.
140 if (isASCIIAlpha(ch)) {
141 addSorted(m_matches, toASCIIUpper(ch));
142 addSorted(m_matches, toASCIILower(ch));
143 } else
144 addSorted(m_matches, ch);
145 return;
146 }
147
148 // Add multiple matches, if necessary.
149 const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_canonicalMode);
150 if (info->type == CanonicalizeUnique)
151 addSorted(ch);
152 else
153 putUnicodeIgnoreCase(ch, info);
154 }
155
156 void putUnicodeIgnoreCase(UChar32 ch, const CanonicalizationRange* info)
157 {
158 ASSERT(m_isCaseInsensitive);
159 ASSERT(ch >= info->begin && ch <= info->end);
160 ASSERT(info->type != CanonicalizeUnique);
161 if (info->type == CanonicalizeSet) {
162 for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set)
163 addSorted(ch);
164 } else {
165 addSorted(ch);
166 addSorted(getCanonicalPair(info, ch));
167 }
168 }
169
170 void putRange(UChar32 lo, UChar32 hi)
171 {
172 if (isASCII(lo)) {
173 char asciiLo = lo;
174 char asciiHi = std::min(hi, (UChar32)0x7f);
175 addSortedRange(m_ranges, lo, asciiHi);
176
177 if (m_isCaseInsensitive) {
178 if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
179 addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
180 if ((asciiLo <= 'z') && (asciiHi >= 'a'))
181 addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
182 }
183 }
184 if (isASCII(hi))
185 return;
186
187 lo = std::max(lo, (UChar32)0x80);
188 addSortedRange(m_rangesUnicode, lo, hi);
189
190 if (!m_isCaseInsensitive)
191 return;
192
193 const CanonicalizationRange* info = canonicalRangeInfoFor(lo, m_canonicalMode);
194 while (true) {
195 // Handle the range [lo .. end]
196 UChar32 end = std::min<UChar32>(info->end, hi);
197
198 switch (info->type) {
199 case CanonicalizeUnique:
200 // Nothing to do - no canonical equivalents.
201 break;
202 case CanonicalizeSet: {
203 UChar ch;
204 for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set)
205 addSorted(m_matchesUnicode, ch);
206 break;
207 }
208 case CanonicalizeRangeLo:
209 addSortedRange(m_rangesUnicode, lo + info->value, end + info->value);
210 break;
211 case CanonicalizeRangeHi:
212 addSortedRange(m_rangesUnicode, lo - info->value, end - info->value);
213 break;
214 case CanonicalizeAlternatingAligned:
215 // Use addSortedRange since there is likely an abutting range to combine with.
216 if (lo & 1)
217 addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
218 if (!(end & 1))
219 addSortedRange(m_rangesUnicode, end + 1, end + 1);
220 break;
221 case CanonicalizeAlternatingUnaligned:
222 // Use addSortedRange since there is likely an abutting range to combine with.
223 if (!(lo & 1))
224 addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
225 if (end & 1)
226 addSortedRange(m_rangesUnicode, end + 1, end + 1);
227 break;
228 }
229
230 if (hi == end)
231 return;
232
233 ++info;
234 lo = info->begin;
235 };
236
237 }
238
239 std::unique_ptr<CharacterClass> charClass()
240 {
241 coalesceTables();
242
243 auto characterClass = std::make_unique<CharacterClass>();
244
245 characterClass->m_matches.swap(m_matches);
246 characterClass->m_ranges.swap(m_ranges);
247 characterClass->m_matchesUnicode.swap(m_matchesUnicode);
248 characterClass->m_rangesUnicode.swap(m_rangesUnicode);
249 characterClass->m_anyCharacter = anyCharacter();
250 characterClass->m_characterWidths = characterWidths();
251
252 m_anyCharacter = false;
253 m_characterWidths = CharacterClassWidths::Unknown;
254
255 return characterClass;
256 }
257
258private:
259 void addSorted(UChar32 ch)
260 {
261 addSorted(isASCII(ch) ? m_matches : m_matchesUnicode, ch);
262 }
263
264 void addSorted(Vector<UChar32>& matches, UChar32 ch)
265 {
266 unsigned pos = 0;
267 unsigned range = matches.size();
268
269 m_characterWidths |= (U_IS_BMP(ch) ? CharacterClassWidths::HasBMPChars : CharacterClassWidths::HasNonBMPChars);
270
271 // binary chop, find position to insert char.
272 while (range) {
273 unsigned index = range >> 1;
274
275 int val = matches[pos+index] - ch;
276 if (!val)
277 return;
278 else if (val > 0) {
279 if (val == 1) {
280 UChar32 lo = ch;
281 UChar32 hi = ch + 1;
282 matches.remove(pos + index);
283 if (pos + index > 0 && matches[pos + index - 1] == ch - 1) {
284 lo = ch - 1;
285 matches.remove(pos + index - 1);
286 }
287 addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
288 return;
289 }
290 range = index;
291 } else {
292 if (val == -1) {
293 UChar32 lo = ch - 1;
294 UChar32 hi = ch;
295 matches.remove(pos + index);
296 if (pos + index + 1 < matches.size() && matches[pos + index + 1] == ch + 1) {
297 hi = ch + 1;
298 matches.remove(pos + index + 1);
299 }
300 addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
301 return;
302 }
303 pos += (index+1);
304 range -= (index+1);
305 }
306 }
307
308 if (pos == matches.size())
309 matches.append(ch);
310 else
311 matches.insert(pos, ch);
312 }
313
314 void addSortedRange(Vector<CharacterRange>& ranges, UChar32 lo, UChar32 hi)
315 {
316 size_t end = ranges.size();
317
318 if (U_IS_BMP(lo))
319 m_characterWidths |= CharacterClassWidths::HasBMPChars;
320 if (!U_IS_BMP(hi))
321 m_characterWidths |= CharacterClassWidths::HasNonBMPChars;
322
323 // Simple linear scan - I doubt there are that many ranges anyway...
324 // feel free to fix this with something faster (eg binary chop).
325 for (size_t i = 0; i < end; ++i) {
326 // does the new range fall before the current position in the array
327 if (hi < ranges[i].begin) {
328 // Concatenate appending ranges.
329 if (hi == (ranges[i].begin - 1)) {
330 ranges[i].begin = lo;
331 return;
332 }
333 ranges.insert(i, CharacterRange(lo, hi));
334 return;
335 }
336 // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
337 // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
338 // end of the last range they concatenate, which is just as good.
339 if (lo <= (ranges[i].end + 1)) {
340 // found an intersect! we'll replace this entry in the array.
341 ranges[i].begin = std::min(ranges[i].begin, lo);
342 ranges[i].end = std::max(ranges[i].end, hi);
343
344 mergeRangesFrom(ranges, i);
345 return;
346 }
347 }
348
349 // CharacterRange comes after all existing ranges.
350 ranges.append(CharacterRange(lo, hi));
351 }
352
353 void mergeRangesFrom(Vector<CharacterRange>& ranges, size_t index)
354 {
355 unsigned next = index + 1;
356
357 // each iteration of the loop we will either remove something from the list, or break out of the loop.
358 while (next < ranges.size()) {
359 if (ranges[next].begin <= (ranges[index].end + 1)) {
360 // the next entry now overlaps / concatenates with this one.
361 ranges[index].end = std::max(ranges[index].end, ranges[next].end);
362 ranges.remove(next);
363 } else
364 break;
365 }
366
367 }
368
369 void coalesceTables()
370 {
371 auto coalesceMatchesAndRanges = [&](Vector<UChar32>& matches, Vector<CharacterRange>& ranges) {
372
373 size_t matchesIndex = 0;
374 size_t rangesIndex = 0;
375
376 while (matchesIndex < matches.size() && rangesIndex < ranges.size()) {
377 while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].begin - 1)
378 matchesIndex++;
379
380 if (matchesIndex < matches.size() && matches[matchesIndex] == ranges[rangesIndex].begin - 1) {
381 ranges[rangesIndex].begin = matches[matchesIndex];
382 matches.remove(matchesIndex);
383 }
384
385 while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].end + 1)
386 matchesIndex++;
387
388 if (matchesIndex < matches.size()) {
389 if (matches[matchesIndex] == ranges[rangesIndex].end + 1) {
390 ranges[rangesIndex].end = matches[matchesIndex];
391 matches.remove(matchesIndex);
392
393 mergeRangesFrom(ranges, rangesIndex);
394 } else
395 matchesIndex++;
396 }
397 }
398 };
399
400 coalesceMatchesAndRanges(m_matches, m_ranges);
401 coalesceMatchesAndRanges(m_matchesUnicode, m_rangesUnicode);
402
403 if (!m_matches.size() && !m_matchesUnicode.size()
404 && m_ranges.size() == 1 && m_rangesUnicode.size() == 1
405 && m_ranges[0].begin == 0 && m_ranges[0].end == 0x7f
406 && m_rangesUnicode[0].begin == 0x80 && m_rangesUnicode[0].end == 0x10ffff)
407 m_anyCharacter = true;
408 }
409
410 bool hasNonBMPCharacters()
411 {
412 return m_characterWidths & CharacterClassWidths::HasNonBMPChars;
413 }
414
415 CharacterClassWidths characterWidths()
416 {
417 return m_characterWidths;
418 }
419
420 bool anyCharacter()
421 {
422 return m_anyCharacter;
423 }
424
425 bool m_isCaseInsensitive : 1;
426 bool m_anyCharacter : 1;
427 CharacterClassWidths m_characterWidths;
428
429 CanonicalMode m_canonicalMode;
430
431 Vector<UChar32> m_matches;
432 Vector<CharacterRange> m_ranges;
433 Vector<UChar32> m_matchesUnicode;
434 Vector<CharacterRange> m_rangesUnicode;
435};
436
437class YarrPatternConstructor {
438public:
439 YarrPatternConstructor(YarrPattern& pattern, void* stackLimit)
440 : m_pattern(pattern)
441 , m_characterClassConstructor(pattern.ignoreCase(), pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2)
442 , m_stackLimit(stackLimit)
443 {
444 auto body = std::make_unique<PatternDisjunction>();
445 m_pattern.m_body = body.get();
446 m_alternative = body->addNewAlternative();
447 m_pattern.m_disjunctions.append(WTFMove(body));
448 }
449
450 ~YarrPatternConstructor()
451 {
452 }
453
454 void resetForReparsing()
455 {
456 m_pattern.resetForReparsing();
457 m_characterClassConstructor.reset();
458
459 auto body = std::make_unique<PatternDisjunction>();
460 m_pattern.m_body = body.get();
461 m_alternative = body->addNewAlternative();
462 m_pattern.m_disjunctions.append(WTFMove(body));
463 }
464
465 void saveUnmatchedNamedForwardReferences()
466 {
467 m_unmatchedNamedForwardReferences.shrink(0);
468
469 for (auto& entry : m_pattern.m_namedForwardReferences) {
470 if (!m_pattern.m_captureGroupNames.contains(entry))
471 m_unmatchedNamedForwardReferences.append(entry);
472 }
473 }
474
475 void assertionBOL()
476 {
477 if (!m_alternative->m_terms.size() && !m_invertParentheticalAssertion) {
478 m_alternative->m_startsWithBOL = true;
479 m_alternative->m_containsBOL = true;
480 m_pattern.m_containsBOL = true;
481 }
482 m_alternative->m_terms.append(PatternTerm::BOL());
483 }
484 void assertionEOL()
485 {
486 m_alternative->m_terms.append(PatternTerm::EOL());
487 }
488 void assertionWordBoundary(bool invert)
489 {
490 m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
491 }
492
493 void atomPatternCharacter(UChar32 ch)
494 {
495 // We handle case-insensitive checking of unicode characters which do have both
496 // cases by handling them as if they were defined using a CharacterClass.
497 if (!m_pattern.ignoreCase() || (isASCII(ch) && !m_pattern.unicode())) {
498 m_alternative->m_terms.append(PatternTerm(ch));
499 return;
500 }
501
502 const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2);
503 if (info->type == CanonicalizeUnique) {
504 m_alternative->m_terms.append(PatternTerm(ch));
505 return;
506 }
507
508 m_characterClassConstructor.putUnicodeIgnoreCase(ch, info);
509 auto newCharacterClass = m_characterClassConstructor.charClass();
510 m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), false));
511 m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
512 }
513
514 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
515 {
516 switch (classID) {
517 case BuiltInCharacterClassID::DigitClassID:
518 m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert));
519 break;
520 case BuiltInCharacterClassID::SpaceClassID:
521 m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
522 break;
523 case BuiltInCharacterClassID::WordClassID:
524 if (m_pattern.unicode() && m_pattern.ignoreCase())
525 m_alternative->m_terms.append(PatternTerm(m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), invert));
526 else
527 m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
528 break;
529 case BuiltInCharacterClassID::DotClassID:
530 ASSERT(!invert);
531 if (m_pattern.dotAll())
532 m_alternative->m_terms.append(PatternTerm(m_pattern.anyCharacterClass(), false));
533 else
534 m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), true));
535 break;
536 default:
537 m_alternative->m_terms.append(PatternTerm(m_pattern.unicodeCharacterClassFor(classID), invert));
538 break;
539 }
540 }
541
542 void atomCharacterClassBegin(bool invert = false)
543 {
544 m_invertCharacterClass = invert;
545 }
546
547 void atomCharacterClassAtom(UChar32 ch)
548 {
549 m_characterClassConstructor.putChar(ch);
550 }
551
552 void atomCharacterClassRange(UChar32 begin, UChar32 end)
553 {
554 m_characterClassConstructor.putRange(begin, end);
555 }
556
557 void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
558 {
559 ASSERT(classID != BuiltInCharacterClassID::DotClassID);
560
561 switch (classID) {
562 case BuiltInCharacterClassID::DigitClassID:
563 m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
564 break;
565
566 case BuiltInCharacterClassID::SpaceClassID:
567 m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
568 break;
569
570 case BuiltInCharacterClassID::WordClassID:
571 if (m_pattern.unicode() && m_pattern.ignoreCase())
572 m_characterClassConstructor.append(invert ? m_pattern.nonwordUnicodeIgnoreCaseCharCharacterClass() : m_pattern.wordUnicodeIgnoreCaseCharCharacterClass());
573 else
574 m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
575 break;
576
577 default:
578 if (!invert)
579 m_characterClassConstructor.append(m_pattern.unicodeCharacterClassFor(classID));
580 else
581 m_characterClassConstructor.appendInverted(m_pattern.unicodeCharacterClassFor(classID));
582 }
583 }
584
585 void atomCharacterClassEnd()
586 {
587 auto newCharacterClass = m_characterClassConstructor.charClass();
588
589 if (!m_invertCharacterClass && newCharacterClass.get()->m_anyCharacter) {
590 m_alternative->m_terms.append(PatternTerm(m_pattern.anyCharacterClass(), false));
591 return;
592 }
593 m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), m_invertCharacterClass));
594 m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
595 }
596
597 void atomParenthesesSubpatternBegin(bool capture = true, Optional<String> optGroupName = WTF::nullopt)
598 {
599 unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
600 if (capture) {
601 m_pattern.m_numSubpatterns++;
602 if (optGroupName) {
603 while (m_pattern.m_captureGroupNames.size() < subpatternId)
604 m_pattern.m_captureGroupNames.append(String());
605 m_pattern.m_captureGroupNames.append(optGroupName.value());
606 m_pattern.m_namedGroupToParenIndex.add(optGroupName.value(), subpatternId);
607 }
608 } else
609 ASSERT(!optGroupName);
610
611 auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative);
612 m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, false));
613 m_alternative = parenthesesDisjunction->addNewAlternative();
614 m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction));
615 }
616
617 void atomParentheticalAssertionBegin(bool invert = false)
618 {
619 auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative);
620 m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction.get(), false, invert));
621 m_alternative = parenthesesDisjunction->addNewAlternative();
622 m_invertParentheticalAssertion = invert;
623 m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction));
624 }
625
626 void atomParenthesesEnd()
627 {
628 ASSERT(m_alternative->m_parent);
629 ASSERT(m_alternative->m_parent->m_parent);
630
631 PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent;
632 m_alternative = m_alternative->m_parent->m_parent;
633
634 PatternTerm& lastTerm = m_alternative->lastTerm();
635
636 unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size();
637 unsigned numBOLAnchoredAlts = 0;
638
639 for (unsigned i = 0; i < numParenAlternatives; i++) {
640 // Bubble up BOL flags
641 if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL)
642 numBOLAnchoredAlts++;
643 }
644
645 if (numBOLAnchoredAlts) {
646 m_alternative->m_containsBOL = true;
647 // If all the alternatives in parens start with BOL, then so does this one
648 if (numBOLAnchoredAlts == numParenAlternatives)
649 m_alternative->m_startsWithBOL = true;
650 }
651
652 lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
653 m_invertParentheticalAssertion = false;
654 }
655
656 void atomBackReference(unsigned subpatternId)
657 {
658 ASSERT(subpatternId);
659 m_pattern.m_containsBackreferences = true;
660 m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
661
662 if (subpatternId > m_pattern.m_numSubpatterns) {
663 m_alternative->m_terms.append(PatternTerm::ForwardReference());
664 return;
665 }
666
667 PatternAlternative* currentAlternative = m_alternative;
668 ASSERT(currentAlternative);
669
670 // Note to self: if we waited until the AST was baked, we could also remove forwards refs
671 while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
672 PatternTerm& term = currentAlternative->lastTerm();
673 ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion));
674
675 if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) {
676 m_alternative->m_terms.append(PatternTerm::ForwardReference());
677 return;
678 }
679 }
680
681 m_alternative->m_terms.append(PatternTerm(subpatternId));
682 }
683
684 void atomNamedBackReference(const String& subpatternName)
685 {
686 ASSERT(m_pattern.m_namedGroupToParenIndex.find(subpatternName) != m_pattern.m_namedGroupToParenIndex.end());
687 atomBackReference(m_pattern.m_namedGroupToParenIndex.get(subpatternName));
688 }
689
690 bool isValidNamedForwardReference(const String& subpatternName)
691 {
692 return !m_unmatchedNamedForwardReferences.contains(subpatternName);
693 }
694
695 void atomNamedForwardReference(const String& subpatternName)
696 {
697 m_pattern.m_namedForwardReferences.appendIfNotContains(subpatternName);
698 m_alternative->m_terms.append(PatternTerm::ForwardReference());
699 }
700
701 // deep copy the argument disjunction. If filterStartsWithBOL is true,
702 // skip alternatives with m_startsWithBOL set true.
703 PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
704 {
705 std::unique_ptr<PatternDisjunction> newDisjunction;
706 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
707 PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
708 if (!filterStartsWithBOL || !alternative->m_startsWithBOL) {
709 if (!newDisjunction) {
710 newDisjunction = std::make_unique<PatternDisjunction>();
711 newDisjunction->m_parent = disjunction->m_parent;
712 }
713 PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
714 newAlternative->m_terms.reserveInitialCapacity(alternative->m_terms.size());
715 for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
716 newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL));
717 }
718 }
719
720 if (!newDisjunction)
721 return 0;
722
723 PatternDisjunction* copiedDisjunction = newDisjunction.get();
724 m_pattern.m_disjunctions.append(WTFMove(newDisjunction));
725 return copiedDisjunction;
726 }
727
728 PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false)
729 {
730 if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
731 return PatternTerm(term);
732
733 PatternTerm termCopy = term;
734 termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL);
735 m_pattern.m_hasCopiedParenSubexpressions = true;
736 return termCopy;
737 }
738
739 void quantifyAtom(unsigned min, unsigned max, bool greedy)
740 {
741 ASSERT(min <= max);
742 ASSERT(m_alternative->m_terms.size());
743
744 if (!max) {
745 m_alternative->removeLastTerm();
746 return;
747 }
748
749 PatternTerm& term = m_alternative->lastTerm();
750 ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
751 ASSERT(term.quantityMinCount == 1 && term.quantityMaxCount == 1 && term.quantityType == QuantifierFixedCount);
752
753 if (term.type == PatternTerm::TypeParentheticalAssertion) {
754 // If an assertion is quantified with a minimum count of zero, it can simply be removed.
755 // This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never
756 // results in any input being consumed, however the continuation passed to the assertion
757 // (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will
758 // reject all zero length matches (see step 2.1). A match from the continuation of the
759 // expression will still be accepted regardless (via steps 8a and 11) - the upshot of all
760 // this is that matches from the assertion are not required, and won't be accepted anyway,
761 // so no need to ever run it.
762 if (!min)
763 m_alternative->removeLastTerm();
764 // We never need to run an assertion more than once. Subsequent interations will be run
765 // with the same start index (since assertions are non-capturing) and the same captures
766 // (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the
767 // same result and captures. If the first match succeeds then the subsequent (min - 1)
768 // matches will too. Any additional optional matches will fail (on the same basis as the
769 // minimum zero quantified assertions, above), but this will still result in a match.
770 return;
771 }
772
773 if (min == max)
774 term.quantify(min, max, QuantifierFixedCount);
775 else if (!min || (term.type == PatternTerm::TypeParenthesesSubpattern && m_pattern.m_hasCopiedParenSubexpressions))
776 term.quantify(min, max, greedy ? QuantifierGreedy : QuantifierNonGreedy);
777 else {
778 term.quantify(min, min, QuantifierFixedCount);
779 m_alternative->m_terms.append(copyTerm(term));
780 // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
781 m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
782 if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
783 m_alternative->lastTerm().parentheses.isCopy = true;
784 }
785 }
786
787 void disjunction()
788 {
789 m_alternative = m_alternative->m_parent->addNewAlternative();
790 }
791
792 ErrorCode setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition, unsigned& newCallFrameSize) WARN_UNUSED_RETURN
793 {
794 if (UNLIKELY(!isSafeToRecurse()))
795 return ErrorCode::TooManyDisjunctions;
796
797 ErrorCode error = ErrorCode::NoError;
798 alternative->m_hasFixedSize = true;
799 Checked<unsigned, RecordOverflow> currentInputPosition = initialInputPosition;
800
801 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
802 PatternTerm& term = alternative->m_terms[i];
803
804 switch (term.type) {
805 case PatternTerm::TypeAssertionBOL:
806 case PatternTerm::TypeAssertionEOL:
807 case PatternTerm::TypeAssertionWordBoundary:
808 term.inputPosition = currentInputPosition.unsafeGet();
809 break;
810
811 case PatternTerm::TypeBackReference:
812 term.inputPosition = currentInputPosition.unsafeGet();
813 term.frameLocation = currentCallFrameSize;
814 currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference;
815 alternative->m_hasFixedSize = false;
816 break;
817
818 case PatternTerm::TypeForwardReference:
819 break;
820
821 case PatternTerm::TypePatternCharacter:
822 term.inputPosition = currentInputPosition.unsafeGet();
823 if (term.quantityType != QuantifierFixedCount) {
824 term.frameLocation = currentCallFrameSize;
825 currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
826 alternative->m_hasFixedSize = false;
827 } else if (m_pattern.unicode()) {
828 Checked<unsigned, RecordOverflow> tempCount = term.quantityMaxCount;
829 tempCount *= U16_LENGTH(term.patternCharacter);
830 if (tempCount.hasOverflowed())
831 return ErrorCode::OffsetTooLarge;
832 currentInputPosition += tempCount;
833 } else
834 currentInputPosition += term.quantityMaxCount;
835 break;
836
837 case PatternTerm::TypeCharacterClass:
838 term.inputPosition = currentInputPosition.unsafeGet();
839 if (term.quantityType != QuantifierFixedCount) {
840 term.frameLocation = currentCallFrameSize;
841 currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
842 alternative->m_hasFixedSize = false;
843 } else if (m_pattern.unicode()) {
844 term.frameLocation = currentCallFrameSize;
845 currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
846 if (term.characterClass->hasOneCharacterSize() && !term.invert()) {
847 Checked<unsigned, RecordOverflow> tempCount = term.quantityMaxCount;
848 tempCount *= term.characterClass->hasNonBMPCharacters() ? 2 : 1;
849 if (tempCount.hasOverflowed())
850 return ErrorCode::OffsetTooLarge;
851 currentInputPosition += tempCount;
852 } else {
853 currentInputPosition += term.quantityMaxCount;
854 alternative->m_hasFixedSize = false;
855 }
856 } else
857 currentInputPosition += term.quantityMaxCount;
858 break;
859
860 case PatternTerm::TypeParenthesesSubpattern:
861 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
862 term.frameLocation = currentCallFrameSize;
863 if (term.quantityMaxCount == 1 && !term.parentheses.isCopy) {
864 currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
865 error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
866 if (hasError(error))
867 return error;
868 // If quantity is fixed, then pre-check its minimum size.
869 if (term.quantityType == QuantifierFixedCount)
870 currentInputPosition += term.parentheses.disjunction->m_minimumSize;
871 term.inputPosition = currentInputPosition.unsafeGet();
872 } else if (term.parentheses.isTerminal) {
873 currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
874 error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
875 if (hasError(error))
876 return error;
877 term.inputPosition = currentInputPosition.unsafeGet();
878 } else {
879 term.inputPosition = currentInputPosition.unsafeGet();
880 currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
881 error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
882 if (hasError(error))
883 return error;
884 }
885 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
886 alternative->m_hasFixedSize = false;
887 break;
888
889 case PatternTerm::TypeParentheticalAssertion:
890 term.inputPosition = currentInputPosition.unsafeGet();
891 term.frameLocation = currentCallFrameSize;
892 error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet(), currentCallFrameSize);
893 if (hasError(error))
894 return error;
895 break;
896
897 case PatternTerm::TypeDotStarEnclosure:
898 ASSERT(!m_pattern.m_saveInitialStartValue);
899 alternative->m_hasFixedSize = false;
900 term.inputPosition = initialInputPosition;
901 m_pattern.m_initialStartValueFrameLocation = currentCallFrameSize;
902 currentCallFrameSize += YarrStackSpaceForDotStarEnclosure;
903 m_pattern.m_saveInitialStartValue = true;
904 break;
905 }
906 if (currentInputPosition.hasOverflowed())
907 return ErrorCode::OffsetTooLarge;
908 }
909
910 alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet();
911 newCallFrameSize = currentCallFrameSize;
912 return error;
913 }
914
915 ErrorCode setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition, unsigned& callFrameSize)
916 {
917 if (UNLIKELY(!isSafeToRecurse()))
918 return ErrorCode::TooManyDisjunctions;
919
920 if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1))
921 initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative;
922
923 unsigned minimumInputSize = UINT_MAX;
924 unsigned maximumCallFrameSize = 0;
925 bool hasFixedSize = true;
926 ErrorCode error = ErrorCode::NoError;
927
928 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
929 PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
930 unsigned currentAlternativeCallFrameSize;
931 error = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition, currentAlternativeCallFrameSize);
932 if (hasError(error))
933 return error;
934 minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize);
935 maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize);
936 hasFixedSize &= alternative->m_hasFixedSize;
937 if (alternative->m_minimumSize > INT_MAX)
938 m_pattern.m_containsUnsignedLengthPattern = true;
939 }
940
941 ASSERT(minimumInputSize != UINT_MAX);
942 ASSERT(maximumCallFrameSize >= initialCallFrameSize);
943
944 disjunction->m_hasFixedSize = hasFixedSize;
945 disjunction->m_minimumSize = minimumInputSize;
946 disjunction->m_callFrameSize = maximumCallFrameSize;
947 callFrameSize = maximumCallFrameSize;
948 return error;
949 }
950
951 ErrorCode setupOffsets()
952 {
953 // FIXME: Yarr should not use the stack to handle subpatterns (rdar://problem/26436314).
954 unsigned ignoredCallFrameSize;
955 return setupDisjunctionOffsets(m_pattern.m_body, 0, 0, ignoredCallFrameSize);
956 }
957
958 // This optimization identifies sets of parentheses that we will never need to backtrack.
959 // In these cases we do not need to store state from prior iterations.
960 // We can presently avoid backtracking for:
961 // * where the parens are at the end of the regular expression (last term in any of the
962 // alternatives of the main body disjunction).
963 // * where the parens are non-capturing, and quantified unbounded greedy (*).
964 // * where the parens do not contain any capturing subpatterns.
965 void checkForTerminalParentheses()
966 {
967 // This check is much too crude; should be just checking whether the candidate
968 // node contains nested capturing subpatterns, not the whole expression!
969 if (m_pattern.m_numSubpatterns)
970 return;
971
972 Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
973 for (size_t i = 0; i < alternatives.size(); ++i) {
974 Vector<PatternTerm>& terms = alternatives[i]->m_terms;
975 if (terms.size()) {
976 PatternTerm& term = terms.last();
977 if (term.type == PatternTerm::TypeParenthesesSubpattern
978 && term.quantityType == QuantifierGreedy
979 && term.quantityMinCount == 0
980 && term.quantityMaxCount == quantifyInfinite
981 && !term.capture())
982 term.parentheses.isTerminal = true;
983 }
984 }
985 }
986
987 void optimizeBOL()
988 {
989 // Look for expressions containing beginning of line (^) anchoring and unroll them.
990 // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops
991 // This code relies on the parsing code tagging alternatives with m_containsBOL and
992 // m_startsWithBOL and rolling those up to containing alternatives.
993 // At this point, this is only valid for non-multiline expressions.
994 PatternDisjunction* disjunction = m_pattern.m_body;
995
996 if (!m_pattern.m_containsBOL || m_pattern.multiline())
997 return;
998
999 PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true);
1000
1001 // Set alternatives in disjunction to "onceThrough"
1002 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt)
1003 disjunction->m_alternatives[alt]->setOnceThrough();
1004
1005 if (loopDisjunction) {
1006 // Move alternatives from loopDisjunction to disjunction
1007 for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt)
1008 disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt].release());
1009
1010 loopDisjunction->m_alternatives.clear();
1011 }
1012 }
1013
1014 bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t endIndex)
1015 {
1016 Vector<PatternTerm>& terms = alternative->m_terms;
1017
1018 ASSERT(endIndex <= terms.size());
1019 for (size_t termIndex = firstTermIndex; termIndex < endIndex; ++termIndex) {
1020 PatternTerm& term = terms[termIndex];
1021
1022 if (term.m_capture)
1023 return true;
1024
1025 if (term.type == PatternTerm::TypeParenthesesSubpattern) {
1026 PatternDisjunction* nestedDisjunction = term.parentheses.disjunction;
1027 for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) {
1028 if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt].get(), 0, nestedDisjunction->m_alternatives[alt]->m_terms.size()))
1029 return true;
1030 }
1031 }
1032 }
1033
1034 return false;
1035 }
1036
1037 // This optimization identifies alternatives in the form of
1038 // [^].*[?]<expression>.*[$] for expressions that don't have any
1039 // capturing terms. The alternative is changed to <expression>
1040 // followed by processing of the dot stars to find and adjust the
1041 // beginning and the end of the match.
1042 void optimizeDotStarWrappedExpressions()
1043 {
1044 Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
1045 if (alternatives.size() != 1)
1046 return;
1047
1048 CharacterClass* dotCharacterClass = m_pattern.dotAll() ? m_pattern.anyCharacterClass() : m_pattern.newlineCharacterClass();
1049 PatternAlternative* alternative = alternatives[0].get();
1050 Vector<PatternTerm>& terms = alternative->m_terms;
1051 if (terms.size() >= 3) {
1052 bool startsWithBOL = false;
1053 bool endsWithEOL = false;
1054 size_t termIndex, firstExpressionTerm;
1055
1056 termIndex = 0;
1057 if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) {
1058 startsWithBOL = true;
1059 ++termIndex;
1060 }
1061
1062 PatternTerm& firstNonAnchorTerm = terms[termIndex];
1063 if (firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass
1064 || firstNonAnchorTerm.characterClass != dotCharacterClass
1065 || firstNonAnchorTerm.quantityMinCount
1066 || firstNonAnchorTerm.quantityMaxCount != quantifyInfinite)
1067 return;
1068
1069 firstExpressionTerm = termIndex + 1;
1070
1071 termIndex = terms.size() - 1;
1072 if (terms[termIndex].type == PatternTerm::TypeAssertionEOL) {
1073 endsWithEOL = true;
1074 --termIndex;
1075 }
1076
1077 PatternTerm& lastNonAnchorTerm = terms[termIndex];
1078 if (lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass
1079 || lastNonAnchorTerm.characterClass != dotCharacterClass
1080 || lastNonAnchorTerm.quantityType != QuantifierGreedy
1081 || lastNonAnchorTerm.quantityMinCount
1082 || lastNonAnchorTerm.quantityMaxCount != quantifyInfinite)
1083 return;
1084
1085 size_t endIndex = termIndex;
1086 if (firstExpressionTerm >= endIndex)
1087 return;
1088
1089 if (!containsCapturingTerms(alternative, firstExpressionTerm, endIndex)) {
1090 for (termIndex = terms.size() - 1; termIndex >= endIndex; --termIndex)
1091 terms.remove(termIndex);
1092
1093 for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex)
1094 terms.remove(termIndex - 1);
1095
1096 terms.append(PatternTerm(startsWithBOL, endsWithEOL));
1097
1098 m_pattern.m_containsBOL = false;
1099 }
1100 }
1101 }
1102
1103private:
1104 bool isSafeToRecurse() const
1105 {
1106 if (!m_stackLimit)
1107 return true;
1108 ASSERT(Thread::current().stack().isGrowingDownward());
1109 int8_t* curr = reinterpret_cast<int8_t*>(currentStackPointer());
1110 int8_t* limit = reinterpret_cast<int8_t*>(m_stackLimit);
1111 return curr >= limit;
1112 }
1113
1114 YarrPattern& m_pattern;
1115 PatternAlternative* m_alternative;
1116 CharacterClassConstructor m_characterClassConstructor;
1117 Vector<String> m_unmatchedNamedForwardReferences;
1118 void* m_stackLimit;
1119 bool m_invertCharacterClass;
1120 bool m_invertParentheticalAssertion { false };
1121};
1122
1123ErrorCode YarrPattern::compile(const String& patternString, void* stackLimit)
1124{
1125 YarrPatternConstructor constructor(*this, stackLimit);
1126
1127 {
1128 ErrorCode error = parse(constructor, patternString, unicode());
1129 if (hasError(error))
1130 return error;
1131 }
1132
1133 // If the pattern contains illegal backreferences reset & reparse.
1134 // Quoting Netscape's "What's new in JavaScript 1.2",
1135 // "Note: if the number of left parentheses is less than the number specified
1136 // in \#, the \# is taken as an octal escape as described in the next row."
1137 if (containsIllegalBackReference() || containsIllegalNamedForwardReferences()) {
1138 if (unicode())
1139 return ErrorCode::InvalidBackreference;
1140
1141 unsigned numSubpatterns = m_numSubpatterns;
1142
1143 constructor.saveUnmatchedNamedForwardReferences();
1144 constructor.resetForReparsing();
1145 ErrorCode error = parse(constructor, patternString, unicode(), numSubpatterns);
1146 ASSERT_UNUSED(error, !hasError(error));
1147 ASSERT(numSubpatterns == m_numSubpatterns);
1148 }
1149
1150 constructor.checkForTerminalParentheses();
1151 constructor.optimizeDotStarWrappedExpressions();
1152 constructor.optimizeBOL();
1153
1154 {
1155 ErrorCode error = constructor.setupOffsets();
1156 if (hasError(error))
1157 return error;
1158 }
1159
1160 if (Options::dumpCompiledRegExpPatterns())
1161 dumpPattern(patternString);
1162
1163 return ErrorCode::NoError;
1164}
1165
1166YarrPattern::YarrPattern(const String& pattern, OptionSet<Flags> flags, ErrorCode& error, void* stackLimit)
1167 : m_containsBackreferences(false)
1168 , m_containsBOL(false)
1169 , m_containsUnsignedLengthPattern(false)
1170 , m_hasCopiedParenSubexpressions(false)
1171 , m_saveInitialStartValue(false)
1172 , m_flags(flags)
1173{
1174 ASSERT(m_flags != Flags::DeletedValue);
1175 error = compile(pattern, stackLimit);
1176}
1177
1178void indentForNestingLevel(PrintStream& out, unsigned nestingDepth)
1179{
1180 out.print(" ");
1181 for (; nestingDepth; --nestingDepth)
1182 out.print(" ");
1183}
1184
1185void dumpUChar32(PrintStream& out, UChar32 c)
1186{
1187 if (c >= ' '&& c <= 0xff)
1188 out.printf("'%c'", static_cast<char>(c));
1189 else
1190 out.printf("0x%04x", c);
1191}
1192
1193void dumpCharacterClass(PrintStream& out, YarrPattern* pattern, CharacterClass* characterClass)
1194{
1195 if (characterClass == pattern->anyCharacterClass())
1196 out.print("<any character>");
1197 else if (characterClass == pattern->newlineCharacterClass())
1198 out.print("<newline>");
1199 else if (characterClass == pattern->digitsCharacterClass())
1200 out.print("<digits>");
1201 else if (characterClass == pattern->spacesCharacterClass())
1202 out.print("<whitespace>");
1203 else if (characterClass == pattern->wordcharCharacterClass())
1204 out.print("<word>");
1205 else if (characterClass == pattern->wordUnicodeIgnoreCaseCharCharacterClass())
1206 out.print("<unicode word ignore case>");
1207 else if (characterClass == pattern->nondigitsCharacterClass())
1208 out.print("<non-digits>");
1209 else if (characterClass == pattern->nonspacesCharacterClass())
1210 out.print("<non-whitespace>");
1211 else if (characterClass == pattern->nonwordcharCharacterClass())
1212 out.print("<non-word>");
1213 else if (characterClass == pattern->nonwordUnicodeIgnoreCaseCharCharacterClass())
1214 out.print("<unicode non-word ignore case>");
1215 else {
1216 bool needMatchesRangesSeperator = false;
1217
1218 auto dumpMatches = [&] (const char* prefix, Vector<UChar32> matches) {
1219 size_t matchesSize = matches.size();
1220 if (matchesSize) {
1221 if (needMatchesRangesSeperator)
1222 out.print(",");
1223 needMatchesRangesSeperator = true;
1224
1225 out.print(prefix, ":(");
1226 for (size_t i = 0; i < matchesSize; ++i) {
1227 if (i)
1228 out.print(",");
1229 dumpUChar32(out, matches[i]);
1230 }
1231 out.print(")");
1232 }
1233 };
1234
1235 auto dumpRanges = [&] (const char* prefix, Vector<CharacterRange> ranges) {
1236 size_t rangeSize = ranges.size();
1237 if (rangeSize) {
1238 if (needMatchesRangesSeperator)
1239 out.print(",");
1240 needMatchesRangesSeperator = true;
1241
1242 out.print(prefix, " ranges:(");
1243 for (size_t i = 0; i < rangeSize; ++i) {
1244 if (i)
1245 out.print(",");
1246 CharacterRange range = ranges[i];
1247 out.print("(");
1248 dumpUChar32(out, range.begin);
1249 out.print("..");
1250 dumpUChar32(out, range.end);
1251 out.print(")");
1252 }
1253 out.print(")");
1254 }
1255 };
1256
1257 out.print("[");
1258 dumpMatches("ASCII", characterClass->m_matches);
1259 dumpRanges("ASCII", characterClass->m_ranges);
1260 dumpMatches("Unicode", characterClass->m_matchesUnicode);
1261 dumpRanges("Unicode", characterClass->m_rangesUnicode);
1262 out.print("]");
1263 }
1264}
1265
1266void PatternAlternative::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth)
1267{
1268 out.print("minimum size: ", m_minimumSize);
1269 if (m_hasFixedSize)
1270 out.print(",fixed size");
1271 if (m_onceThrough)
1272 out.print(",once through");
1273 if (m_startsWithBOL)
1274 out.print(",starts with ^");
1275 if (m_containsBOL)
1276 out.print(",contains ^");
1277 out.print("\n");
1278
1279 for (size_t i = 0; i < m_terms.size(); ++i)
1280 m_terms[i].dump(out, thisPattern, nestingDepth);
1281}
1282
1283void PatternTerm::dumpQuantifier(PrintStream& out)
1284{
1285 if (quantityType == QuantifierFixedCount && quantityMinCount == 1 && quantityMaxCount == 1)
1286 return;
1287 out.print(" {", quantityMinCount.unsafeGet());
1288 if (quantityMinCount != quantityMaxCount) {
1289 if (quantityMaxCount == UINT_MAX)
1290 out.print(",...");
1291 else
1292 out.print(",", quantityMaxCount.unsafeGet());
1293 }
1294 out.print("}");
1295 if (quantityType == QuantifierGreedy)
1296 out.print(" greedy");
1297 else if (quantityType == QuantifierNonGreedy)
1298 out.print(" non-greedy");
1299}
1300
1301void PatternTerm::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth)
1302{
1303 indentForNestingLevel(out, nestingDepth);
1304
1305 if (type != TypeParenthesesSubpattern && type != TypeParentheticalAssertion) {
1306 if (invert())
1307 out.print("not ");
1308 }
1309
1310 switch (type) {
1311 case TypeAssertionBOL:
1312 out.println("BOL");
1313 break;
1314 case TypeAssertionEOL:
1315 out.println("EOL");
1316 break;
1317 case TypeAssertionWordBoundary:
1318 out.println("word boundary");
1319 break;
1320 case TypePatternCharacter:
1321 out.printf("character ");
1322 out.printf("inputPosition %u ", inputPosition);
1323 if (thisPattern->ignoreCase() && isASCIIAlpha(patternCharacter)) {
1324 dumpUChar32(out, toASCIIUpper(patternCharacter));
1325 out.print("/");
1326 dumpUChar32(out, toASCIILower(patternCharacter));
1327 } else
1328 dumpUChar32(out, patternCharacter);
1329 dumpQuantifier(out);
1330 if (quantityType != QuantifierFixedCount)
1331 out.print(",frame location ", frameLocation);
1332 out.println();
1333 break;
1334 case TypeCharacterClass:
1335 out.print("character class ");
1336 out.printf("inputPosition %u ", inputPosition);
1337 dumpCharacterClass(out, thisPattern, characterClass);
1338 dumpQuantifier(out);
1339 if (quantityType != QuantifierFixedCount || thisPattern->unicode())
1340 out.print(",frame location ", frameLocation);
1341 out.println();
1342 break;
1343 case TypeBackReference:
1344 out.print("back reference to subpattern #", backReferenceSubpatternId);
1345 out.println(",frame location ", frameLocation);
1346 break;
1347 case TypeForwardReference:
1348 out.println("forward reference");
1349 break;
1350 case TypeParenthesesSubpattern:
1351 if (m_capture)
1352 out.print("captured ");
1353 else
1354 out.print("non-captured ");
1355
1356 FALLTHROUGH;
1357 case TypeParentheticalAssertion:
1358 if (m_invert)
1359 out.print("inverted ");
1360
1361 if (type == TypeParenthesesSubpattern)
1362 out.print("subpattern");
1363 else if (type == TypeParentheticalAssertion)
1364 out.print("assertion");
1365
1366 if (m_capture)
1367 out.print(" #", parentheses.subpatternId);
1368
1369 dumpQuantifier(out);
1370
1371 if (parentheses.isCopy)
1372 out.print(",copy");
1373
1374 if (parentheses.isTerminal)
1375 out.print(",terminal");
1376
1377 out.println(",frame location ", frameLocation);
1378
1379 if (parentheses.disjunction->m_alternatives.size() > 1) {
1380 indentForNestingLevel(out, nestingDepth + 1);
1381 unsigned alternativeFrameLocation = frameLocation;
1382 if (quantityMaxCount == 1 && !parentheses.isCopy)
1383 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
1384 else if (parentheses.isTerminal)
1385 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
1386 else
1387 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParentheses;
1388 out.println("alternative list,frame location ", alternativeFrameLocation);
1389 }
1390
1391 parentheses.disjunction->dump(out, thisPattern, nestingDepth + 1);
1392 break;
1393 case TypeDotStarEnclosure:
1394 out.println(".* enclosure,frame location ", thisPattern->m_initialStartValueFrameLocation);
1395 break;
1396 }
1397}
1398
1399void PatternDisjunction::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth = 0)
1400{
1401 unsigned alternativeCount = m_alternatives.size();
1402 for (unsigned i = 0; i < alternativeCount; ++i) {
1403 indentForNestingLevel(out, nestingDepth);
1404 if (alternativeCount > 1)
1405 out.print("alternative #", i, ": ");
1406 m_alternatives[i].get()->dump(out, thisPattern, nestingDepth + (alternativeCount > 1));
1407 }
1408}
1409
1410void YarrPattern::dumpPatternString(PrintStream& out, const String& patternString)
1411{
1412 out.print("/", patternString, "/");
1413
1414 if (global())
1415 out.print("g");
1416 if (ignoreCase())
1417 out.print("i");
1418 if (multiline())
1419 out.print("m");
1420 if (unicode())
1421 out.print("u");
1422 if (sticky())
1423 out.print("y");
1424}
1425
1426void YarrPattern::dumpPattern(const String& patternString)
1427{
1428 dumpPattern(WTF::dataFile(), patternString);
1429}
1430
1431void YarrPattern::dumpPattern(PrintStream& out, const String& patternString)
1432{
1433 out.print("RegExp pattern for ");
1434 dumpPatternString(out, patternString);
1435
1436 if (m_flags) {
1437 bool printSeperator = false;
1438 out.print(" (");
1439 if (global()) {
1440 out.print("global");
1441 printSeperator = true;
1442 }
1443 if (ignoreCase()) {
1444 if (printSeperator)
1445 out.print("|");
1446 out.print("ignore case");
1447 printSeperator = true;
1448 }
1449 if (multiline()) {
1450 if (printSeperator)
1451 out.print("|");
1452 out.print("multiline");
1453 printSeperator = true;
1454 }
1455 if (unicode()) {
1456 if (printSeperator)
1457 out.print("|");
1458 out.print("unicode");
1459 printSeperator = true;
1460 }
1461 if (sticky()) {
1462 if (printSeperator)
1463 out.print("|");
1464 out.print("sticky");
1465 printSeperator = true;
1466 }
1467 out.print(")");
1468 }
1469 out.print(":\n");
1470 if (m_body->m_callFrameSize)
1471 out.print(" callframe size: ", m_body->m_callFrameSize, "\n");
1472 m_body->dump(out, this);
1473}
1474
1475std::unique_ptr<CharacterClass> anycharCreate()
1476{
1477 auto characterClass = std::make_unique<CharacterClass>();
1478 characterClass->m_ranges.append(CharacterRange(0x00, 0x7f));
1479 characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x10ffff));
1480 characterClass->m_characterWidths = CharacterClassWidths::HasBothBMPAndNonBMP;
1481 characterClass->m_anyCharacter = true;
1482 return characterClass;
1483}
1484
1485} } // namespace JSC::Yarr
1486