1/*
2 * Copyright (C) 2009-2019 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#pragma once
27
28#include "Yarr.h"
29#include "YarrPattern.h"
30#include "YarrUnicodeProperties.h"
31#include <wtf/ASCIICType.h>
32#include <wtf/HashSet.h>
33#include <wtf/Optional.h>
34#include <wtf/text/StringBuilder.h>
35#include <wtf/text/WTFString.h>
36
37namespace JSC { namespace Yarr {
38
39// The Parser class should not be used directly - only via the Yarr::parse() method.
40template<class Delegate, typename CharType>
41class Parser {
42private:
43 template<class FriendDelegate>
44 friend ErrorCode parse(FriendDelegate&, const String& pattern, bool isUnicode, unsigned backReferenceLimit);
45
46 /*
47 * CharacterClassParserDelegate:
48 *
49 * The class CharacterClassParserDelegate is used in the parsing of character
50 * classes. This class handles detection of character ranges. This class
51 * implements enough of the delegate interface such that it can be passed to
52 * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused
53 * to perform the parsing of escape characters in character sets.
54 */
55 class CharacterClassParserDelegate {
56 public:
57 CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
58 : m_delegate(delegate)
59 , m_errorCode(err)
60 , m_state(Empty)
61 , m_character(0)
62 {
63 }
64
65 /*
66 * begin():
67 *
68 * Called at beginning of construction.
69 */
70 void begin(bool invert)
71 {
72 m_delegate.atomCharacterClassBegin(invert);
73 }
74
75 /*
76 * atomPatternCharacter():
77 *
78 * This method is called either from parseCharacterClass() (for an unescaped
79 * character in a character class), or from parseEscape(). In the former case
80 * the value true will be passed for the argument 'hyphenIsRange', and in this
81 * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
82 * is different to /[a\-z]/).
83 */
84 void atomPatternCharacter(UChar32 ch, bool hyphenIsRange = false)
85 {
86 switch (m_state) {
87 case AfterCharacterClass:
88 // Following a builtin character class we need look out for a hyphen.
89 // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
90 // If we see a hyphen following a charater class then unlike usual
91 // we'll report it to the delegate immediately, and put ourself into
92 // a poisoned state. Any following calls to add another character or
93 // character class will result in an error. (A hypen following a
94 // character-class is itself valid, but only at the end of a regex).
95 if (hyphenIsRange && ch == '-') {
96 m_delegate.atomCharacterClassAtom('-');
97 m_state = AfterCharacterClassHyphen;
98 return;
99 }
100 // Otherwise just fall through - cached character so treat this as Empty.
101 FALLTHROUGH;
102
103 case Empty:
104 m_character = ch;
105 m_state = CachedCharacter;
106 return;
107
108 case CachedCharacter:
109 if (hyphenIsRange && ch == '-')
110 m_state = CachedCharacterHyphen;
111 else {
112 m_delegate.atomCharacterClassAtom(m_character);
113 m_character = ch;
114 }
115 return;
116
117 case CachedCharacterHyphen:
118 if (ch < m_character) {
119 m_errorCode = ErrorCode::CharacterClassOutOfOrder;
120 return;
121 }
122 m_delegate.atomCharacterClassRange(m_character, ch);
123 m_state = Empty;
124 return;
125
126 // See coment in atomBuiltInCharacterClass below.
127 // This too is technically an error, per ECMA-262, and again we
128 // we chose to allow this. Note a subtlely here that while we
129 // diverge from the spec's definition of CharacterRange we do
130 // remain in compliance with the grammar. For example, consider
131 // the expression /[\d-a-z]/. We comply with the grammar in
132 // this case by not allowing a-z to be matched as a range.
133 case AfterCharacterClassHyphen:
134 m_delegate.atomCharacterClassAtom(ch);
135 m_state = Empty;
136 return;
137 }
138 }
139
140 /*
141 * atomBuiltInCharacterClass():
142 *
143 * Adds a built-in character class, called by parseEscape().
144 */
145 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
146 {
147 switch (m_state) {
148 case CachedCharacter:
149 // Flush the currently cached character, then fall through.
150 m_delegate.atomCharacterClassAtom(m_character);
151 FALLTHROUGH;
152 case Empty:
153 case AfterCharacterClass:
154 m_state = AfterCharacterClass;
155 m_delegate.atomCharacterClassBuiltIn(classID, invert);
156 return;
157
158 // If we hit either of these cases, we have an invalid range that
159 // looks something like /[x-\d]/ or /[\d-\d]/.
160 // According to ECMA-262 this should be a syntax error, but
161 // empirical testing shows this to break teh webz. Instead we
162 // comply with to the ECMA-262 grammar, and assume the grammar to
163 // have matched the range correctly, but tweak our interpretation
164 // of CharacterRange. Effectively we implicitly handle the hyphen
165 // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
166 case CachedCharacterHyphen:
167 m_delegate.atomCharacterClassAtom(m_character);
168 m_delegate.atomCharacterClassAtom('-');
169 FALLTHROUGH;
170 case AfterCharacterClassHyphen:
171 m_delegate.atomCharacterClassBuiltIn(classID, invert);
172 m_state = Empty;
173 return;
174 }
175 }
176
177 /*
178 * end():
179 *
180 * Called at end of construction.
181 */
182 void end()
183 {
184 if (m_state == CachedCharacter)
185 m_delegate.atomCharacterClassAtom(m_character);
186 else if (m_state == CachedCharacterHyphen) {
187 m_delegate.atomCharacterClassAtom(m_character);
188 m_delegate.atomCharacterClassAtom('-');
189 }
190 m_delegate.atomCharacterClassEnd();
191 }
192
193 // parseEscape() should never call these delegate methods when
194 // invoked with inCharacterClass set.
195 NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); }
196 NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); }
197 NO_RETURN_DUE_TO_ASSERT void atomNamedBackReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); }
198 NO_RETURN_DUE_TO_ASSERT bool isValidNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); }
199 NO_RETURN_DUE_TO_ASSERT void atomNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); }
200
201 private:
202 Delegate& m_delegate;
203 ErrorCode& m_errorCode;
204 enum CharacterClassConstructionState {
205 Empty,
206 CachedCharacter,
207 CachedCharacterHyphen,
208 AfterCharacterClass,
209 AfterCharacterClassHyphen,
210 } m_state;
211 UChar32 m_character;
212 };
213
214 Parser(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit)
215 : m_delegate(delegate)
216 , m_backReferenceLimit(backReferenceLimit)
217 , m_data(pattern.characters<CharType>())
218 , m_size(pattern.length())
219 , m_isUnicode(isUnicode)
220 {
221 }
222
223 // The handling of IdentityEscapes is different depending on the unicode flag.
224 // For Unicode patterns, IdentityEscapes only include SyntaxCharacters or '/'.
225 // For non-unicode patterns, most any character can be escaped.
226 bool isIdentityEscapeAnError(int ch)
227 {
228 if (m_isUnicode && !strchr("^$\\.*+?()[]{}|/", ch)) {
229 m_errorCode = ErrorCode::InvalidIdentityEscape;
230 return true;
231 }
232
233 return false;
234 }
235
236 /*
237 * parseEscape():
238 *
239 * Helper for parseTokens() AND parseCharacterClass().
240 * Unlike the other parser methods, this function does not report tokens
241 * directly to the member delegate (m_delegate), instead tokens are
242 * emitted to the delegate provided as an argument. In the case of atom
243 * escapes, parseTokens() will call parseEscape() passing m_delegate as
244 * an argument, and as such the escape will be reported to the delegate.
245 *
246 * However this method may also be used by parseCharacterClass(), in which
247 * case a CharacterClassParserDelegate will be passed as the delegate that
248 * tokens should be added to. A boolean flag is also provided to indicate
249 * whether that an escape in a CharacterClass is being parsed (some parsing
250 * rules change in this context).
251 *
252 * The boolean value returned by this method indicates whether the token
253 * parsed was an atom (outside of a characted class \b and \B will be
254 * interpreted as assertions).
255 */
256 template<bool inCharacterClass, class EscapeDelegate>
257 bool parseEscape(EscapeDelegate& delegate)
258 {
259 ASSERT(!hasError(m_errorCode));
260 ASSERT(peek() == '\\');
261 consume();
262
263 if (atEndOfPattern()) {
264 m_errorCode = ErrorCode::EscapeUnterminated;
265 return false;
266 }
267
268 switch (peek()) {
269 // Assertions
270 case 'b':
271 consume();
272 if (inCharacterClass) {
273 if (isIdentityEscapeAnError('b'))
274 break;
275
276 delegate.atomPatternCharacter('\b');
277 } else {
278 delegate.assertionWordBoundary(false);
279 return false;
280 }
281 break;
282 case 'B':
283 consume();
284 if (inCharacterClass) {
285 if (isIdentityEscapeAnError('B'))
286 break;
287
288 delegate.atomPatternCharacter('B');
289 } else {
290 delegate.assertionWordBoundary(true);
291 return false;
292 }
293 break;
294
295 // CharacterClassEscape
296 case 'd':
297 consume();
298 delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, false);
299 break;
300 case 's':
301 consume();
302 delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, false);
303 break;
304 case 'w':
305 consume();
306 delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, false);
307 break;
308 case 'D':
309 consume();
310 delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, true);
311 break;
312 case 'S':
313 consume();
314 delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, true);
315 break;
316 case 'W':
317 consume();
318 delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, true);
319 break;
320
321 // DecimalEscape
322 case '1':
323 case '2':
324 case '3':
325 case '4':
326 case '5':
327 case '6':
328 case '7':
329 case '8':
330 case '9': {
331 // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
332 // First, try to parse this as backreference.
333 if (!inCharacterClass) {
334 ParseState state = saveState();
335
336 unsigned backReference = consumeNumber();
337 if (backReference <= m_backReferenceLimit) {
338 delegate.atomBackReference(backReference);
339 break;
340 }
341
342 restoreState(state);
343
344 if (m_isUnicode) {
345 m_errorCode = ErrorCode::InvalidBackreference;
346 return false;
347 }
348 }
349
350 // Not a backreference, and not octal. Just a number.
351 if (peek() >= '8') {
352 delegate.atomPatternCharacter(consume());
353 break;
354 }
355
356 // Fall-through to handle this as an octal escape.
357 FALLTHROUGH;
358 }
359
360 // Octal escape
361 case '0':
362 delegate.atomPatternCharacter(consumeOctal());
363 break;
364
365 // ControlEscape
366 case 'f':
367 consume();
368 delegate.atomPatternCharacter('\f');
369 break;
370 case 'n':
371 consume();
372 delegate.atomPatternCharacter('\n');
373 break;
374 case 'r':
375 consume();
376 delegate.atomPatternCharacter('\r');
377 break;
378 case 't':
379 consume();
380 delegate.atomPatternCharacter('\t');
381 break;
382 case 'v':
383 consume();
384 delegate.atomPatternCharacter('\v');
385 break;
386
387 // ControlLetter
388 case 'c': {
389 ParseState state = saveState();
390 consume();
391 if (!atEndOfPattern()) {
392 int control = consume();
393
394 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
395 if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
396 delegate.atomPatternCharacter(control & 0x1f);
397 break;
398 }
399 }
400 restoreState(state);
401 delegate.atomPatternCharacter('\\');
402 break;
403 }
404
405 // HexEscape
406 case 'x': {
407 consume();
408 int x = tryConsumeHex(2);
409 if (x == -1) {
410 if (isIdentityEscapeAnError('x'))
411 break;
412
413 delegate.atomPatternCharacter('x');
414 } else
415 delegate.atomPatternCharacter(x);
416 break;
417 }
418
419 // Named backreference
420 case 'k': {
421 consume();
422 ParseState state = saveState();
423 if (!atEndOfPattern() && !inCharacterClass) {
424 if (consume() == '<') {
425 auto groupName = tryConsumeGroupName();
426 if (groupName) {
427 if (m_captureGroupNames.contains(groupName.value())) {
428 delegate.atomNamedBackReference(groupName.value());
429 break;
430 }
431
432 if (delegate.isValidNamedForwardReference(groupName.value())) {
433 delegate.atomNamedForwardReference(groupName.value());
434 break;
435 }
436 }
437 if (m_isUnicode) {
438 m_errorCode = ErrorCode::InvalidBackreference;
439 break;
440 }
441 }
442 }
443 restoreState(state);
444 delegate.atomPatternCharacter('k');
445 break;
446 }
447
448 // Unicode property escapes
449 case 'p':
450 case 'P': {
451 int escapeChar = consume();
452
453 if (!m_isUnicode) {
454 if (isIdentityEscapeAnError(escapeChar))
455 break;
456 delegate.atomPatternCharacter(escapeChar);
457 break;
458 }
459
460 if (!atEndOfPattern() && peek() == '{') {
461 consume();
462 auto optClassID = tryConsumeUnicodePropertyExpression();
463 if (!optClassID) {
464 // tryConsumeUnicodePropertyExpression() will set m_errorCode for a malformed property expression
465 break;
466 }
467 delegate.atomBuiltInCharacterClass(optClassID.value(), escapeChar == 'P');
468 } else
469 m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
470 break;
471 }
472
473 // UnicodeEscape
474 case 'u': {
475 consume();
476 if (atEndOfPattern()) {
477 if (isIdentityEscapeAnError('u'))
478 break;
479
480 delegate.atomPatternCharacter('u');
481 break;
482 }
483
484 if (m_isUnicode && peek() == '{') {
485 consume();
486 UChar32 codePoint = 0;
487 do {
488 if (atEndOfPattern() || !isASCIIHexDigit(peek())) {
489 m_errorCode = ErrorCode::InvalidUnicodeEscape;
490 break;
491 }
492
493 codePoint = (codePoint << 4) | toASCIIHexValue(consume());
494
495 if (codePoint > UCHAR_MAX_VALUE)
496 m_errorCode = ErrorCode::InvalidUnicodeEscape;
497 } while (!atEndOfPattern() && peek() != '}');
498 if (!atEndOfPattern() && peek() == '}')
499 consume();
500 else if (!hasError(m_errorCode))
501 m_errorCode = ErrorCode::InvalidUnicodeEscape;
502 if (hasError(m_errorCode))
503 return false;
504
505 delegate.atomPatternCharacter(codePoint);
506 break;
507 }
508 int u = tryConsumeHex(4);
509 if (u == -1) {
510 if (isIdentityEscapeAnError('u'))
511 break;
512
513 delegate.atomPatternCharacter('u');
514 } else {
515 // If we have the first of a surrogate pair, look for the second.
516 if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= 6) && peek() == '\\') {
517 ParseState state = saveState();
518 consume();
519
520 if (tryConsume('u')) {
521 int surrogate2 = tryConsumeHex(4);
522 if (U16_IS_TRAIL(surrogate2)) {
523 u = U16_GET_SUPPLEMENTARY(u, surrogate2);
524 delegate.atomPatternCharacter(u);
525 break;
526 }
527 }
528
529 restoreState(state);
530 }
531 delegate.atomPatternCharacter(u);
532 }
533 break;
534 }
535
536 // IdentityEscape
537 default:
538 int ch = peek();
539
540 if (ch == '-' && m_isUnicode && inCharacterClass) {
541 // \- is allowed for ClassEscape with unicode flag.
542 delegate.atomPatternCharacter(consume());
543 break;
544 }
545
546 if (isIdentityEscapeAnError(ch))
547 break;
548
549 delegate.atomPatternCharacter(consume());
550 }
551
552 return true;
553 }
554
555 UChar32 consumePossibleSurrogatePair()
556 {
557 UChar32 ch = consume();
558 if (U16_IS_LEAD(ch) && m_isUnicode && (patternRemaining() > 0)) {
559 ParseState state = saveState();
560
561 UChar32 surrogate2 = consume();
562 if (U16_IS_TRAIL(surrogate2))
563 ch = U16_GET_SUPPLEMENTARY(ch, surrogate2);
564 else
565 restoreState(state);
566 }
567
568 return ch;
569 }
570
571 /*
572 * parseAtomEscape(), parseCharacterClassEscape():
573 *
574 * These methods alias to parseEscape().
575 */
576 bool parseAtomEscape()
577 {
578 return parseEscape<false>(m_delegate);
579 }
580 void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
581 {
582 parseEscape<true>(delegate);
583 }
584
585 /*
586 * parseCharacterClass():
587 *
588 * Helper for parseTokens(); calls directly and indirectly (via parseCharacterClassEscape)
589 * to an instance of CharacterClassParserDelegate, to describe the character class to the
590 * delegate.
591 */
592 void parseCharacterClass()
593 {
594 ASSERT(!hasError(m_errorCode));
595 ASSERT(peek() == '[');
596 consume();
597
598 CharacterClassParserDelegate characterClassConstructor(m_delegate, m_errorCode);
599
600 characterClassConstructor.begin(tryConsume('^'));
601
602 while (!atEndOfPattern()) {
603 switch (peek()) {
604 case ']':
605 consume();
606 characterClassConstructor.end();
607 return;
608
609 case '\\':
610 parseCharacterClassEscape(characterClassConstructor);
611 break;
612
613 default:
614 characterClassConstructor.atomPatternCharacter(consumePossibleSurrogatePair(), true);
615 }
616
617 if (hasError(m_errorCode))
618 return;
619 }
620
621 m_errorCode = ErrorCode::CharacterClassUnmatched;
622 }
623
624 /*
625 * parseParenthesesBegin():
626 *
627 * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
628 */
629 void parseParenthesesBegin()
630 {
631 ASSERT(!hasError(m_errorCode));
632 ASSERT(peek() == '(');
633 consume();
634
635 if (tryConsume('?')) {
636 if (atEndOfPattern()) {
637 m_errorCode = ErrorCode::ParenthesesTypeInvalid;
638 return;
639 }
640
641 switch (consume()) {
642 case ':':
643 m_delegate.atomParenthesesSubpatternBegin(false);
644 break;
645
646 case '=':
647 m_delegate.atomParentheticalAssertionBegin();
648 break;
649
650 case '!':
651 m_delegate.atomParentheticalAssertionBegin(true);
652 break;
653
654 case '<': {
655 auto groupName = tryConsumeGroupName();
656 if (groupName) {
657 auto setAddResult = m_captureGroupNames.add(groupName.value());
658 if (setAddResult.isNewEntry)
659 m_delegate.atomParenthesesSubpatternBegin(true, groupName);
660 else
661 m_errorCode = ErrorCode::DuplicateGroupName;
662 } else
663 m_errorCode = ErrorCode::InvalidGroupName;
664
665 break;
666 }
667
668 default:
669 m_errorCode = ErrorCode::ParenthesesTypeInvalid;
670 }
671 } else
672 m_delegate.atomParenthesesSubpatternBegin();
673
674 ++m_parenthesesNestingDepth;
675 }
676
677 /*
678 * parseParenthesesEnd():
679 *
680 * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
681 */
682 void parseParenthesesEnd()
683 {
684 ASSERT(!hasError(m_errorCode));
685 ASSERT(peek() == ')');
686 consume();
687
688 if (m_parenthesesNestingDepth > 0)
689 m_delegate.atomParenthesesEnd();
690 else
691 m_errorCode = ErrorCode::ParenthesesUnmatched;
692
693 --m_parenthesesNestingDepth;
694 }
695
696 /*
697 * parseQuantifier():
698 *
699 * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
700 */
701 void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
702 {
703 ASSERT(!hasError(m_errorCode));
704 ASSERT(min <= max);
705
706 if (min == UINT_MAX) {
707 m_errorCode = ErrorCode::QuantifierTooLarge;
708 return;
709 }
710
711 if (lastTokenWasAnAtom)
712 m_delegate.quantifyAtom(min, max, !tryConsume('?'));
713 else
714 m_errorCode = ErrorCode::QuantifierWithoutAtom;
715 }
716
717 /*
718 * parseTokens():
719 *
720 * This method loops over the input pattern reporting tokens to the delegate.
721 * The method returns when a parse error is detected, or the end of the pattern
722 * is reached. One piece of state is tracked around the loop, which is whether
723 * the last token passed to the delegate was an atom (this is necessary to detect
724 * a parse error when a quantifier provided without an atom to quantify).
725 */
726 void parseTokens()
727 {
728 bool lastTokenWasAnAtom = false;
729
730 while (!atEndOfPattern()) {
731 switch (peek()) {
732 case '|':
733 consume();
734 m_delegate.disjunction();
735 lastTokenWasAnAtom = false;
736 break;
737
738 case '(':
739 parseParenthesesBegin();
740 lastTokenWasAnAtom = false;
741 break;
742
743 case ')':
744 parseParenthesesEnd();
745 lastTokenWasAnAtom = true;
746 break;
747
748 case '^':
749 consume();
750 m_delegate.assertionBOL();
751 lastTokenWasAnAtom = false;
752 break;
753
754 case '$':
755 consume();
756 m_delegate.assertionEOL();
757 lastTokenWasAnAtom = false;
758 break;
759
760 case '.':
761 consume();
762 m_delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DotClassID, false);
763 lastTokenWasAnAtom = true;
764 break;
765
766 case '[':
767 parseCharacterClass();
768 lastTokenWasAnAtom = true;
769 break;
770
771 case '\\':
772 lastTokenWasAnAtom = parseAtomEscape();
773 break;
774
775 case '*':
776 consume();
777 parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
778 lastTokenWasAnAtom = false;
779 break;
780
781 case '+':
782 consume();
783 parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
784 lastTokenWasAnAtom = false;
785 break;
786
787 case '?':
788 consume();
789 parseQuantifier(lastTokenWasAnAtom, 0, 1);
790 lastTokenWasAnAtom = false;
791 break;
792
793 case '{': {
794 ParseState state = saveState();
795
796 consume();
797 if (peekIsDigit()) {
798 unsigned min = consumeNumber();
799 unsigned max = min;
800
801 if (tryConsume(','))
802 max = peekIsDigit() ? consumeNumber() : quantifyInfinite;
803
804 if (tryConsume('}')) {
805 if (min <= max)
806 parseQuantifier(lastTokenWasAnAtom, min, max);
807 else
808 m_errorCode = ErrorCode::QuantifierOutOfOrder;
809 lastTokenWasAnAtom = false;
810 break;
811 }
812 }
813
814 restoreState(state);
815 }
816 // if we did not find a complete quantifer, fall through to the default case.
817 FALLTHROUGH;
818
819 default:
820 m_delegate.atomPatternCharacter(consumePossibleSurrogatePair());
821 lastTokenWasAnAtom = true;
822 }
823
824 if (hasError(m_errorCode))
825 return;
826 }
827
828 if (m_parenthesesNestingDepth > 0)
829 m_errorCode = ErrorCode::MissingParentheses;
830 }
831
832 /*
833 * parse():
834 *
835 * This method calls parseTokens() to parse over the input and returns error code for a result.
836 */
837 ErrorCode parse()
838 {
839 if (m_size > MAX_PATTERN_SIZE)
840 m_errorCode = ErrorCode::PatternTooLarge;
841 else
842 parseTokens();
843 ASSERT(atEndOfPattern() || hasError(m_errorCode));
844
845 return m_errorCode;
846 }
847
848 // Misc helper functions:
849
850 typedef unsigned ParseState;
851
852 ParseState saveState()
853 {
854 return m_index;
855 }
856
857 void restoreState(ParseState state)
858 {
859 m_index = state;
860 }
861
862 bool atEndOfPattern()
863 {
864 ASSERT(m_index <= m_size);
865 return m_index == m_size;
866 }
867
868 unsigned patternRemaining()
869 {
870 ASSERT(m_index <= m_size);
871 return m_size - m_index;
872 }
873
874 int peek()
875 {
876 ASSERT(m_index < m_size);
877 return m_data[m_index];
878 }
879
880 bool peekIsDigit()
881 {
882 return !atEndOfPattern() && WTF::isASCIIDigit(peek());
883 }
884
885 unsigned peekDigit()
886 {
887 ASSERT(peekIsDigit());
888 return peek() - '0';
889 }
890
891 int tryConsumeUnicodeEscape()
892 {
893 if (!tryConsume('u'))
894 return -1;
895
896 if (m_isUnicode && tryConsume('{')) {
897 int codePoint = 0;
898 do {
899 if (atEndOfPattern() || !isASCIIHexDigit(peek())) {
900 m_errorCode = ErrorCode::InvalidUnicodeEscape;
901 return -1;
902 }
903
904 codePoint = (codePoint << 4) | toASCIIHexValue(consume());
905
906 if (codePoint > UCHAR_MAX_VALUE) {
907 m_errorCode = ErrorCode::InvalidUnicodeEscape;
908 return -1;
909 }
910 } while (!atEndOfPattern() && peek() != '}');
911 if (!atEndOfPattern() && peek() == '}')
912 consume();
913 else if (!hasError(m_errorCode))
914 m_errorCode = ErrorCode::InvalidUnicodeEscape;
915 if (hasError(m_errorCode))
916 return -1;
917
918 return codePoint;
919 }
920
921 int u = tryConsumeHex(4);
922 if (u == -1)
923 return -1;
924
925 // If we have the first of a surrogate pair, look for the second.
926 if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= 6) && peek() == '\\') {
927 ParseState state = saveState();
928 consume();
929
930 if (tryConsume('u')) {
931 int surrogate2 = tryConsumeHex(4);
932 if (U16_IS_TRAIL(surrogate2)) {
933 u = U16_GET_SUPPLEMENTARY(u, surrogate2);
934 return u;
935 }
936 }
937
938 restoreState(state);
939 }
940
941 return u;
942 }
943
944 int tryConsumeIdentifierCharacter()
945 {
946 int ch = peek();
947
948 if (ch == '\\') {
949 consume();
950 ch = tryConsumeUnicodeEscape();
951 } else
952 consume();
953
954 return ch;
955 }
956
957 bool isIdentifierStart(int ch)
958 {
959 return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & U_GC_L_MASK);
960 }
961
962 bool isIdentifierPart(int ch)
963 {
964 return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & (U_GC_L_MASK | U_GC_MN_MASK | U_GC_MC_MASK | U_GC_ND_MASK | U_GC_PC_MASK)) || ch == 0x200C || ch == 0x200D;
965 }
966
967 bool isUnicodePropertyValueExpressionChar(int ch)
968 {
969 return WTF::isASCIIAlphanumeric(ch) || ch == '_' || ch == '=';
970 }
971
972 int consume()
973 {
974 ASSERT(m_index < m_size);
975 return m_data[m_index++];
976 }
977
978 unsigned consumeDigit()
979 {
980 ASSERT(peekIsDigit());
981 return consume() - '0';
982 }
983
984 unsigned consumeNumber()
985 {
986 Checked<unsigned, RecordOverflow> n = consumeDigit();
987 while (peekIsDigit())
988 n = n * 10 + consumeDigit();
989 return n.hasOverflowed() ? quantifyInfinite : n.unsafeGet();
990 }
991
992 unsigned consumeOctal()
993 {
994 ASSERT(WTF::isASCIIOctalDigit(peek()));
995
996 unsigned n = consumeDigit();
997 while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
998 n = n * 8 + consumeDigit();
999 return n;
1000 }
1001
1002 bool tryConsume(UChar ch)
1003 {
1004 if (atEndOfPattern() || (m_data[m_index] != ch))
1005 return false;
1006 ++m_index;
1007 return true;
1008 }
1009
1010 int tryConsumeHex(int count)
1011 {
1012 ParseState state = saveState();
1013
1014 int n = 0;
1015 while (count--) {
1016 if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
1017 restoreState(state);
1018 return -1;
1019 }
1020 n = (n << 4) | WTF::toASCIIHexValue(consume());
1021 }
1022 return n;
1023 }
1024
1025 Optional<String> tryConsumeGroupName()
1026 {
1027 if (atEndOfPattern())
1028 return WTF::nullopt;
1029
1030 ParseState state = saveState();
1031
1032 int ch = tryConsumeIdentifierCharacter();
1033
1034 if (isIdentifierStart(ch)) {
1035 StringBuilder identifierBuilder;
1036 identifierBuilder.appendCharacter(ch);
1037
1038 while (!atEndOfPattern()) {
1039 ch = tryConsumeIdentifierCharacter();
1040 if (ch == '>')
1041 return Optional<String>(identifierBuilder.toString());
1042
1043 if (!isIdentifierPart(ch))
1044 break;
1045
1046 identifierBuilder.appendCharacter(ch);
1047 }
1048 }
1049
1050 restoreState(state);
1051
1052 return WTF::nullopt;
1053 }
1054
1055 Optional<BuiltInCharacterClassID> tryConsumeUnicodePropertyExpression()
1056 {
1057 if (atEndOfPattern() || !isUnicodePropertyValueExpressionChar(peek())) {
1058 m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
1059 return WTF::nullopt;
1060 }
1061
1062 StringBuilder expressionBuilder;
1063 String unicodePropertyName;
1064 bool foundEquals = false;
1065 unsigned errors = 0;
1066
1067 expressionBuilder.appendCharacter(consume());
1068
1069 while (!atEndOfPattern()) {
1070 int ch = peek();
1071 if (ch == '}') {
1072 consume();
1073 if (errors) {
1074 m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
1075 return WTF::nullopt;
1076 }
1077
1078 if (foundEquals) {
1079 auto result = unicodeMatchPropertyValue(unicodePropertyName, expressionBuilder.toString());
1080 if (!result)
1081 m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
1082 return result;
1083 }
1084
1085 auto result = unicodeMatchProperty(expressionBuilder.toString());
1086 if (!result)
1087 m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
1088 return result;
1089 }
1090
1091 consume();
1092 if (ch == '=') {
1093 if (!foundEquals) {
1094 foundEquals = true;
1095 unicodePropertyName = expressionBuilder.toString();
1096 expressionBuilder.clear();
1097 } else
1098 errors++;
1099 } else if (!isUnicodePropertyValueExpressionChar(ch))
1100 errors++;
1101 else
1102 expressionBuilder.appendCharacter(ch);
1103 }
1104
1105 m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
1106 return WTF::nullopt;
1107 }
1108
1109 Delegate& m_delegate;
1110 unsigned m_backReferenceLimit;
1111 ErrorCode m_errorCode { ErrorCode::NoError };
1112 const CharType* m_data;
1113 unsigned m_size;
1114 unsigned m_index { 0 };
1115 bool m_isUnicode;
1116 unsigned m_parenthesesNestingDepth { 0 };
1117 HashSet<String> m_captureGroupNames;
1118
1119 // Derived by empirical testing of compile time in PCRE and WREC.
1120 static constexpr unsigned MAX_PATTERN_SIZE = 1024 * 1024;
1121};
1122
1123/*
1124 * Yarr::parse():
1125 *
1126 * The parse method is passed a pattern to be parsed and a delegate upon which
1127 * callbacks will be made to record the parsed tokens forming the regex.
1128 * Yarr::parse() returns null on success, or a const C string providing an error
1129 * message where a parse error occurs.
1130 *
1131 * The Delegate must implement the following interface:
1132 *
1133 * void assertionBOL();
1134 * void assertionEOL();
1135 * void assertionWordBoundary(bool invert);
1136 *
1137 * void atomPatternCharacter(UChar32 ch);
1138 * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
1139 * void atomCharacterClassBegin(bool invert)
1140 * void atomCharacterClassAtom(UChar32 ch)
1141 * void atomCharacterClassRange(UChar32 begin, UChar32 end)
1142 * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
1143 * void atomCharacterClassEnd()
1144 * void atomParenthesesSubpatternBegin(bool capture = true, Optional<String> groupName);
1145 * void atomParentheticalAssertionBegin(bool invert = false);
1146 * void atomParenthesesEnd();
1147 * void atomBackReference(unsigned subpatternId);
1148 * void atomNamedBackReference(const String& subpatternName);
1149 * bool isValidNamedForwardReference(const String& subpatternName);
1150 * void atomNamedForwardReference(const String& subpatternName);
1151 *
1152 * void quantifyAtom(unsigned min, unsigned max, bool greedy);
1153 *
1154 * void disjunction();
1155 *
1156 * The regular expression is described by a sequence of assertion*() and atom*()
1157 * callbacks to the delegate, describing the terms in the regular expression.
1158 * Following an atom a quantifyAtom() call may occur to indicate that the previous
1159 * atom should be quantified. In the case of atoms described across multiple
1160 * calls (parentheses and character classes) the call to quantifyAtom() will come
1161 * after the call to the atom*End() method, never after atom*Begin().
1162 *
1163 * Character classes may either be described by a single call to
1164 * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
1165 * In the latter case, ...Begin() will be called, followed by a sequence of
1166 * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
1167 *
1168 * Sequences of atoms and assertions are broken into alternatives via calls to
1169 * disjunction(). Assertions, atoms, and disjunctions emitted between calls to
1170 * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
1171 * atomParenthesesBegin() is passed a subpatternId. In the case of a regular
1172 * capturing subpattern, this will be the subpatternId associated with these
1173 * parentheses, and will also by definition be the lowest subpatternId of these
1174 * parentheses and of any nested paretheses. The atomParenthesesEnd() method
1175 * is passed the subpatternId of the last capturing subexpression nested within
1176 * these paretheses. In the case of a capturing subpattern with no nested
1177 * capturing subpatterns, the same subpatternId will be passed to the begin and
1178 * end functions. In the case of non-capturing subpatterns the subpatternId
1179 * passed to the begin method is also the first possible subpatternId that might
1180 * be nested within these paretheses. If a set of non-capturing parentheses does
1181 * not contain any capturing subpatterns, then the subpatternId passed to begin
1182 * will be greater than the subpatternId passed to end.
1183 */
1184
1185template<class Delegate>
1186ErrorCode parse(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit = quantifyInfinite)
1187{
1188 if (pattern.is8Bit())
1189 return Parser<Delegate, LChar>(delegate, pattern, isUnicode, backReferenceLimit).parse();
1190 return Parser<Delegate, UChar>(delegate, pattern, isUnicode, backReferenceLimit).parse();
1191}
1192
1193} } // namespace JSC::Yarr
1194