YarrParser.h source code [jsc/Source/JavaScriptCore/yarr/YarrParser.h]

1	/*
2	* Copyright (C) 2009-2019 Apple Inc. All rights reserved.
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions
6	* are met:
7	* 1. Redistributions of source code must retain the above copyright
8	* notice, this list of conditions and the following disclaimer.
9	* 2. Redistributions in binary form must reproduce the above copyright
10	* notice, this list of conditions and the following disclaimer in the
11	* documentation and/or other materials provided with the distribution.
12	*
13	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24	*/
25
26	#pragma once
27
28	#include "Yarr.h"
29	#include "YarrPattern.h"
30	#include "YarrUnicodeProperties.h"
31	#include <wtf/ASCIICType.h>
32	#include <wtf/HashSet.h>
33	#include <wtf/Optional.h>
34	#include <wtf/text/StringBuilder.h>
35	#include <wtf/text/WTFString.h>
36
37	namespace JSC { namespace Yarr {
38
39	// The Parser class should not be used directly - only via the Yarr::parse() method.
40	template<class Delegate, typename CharType>
41	class Parser {
42	private:
43	template<class FriendDelegate>
44	friend ErrorCode parse(FriendDelegate&, const String& pattern, bool isUnicode, unsigned backReferenceLimit);
45
46	/*
47	* CharacterClassParserDelegate:
48	*
49	* The class CharacterClassParserDelegate is used in the parsing of character
50	* classes. This class handles detection of character ranges. This class
51	* implements enough of the delegate interface such that it can be passed to
52	* parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused
53	* to perform the parsing of escape characters in character sets.
54	*/
55	class CharacterClassParserDelegate {
56	public:
57	CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
58	: m_delegate(delegate)
59	, m_errorCode(err)
60	, m_state(Empty)
61	, m_character(`0`)
62	{
63	}
64
65	/*
66	* begin():
67	*
68	* Called at beginning of construction.
69	*/
70	void begin(bool invert)
71	{
72	m_delegate.atomCharacterClassBegin(invert);
73	}
74
75	/*
76	* atomPatternCharacter():
77	*
78	* This method is called either from parseCharacterClass() (for an unescaped
79	* character in a character class), or from parseEscape(). In the former case
80	* the value true will be passed for the argument 'hyphenIsRange', and in this
81	* mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
82	* is different to /[a\-z]/).
83	*/
84	void atomPatternCharacter(UChar32 ch, bool hyphenIsRange = false)
85	{
86	switch (m_state) {
87	case AfterCharacterClass:
88	// Following a builtin character class we need look out for a hyphen.
89	// We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
90	// If we see a hyphen following a charater class then unlike usual
91	// we'll report it to the delegate immediately, and put ourself into
92	// a poisoned state. Any following calls to add another character or
93	// character class will result in an error. (A hypen following a
94	// character-class is itself valid, but only at the end of a regex).
95	if (hyphenIsRange && ch == `'-'`) {
96	m_delegate.atomCharacterClassAtom(`'-'`);
97	m_state = AfterCharacterClassHyphen;
98	return;
99	}
100	// Otherwise just fall through - cached character so treat this as Empty.
101	FALLTHROUGH;
102
103	case Empty:
104	m_character = ch;
105	m_state = CachedCharacter;
106	return;
107
108	case CachedCharacter:
109	if (hyphenIsRange && ch == `'-'`)
110	m_state = CachedCharacterHyphen;
111	else {
112	m_delegate.atomCharacterClassAtom(m_character);
113	m_character = ch;
114	}
115	return;
116
117	case CachedCharacterHyphen:
118	if (ch < m_character) {
119	m_errorCode = ErrorCode::CharacterClassOutOfOrder;
120	return;
121	}
122	m_delegate.atomCharacterClassRange(m_character, ch);
123	m_state = Empty;
124	return;
125
126	// See coment in atomBuiltInCharacterClass below.
127	// This too is technically an error, per ECMA-262, and again we
128	// we chose to allow this. Note a subtlely here that while we
129	// diverge from the spec's definition of CharacterRange we do
130	// remain in compliance with the grammar. For example, consider
131	// the expression /[\d-a-z]/. We comply with the grammar in
132	// this case by not allowing a-z to be matched as a range.
133	case AfterCharacterClassHyphen:
134	m_delegate.atomCharacterClassAtom(ch);
135	m_state = Empty;
136	return;
137	}
138	}
139
140	/*
141	* atomBuiltInCharacterClass():
142	*
143	* Adds a built-in character class, called by parseEscape().
144	*/
145	void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
146	{
147	switch (m_state) {
148	case CachedCharacter:
149	// Flush the currently cached character, then fall through.
150	m_delegate.atomCharacterClassAtom(m_character);
151	FALLTHROUGH;
152	case Empty:
153	case AfterCharacterClass:
154	m_state = AfterCharacterClass;
155	m_delegate.atomCharacterClassBuiltIn(classID, invert);
156	return;
157
158	// If we hit either of these cases, we have an invalid range that
159	// looks something like /[x-\d]/ or /[\d-\d]/.
160	// According to ECMA-262 this should be a syntax error, but
161	// empirical testing shows this to break teh webz. Instead we
162	// comply with to the ECMA-262 grammar, and assume the grammar to
163	// have matched the range correctly, but tweak our interpretation
164	// of CharacterRange. Effectively we implicitly handle the hyphen
165	// as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
166	case CachedCharacterHyphen:
167	m_delegate.atomCharacterClassAtom(m_character);
168	m_delegate.atomCharacterClassAtom(`'-'`);
169	FALLTHROUGH;
170	case AfterCharacterClassHyphen:
171	m_delegate.atomCharacterClassBuiltIn(classID, invert);
172	m_state = Empty;
173	return;
174	}
175	}
176
177	/*
178	* end():
179	*
180	* Called at end of construction.
181	*/
182	void end()
183	{
184	if (m_state == CachedCharacter)
185	m_delegate.atomCharacterClassAtom(m_character);
186	else if (m_state == CachedCharacterHyphen) {
187	m_delegate.atomCharacterClassAtom(m_character);
188	m_delegate.atomCharacterClassAtom(`'-'`);
189	}
190	m_delegate.atomCharacterClassEnd();
191	}
192
193	// parseEscape() should never call these delegate methods when
194	// invoked with inCharacterClass set.
195	NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); }
196	NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); }
197	NO_RETURN_DUE_TO_ASSERT void atomNamedBackReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); }
198	NO_RETURN_DUE_TO_ASSERT bool isValidNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); }
199	NO_RETURN_DUE_TO_ASSERT void atomNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); }
200
201	private:
202	Delegate& m_delegate;
203	ErrorCode& m_errorCode;
204	enum CharacterClassConstructionState {
205	Empty,
206	CachedCharacter,
207	CachedCharacterHyphen,
208	AfterCharacterClass,
209	AfterCharacterClassHyphen,
210	} m_state;
211	UChar32 m_character;
212	};
213
214	Parser(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit)
215	: m_delegate(delegate)
216	, m_backReferenceLimit(backReferenceLimit)
217	, m_data(pattern.characters<CharType>())
218	, m_size(pattern.length())
219	, m_isUnicode(isUnicode)
220	{
221	}
222
223	// The handling of IdentityEscapes is different depending on the unicode flag.
224	// For Unicode patterns, IdentityEscapes only include SyntaxCharacters or '/'.
225	// For non-unicode patterns, most any character can be escaped.
226	bool isIdentityEscapeAnError(int ch)
227	{
228	if (m_isUnicode && !strchr("^$\\.*+?()[]{}\|/", ch)) {
229	m_errorCode = ErrorCode::InvalidIdentityEscape;
230	return true;
231	}
232
233	return false;
234	}
235
236	/*
237	* parseEscape():
238	*
239	* Helper for parseTokens() AND parseCharacterClass().
240	* Unlike the other parser methods, this function does not report tokens
241	* directly to the member delegate (m_delegate), instead tokens are
242	* emitted to the delegate provided as an argument. In the case of atom
243	* escapes, parseTokens() will call parseEscape() passing m_delegate as
244	* an argument, and as such the escape will be reported to the delegate.
245	*
246	* However this method may also be used by parseCharacterClass(), in which
247	* case a CharacterClassParserDelegate will be passed as the delegate that
248	* tokens should be added to. A boolean flag is also provided to indicate
249	* whether that an escape in a CharacterClass is being parsed (some parsing
250	* rules change in this context).
251	*
252	* The boolean value returned by this method indicates whether the token
253	* parsed was an atom (outside of a characted class \b and \B will be
254	* interpreted as assertions).
255	*/
256	template<bool inCharacterClass, class EscapeDelegate>
257	bool parseEscape(EscapeDelegate& delegate)
258	{
259	ASSERT(!hasError(m_errorCode));
260	ASSERT(peek() == `'\\'`);
261	consume();
262
263	if (atEndOfPattern()) {
264	m_errorCode = ErrorCode::EscapeUnterminated;
265	return false;
266	}
267
268	switch (peek()) {
269	// Assertions
270	case `'b'`:
271	consume();
272	if (inCharacterClass) {
273	if (isIdentityEscapeAnError(`'b'`))
274	break;
275
276	delegate.atomPatternCharacter(`'\b'`);
277	} else {
278	delegate.assertionWordBoundary(false);
279	return false;
280	}
281	break;
282	case `'B'`:
283	consume();
284	if (inCharacterClass) {
285	if (isIdentityEscapeAnError(`'B'`))
286	break;
287
288	delegate.atomPatternCharacter(`'B'`);
289	} else {
290	delegate.assertionWordBoundary(true);
291	return false;
292	}
293	break;
294
295	// CharacterClassEscape
296	case `'d'`:
297	consume();
298	delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, false);
299	break;
300	case `'s'`:
301	consume();
302	delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, false);
303	break;
304	case `'w'`:
305	consume();
306	delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, false);
307	break;
308	case `'D'`:
309	consume();
310	delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, true);
311	break;
312	case `'S'`:
313	consume();
314	delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, true);
315	break;
316	case `'W'`:
317	consume();
318	delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, true);
319	break;
320
321	// DecimalEscape
322	case `'1'`:
323	case `'2'`:
324	case `'3'`:
325	case `'4'`:
326	case `'5'`:
327	case `'6'`:
328	case `'7'`:
329	case `'8'`:
330	case `'9'`: {
331	// To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
332	// First, try to parse this as backreference.
333	if (!inCharacterClass) {
334	ParseState state = saveState();
335
336	unsigned backReference = consumeNumber();
337	if (backReference <= m_backReferenceLimit) {
338	delegate.atomBackReference(backReference);
339	break;
340	}
341
342	restoreState(state);
343
344	if (m_isUnicode) {
345	m_errorCode = ErrorCode::InvalidBackreference;
346	return false;
347	}
348	}
349
350	// Not a backreference, and not octal. Just a number.
351	if (peek() >= `'8'`) {
352	delegate.atomPatternCharacter(consume());
353	break;
354	}
355
356	// Fall-through to handle this as an octal escape.
357	FALLTHROUGH;
358	}
359
360	// Octal escape
361	case `'0'`:
362	delegate.atomPatternCharacter(consumeOctal());
363	break;
364
365	// ControlEscape
366	case `'f'`:
367	consume();
368	delegate.atomPatternCharacter(`'\f'`);
369	break;
370	case `'n'`:
371	consume();
372	delegate.atomPatternCharacter(`'\n'`);
373	break;
374	case `'r'`:
375	consume();
376	delegate.atomPatternCharacter(`'\r'`);
377	break;
378	case `'t'`:
379	consume();
380	delegate.atomPatternCharacter(`'\t'`);
381	break;
382	case `'v'`:
383	consume();
384	delegate.atomPatternCharacter(`'\v'`);
385	break;
386
387	// ControlLetter
388	case `'c'`: {
389	ParseState state = saveState();
390	consume();
391	if (!atEndOfPattern()) {
392	int control = consume();
393
394	// To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
395	if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) \|\| (control == `'_'`) : WTF::isASCIIAlpha(control)) {
396	delegate.atomPatternCharacter(control & `0x1f`);
397	break;
398	}
399	}
400	restoreState(state);
401	delegate.atomPatternCharacter(`'\\'`);
402	break;
403	}
404
405	// HexEscape
406	case `'x'`: {
407	consume();
408	int x = tryConsumeHex(`2`);
409	if (x == -`1`) {
410	if (isIdentityEscapeAnError(`'x'`))
411	break;
412
413	delegate.atomPatternCharacter(`'x'`);
414	} else
415	delegate.atomPatternCharacter(x);
416	break;
417	}
418
419	// Named backreference
420	case `'k'`: {
421	consume();
422	ParseState state = saveState();
423	if (!atEndOfPattern() && !inCharacterClass) {
424	if (consume() == `'<'`) {
425	auto groupName = tryConsumeGroupName();
426	if (groupName) {
427	if (m_captureGroupNames.contains(groupName.value())) {
428	delegate.atomNamedBackReference(groupName.value());
429	break;
430	}
431
432	if (delegate.isValidNamedForwardReference(groupName.value())) {
433	delegate.atomNamedForwardReference(groupName.value());
434	break;
435	}
436	}
437	if (m_isUnicode) {
438	m_errorCode = ErrorCode::InvalidBackreference;
439	break;
440	}
441	}
442	}
443	restoreState(state);
444	delegate.atomPatternCharacter(`'k'`);
445	break;
446	}
447
448	// Unicode property escapes
449	case `'p'`:
450	case `'P'`: {
451	int escapeChar = consume();
452
453	if (!m_isUnicode) {
454	if (isIdentityEscapeAnError(escapeChar))
455	break;
456	delegate.atomPatternCharacter(escapeChar);
457	break;
458	}
459
460	if (!atEndOfPattern() && peek() == `'{'`) {
461	consume();
462	auto optClassID = tryConsumeUnicodePropertyExpression();
463	if (!optClassID) {
464	// tryConsumeUnicodePropertyExpression() will set m_errorCode for a malformed property expression
465	break;
466	}
467	delegate.atomBuiltInCharacterClass(optClassID.value(), escapeChar == `'P'`);
468	} else
469	m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
470	break;
471	}
472
473	// UnicodeEscape
474	case `'u'`: {
475	consume();
476	if (atEndOfPattern()) {
477	if (isIdentityEscapeAnError(`'u'`))
478	break;
479
480	delegate.atomPatternCharacter(`'u'`);
481	break;
482	}
483
484	if (m_isUnicode && peek() == `'{'`) {
485	consume();
486	UChar32 codePoint = `0`;
487	do {
488	if (atEndOfPattern() \|\| !isASCIIHexDigit(peek())) {
489	m_errorCode = ErrorCode::InvalidUnicodeEscape;
490	break;
491	}
492
493	codePoint = (codePoint << `4`) \| toASCIIHexValue(consume());
494
495	if (codePoint > UCHAR_MAX_VALUE)
496	m_errorCode = ErrorCode::InvalidUnicodeEscape;
497	} while (!atEndOfPattern() && peek() != `'}'`);
498	if (!atEndOfPattern() && peek() == `'}'`)
499	consume();
500	else if (!hasError(m_errorCode))
501	m_errorCode = ErrorCode::InvalidUnicodeEscape;
502	if (hasError(m_errorCode))
503	return false;
504
505	delegate.atomPatternCharacter(codePoint);
506	break;
507	}
508	int u = tryConsumeHex(`4`);
509	if (u == -`1`) {
510	if (isIdentityEscapeAnError(`'u'`))
511	break;
512
513	delegate.atomPatternCharacter(`'u'`);
514	} else {
515	// If we have the first of a surrogate pair, look for the second.
516	if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= `6`) && peek() == `'\\'`) {
517	ParseState state = saveState();
518	consume();
519
520	if (tryConsume(`'u'`)) {
521	int surrogate2 = tryConsumeHex(`4`);
522	if (U16_IS_TRAIL(surrogate2)) {
523	u = U16_GET_SUPPLEMENTARY(u, surrogate2);
524	delegate.atomPatternCharacter(u);
525	break;
526	}
527	}
528
529	restoreState(state);
530	}
531	delegate.atomPatternCharacter(u);
532	}
533	break;
534	}
535
536	// IdentityEscape
537	default:
538	int ch = peek();
539
540	if (ch == `'-'` && m_isUnicode && inCharacterClass) {
541	// \- is allowed for ClassEscape with unicode flag.
542	delegate.atomPatternCharacter(consume());
543	break;
544	}
545
546	if (isIdentityEscapeAnError(ch))
547	break;
548
549	delegate.atomPatternCharacter(consume());
550	}
551
552	return true;
553	}
554
555	UChar32 consumePossibleSurrogatePair()
556	{
557	UChar32 ch = consume();
558	if (U16_IS_LEAD(ch) && m_isUnicode && (patternRemaining() > `0`)) {
559	ParseState state = saveState();
560
561	UChar32 surrogate2 = consume();
562	if (U16_IS_TRAIL(surrogate2))
563	ch = U16_GET_SUPPLEMENTARY(ch, surrogate2);
564	else
565	restoreState(state);
566	}
567
568	return ch;
569	}
570
571	/*
572	* parseAtomEscape(), parseCharacterClassEscape():
573	*
574	* These methods alias to parseEscape().
575	*/
576	bool parseAtomEscape()
577	{
578	return parseEscape<false>(m_delegate);
579	}
580	void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
581	{
582	parseEscape<true>(delegate);
583	}
584
585	/*
586	* parseCharacterClass():
587	*
588	* Helper for parseTokens(); calls directly and indirectly (via parseCharacterClassEscape)
589	* to an instance of CharacterClassParserDelegate, to describe the character class to the
590	* delegate.
591	*/
592	void parseCharacterClass()
593	{
594	ASSERT(!hasError(m_errorCode));
595	ASSERT(peek() == `'['`);
596	consume();
597
598	CharacterClassParserDelegate characterClassConstructor(m_delegate, m_errorCode);
599
600	characterClassConstructor.begin(tryConsume(`'^'`));
601
602	while (!atEndOfPattern()) {
603	switch (peek()) {
604	case `']'`:
605	consume();
606	characterClassConstructor.end();
607	return;
608
609	case `'\\'`:
610	parseCharacterClassEscape(characterClassConstructor);
611	break;
612
613	default:
614	characterClassConstructor.atomPatternCharacter(consumePossibleSurrogatePair(), true);
615	}
616
617	if (hasError(m_errorCode))
618	return;
619	}
620
621	m_errorCode = ErrorCode::CharacterClassUnmatched;
622	}
623
624	/*
625	* parseParenthesesBegin():
626	*
627	* Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
628	*/
629	void parseParenthesesBegin()
630	{
631	ASSERT(!hasError(m_errorCode));
632	ASSERT(peek() == `'('`);
633	consume();
634
635	if (tryConsume(`'?'`)) {
636	if (atEndOfPattern()) {
637	m_errorCode = ErrorCode::ParenthesesTypeInvalid;
638	return;
639	}
640
641	switch (consume()) {
642	case `':'`:
643	m_delegate.atomParenthesesSubpatternBegin(false);
644	break;
645
646	case `'='`:
647	m_delegate.atomParentheticalAssertionBegin();
648	break;
649
650	case `'!'`:
651	m_delegate.atomParentheticalAssertionBegin(true);
652	break;
653
654	case `'<'`: {
655	auto groupName = tryConsumeGroupName();
656	if (groupName) {
657	auto setAddResult = m_captureGroupNames.add(groupName.value());
658	if (setAddResult.isNewEntry)
659	m_delegate.atomParenthesesSubpatternBegin(true, groupName);
660	else
661	m_errorCode = ErrorCode::DuplicateGroupName;
662	} else
663	m_errorCode = ErrorCode::InvalidGroupName;
664
665	break;
666	}
667
668	default:
669	m_errorCode = ErrorCode::ParenthesesTypeInvalid;
670	}
671	} else
672	m_delegate.atomParenthesesSubpatternBegin();
673
674	++m_parenthesesNestingDepth;
675	}
676
677	/*
678	* parseParenthesesEnd():
679	*
680	* Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
681	*/
682	void parseParenthesesEnd()
683	{
684	ASSERT(!hasError(m_errorCode));
685	ASSERT(peek() == `')'`);
686	consume();
687
688	if (m_parenthesesNestingDepth > `0`)
689	m_delegate.atomParenthesesEnd();
690	else
691	m_errorCode = ErrorCode::ParenthesesUnmatched;
692
693	--m_parenthesesNestingDepth;
694	}
695
696	/*
697	* parseQuantifier():
698	*
699	* Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
700	*/
701	void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
702	{
703	ASSERT(!hasError(m_errorCode));
704	ASSERT(min <= max);
705
706	if (min == UINT_MAX) {
707	m_errorCode = ErrorCode::QuantifierTooLarge;
708	return;
709	}
710
711	if (lastTokenWasAnAtom)
712	m_delegate.quantifyAtom(min, max, !tryConsume(`'?'`));
713	else
714	m_errorCode = ErrorCode::QuantifierWithoutAtom;
715	}
716
717	/*
718	* parseTokens():
719	*
720	* This method loops over the input pattern reporting tokens to the delegate.
721	* The method returns when a parse error is detected, or the end of the pattern
722	* is reached. One piece of state is tracked around the loop, which is whether
723	* the last token passed to the delegate was an atom (this is necessary to detect
724	* a parse error when a quantifier provided without an atom to quantify).
725	*/
726	void parseTokens()
727	{
728	bool lastTokenWasAnAtom = false;
729
730	while (!atEndOfPattern()) {
731	switch (peek()) {
732	case `'\|'`:
733	consume();
734	m_delegate.disjunction();
735	lastTokenWasAnAtom = false;
736	break;
737
738	case `'('`:
739	parseParenthesesBegin();
740	lastTokenWasAnAtom = false;
741	break;
742
743	case `')'`:
744	parseParenthesesEnd();
745	lastTokenWasAnAtom = true;
746	break;
747
748	case `'^'`:
749	consume();
750	m_delegate.assertionBOL();
751	lastTokenWasAnAtom = false;
752	break;
753
754	case `'$'`:
755	consume();
756	m_delegate.assertionEOL();
757	lastTokenWasAnAtom = false;
758	break;
759
760	case `'.'`:
761	consume();
762	m_delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DotClassID, false);
763	lastTokenWasAnAtom = true;
764	break;
765
766	case `'['`:
767	parseCharacterClass();
768	lastTokenWasAnAtom = true;
769	break;
770
771	case `'\\'`:
772	lastTokenWasAnAtom = parseAtomEscape();
773	break;
774
775	case `'*'`:
776	consume();
777	parseQuantifier(lastTokenWasAnAtom, `0`, quantifyInfinite);
778	lastTokenWasAnAtom = false;
779	break;
780
781	case `'+'`:
782	consume();
783	parseQuantifier(lastTokenWasAnAtom, `1`, quantifyInfinite);
784	lastTokenWasAnAtom = false;
785	break;
786
787	case `'?'`:
788	consume();
789	parseQuantifier(lastTokenWasAnAtom, `0`, `1`);
790	lastTokenWasAnAtom = false;
791	break;
792
793	case `'{'`: {
794	ParseState state = saveState();
795
796	consume();
797	if (peekIsDigit()) {
798	unsigned min = consumeNumber();
799	unsigned max = min;
800
801	if (tryConsume(`','`))
802	max = peekIsDigit() ? consumeNumber() : quantifyInfinite;
803
804	if (tryConsume(`'}'`)) {
805	if (min <= max)
806	parseQuantifier(lastTokenWasAnAtom, min, max);
807	else
808	m_errorCode = ErrorCode::QuantifierOutOfOrder;
809	lastTokenWasAnAtom = false;
810	break;
811	}
812	}
813
814	restoreState(state);
815	}
816	// if we did not find a complete quantifer, fall through to the default case.
817	FALLTHROUGH;
818
819	default:
820	m_delegate.atomPatternCharacter(consumePossibleSurrogatePair());
821	lastTokenWasAnAtom = true;
822	}
823
824	if (hasError(m_errorCode))
825	return;
826	}
827
828	if (m_parenthesesNestingDepth > `0`)
829	m_errorCode = ErrorCode::MissingParentheses;
830	}
831
832	/*
833	* parse():
834	*
835	* This method calls parseTokens() to parse over the input and returns error code for a result.
836	*/
837	ErrorCode parse()
838	{
839	if (m_size > MAX_PATTERN_SIZE)
840	m_errorCode = ErrorCode::PatternTooLarge;
841	else
842	parseTokens();
843	ASSERT(atEndOfPattern() \|\| hasError(m_errorCode));
844
845	return m_errorCode;
846	}
847
848	// Misc helper functions:
849
850	typedef unsigned ParseState;
851
852	ParseState saveState()
853	{
854	return m_index;
855	}
856
857	void restoreState(ParseState state)
858	{
859	m_index = state;
860	}
861
862	bool atEndOfPattern()
863	{
864	ASSERT(m_index <= m_size);
865	return m_index == m_size;
866	}
867
868	unsigned patternRemaining()
869	{
870	ASSERT(m_index <= m_size);
871	return m_size - m_index;
872	}
873
874	int peek()
875	{
876	ASSERT(m_index < m_size);
877	return m_data[m_index];
878	}
879
880	bool peekIsDigit()
881	{
882	return !atEndOfPattern() && WTF::isASCIIDigit(peek());
883	}
884
885	unsigned peekDigit()
886	{
887	ASSERT(peekIsDigit());
888	return peek() - `'0'`;
889	}
890
891	int tryConsumeUnicodeEscape()
892	{
893	if (!tryConsume(`'u'`))
894	return -`1`;
895
896	if (m_isUnicode && tryConsume(`'{'`)) {
897	int codePoint = `0`;
898	do {
899	if (atEndOfPattern() \|\| !isASCIIHexDigit(peek())) {
900	m_errorCode = ErrorCode::InvalidUnicodeEscape;
901	return -`1`;
902	}
903
904	codePoint = (codePoint << `4`) \| toASCIIHexValue(consume());
905
906	if (codePoint > UCHAR_MAX_VALUE) {
907	m_errorCode = ErrorCode::InvalidUnicodeEscape;
908	return -`1`;
909	}
910	} while (!atEndOfPattern() && peek() != `'}'`);
911	if (!atEndOfPattern() && peek() == `'}'`)
912	consume();
913	else if (!hasError(m_errorCode))
914	m_errorCode = ErrorCode::InvalidUnicodeEscape;
915	if (hasError(m_errorCode))
916	return -`1`;
917
918	return codePoint;
919	}
920
921	int u = tryConsumeHex(`4`);
922	if (u == -`1`)
923	return -`1`;
924
925	// If we have the first of a surrogate pair, look for the second.
926	if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= `6`) && peek() == `'\\'`) {
927	ParseState state = saveState();
928	consume();
929
930	if (tryConsume(`'u'`)) {
931	int surrogate2 = tryConsumeHex(`4`);
932	if (U16_IS_TRAIL(surrogate2)) {
933	u = U16_GET_SUPPLEMENTARY(u, surrogate2);
934	return u;
935	}
936	}
937
938	restoreState(state);
939	}
940
941	return u;
942	}
943
944	int tryConsumeIdentifierCharacter()
945	{
946	int ch = peek();
947
948	if (ch == `'\\'`) {
949	consume();
950	ch = tryConsumeUnicodeEscape();
951	} else
952	consume();
953
954	return ch;
955	}
956
957	bool isIdentifierStart(int ch)
958	{
959	return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) \|\| ch == `'_'` \|\| ch == `'$'`)) \|\| (U_GET_GC_MASK(ch) & U_GC_L_MASK);
960	}
961
962	bool isIdentifierPart(int ch)
963	{
964	return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) \|\| ch == `'_'` \|\| ch == `'$'`)) \|\| (U_GET_GC_MASK(ch) & (U_GC_L_MASK \| U_GC_MN_MASK \| U_GC_MC_MASK \| U_GC_ND_MASK \| U_GC_PC_MASK)) \|\| ch == `0x200C` \|\| ch == `0x200D`;
965	}
966
967	bool isUnicodePropertyValueExpressionChar(int ch)
968	{
969	return WTF::isASCIIAlphanumeric(ch) \|\| ch == `'_'` \|\| ch == `'='`;
970	}
971
972	int consume()
973	{
974	ASSERT(m_index < m_size);
975	return m_data[m_index++];
976	}
977
978	unsigned consumeDigit()
979	{
980	ASSERT(peekIsDigit());
981	return consume() - `'0'`;
982	}
983
984	unsigned consumeNumber()
985	{
986	Checked<unsigned, RecordOverflow> n = consumeDigit();
987	while (peekIsDigit())
988	n = n * `10` + consumeDigit();
989	return n.hasOverflowed() ? quantifyInfinite : n.unsafeGet();
990	}
991
992	unsigned consumeOctal()
993	{
994	ASSERT(WTF::isASCIIOctalDigit(peek()));
995
996	unsigned n = consumeDigit();
997	while (n < `32` && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
998	n = n * `8` + consumeDigit();
999	return n;
1000	}
1001
1002	bool tryConsume(UChar ch)
1003	{
1004	if (atEndOfPattern() \|\| (m_data[m_index] != ch))
1005	return false;
1006	++m_index;
1007	return true;
1008	}
1009
1010	int tryConsumeHex(int count)
1011	{
1012	ParseState state = saveState();
1013
1014	int n = `0`;
1015	while (count--) {
1016	if (atEndOfPattern() \|\| !WTF::isASCIIHexDigit(peek())) {
1017	restoreState(state);
1018	return -`1`;
1019	}
1020	n = (n << `4`) \| WTF::toASCIIHexValue(consume());
1021	}
1022	return n;
1023	}
1024
1025	Optional<String> tryConsumeGroupName()
1026	{
1027	if (atEndOfPattern())
1028	return WTF::nullopt;
1029
1030	ParseState state = saveState();
1031
1032	int ch = tryConsumeIdentifierCharacter();
1033
1034	if (isIdentifierStart(ch)) {
1035	StringBuilder identifierBuilder;
1036	identifierBuilder.appendCharacter(ch);
1037
1038	while (!atEndOfPattern()) {
1039	ch = tryConsumeIdentifierCharacter();
1040	if (ch == `'>'`)
1041	return Optional<String>(identifierBuilder.toString());
1042
1043	if (!isIdentifierPart(ch))
1044	break;
1045
1046	identifierBuilder.appendCharacter(ch);
1047	}
1048	}
1049
1050	restoreState(state);
1051
1052	return WTF::nullopt;
1053	}
1054
1055	Optional<BuiltInCharacterClassID> tryConsumeUnicodePropertyExpression()
1056	{
1057	if (atEndOfPattern() \|\| !isUnicodePropertyValueExpressionChar(peek())) {
1058	m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
1059	return WTF::nullopt;
1060	}
1061
1062	StringBuilder expressionBuilder;
1063	String unicodePropertyName;
1064	bool foundEquals = false;
1065	unsigned errors = `0`;
1066
1067	expressionBuilder.appendCharacter(consume());
1068
1069	while (!atEndOfPattern()) {
1070	int ch = peek();
1071	if (ch == `'}'`) {
1072	consume();
1073	if (errors) {
1074	m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
1075	return WTF::nullopt;
1076	}
1077
1078	if (foundEquals) {
1079	auto result = unicodeMatchPropertyValue(unicodePropertyName, expressionBuilder.toString());
1080	if (!result)
1081	m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
1082	return result;
1083	}
1084
1085	auto result = unicodeMatchProperty(expressionBuilder.toString());
1086	if (!result)
1087	m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
1088	return result;
1089	}
1090
1091	consume();
1092	if (ch == `'='`) {
1093	if (!foundEquals) {
1094	foundEquals = true;
1095	unicodePropertyName = expressionBuilder.toString();
1096	expressionBuilder.clear();
1097	} else
1098	errors++;
1099	} else if (!isUnicodePropertyValueExpressionChar(ch))
1100	errors++;
1101	else
1102	expressionBuilder.appendCharacter(ch);
1103	}
1104
1105	m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
1106	return WTF::nullopt;
1107	}
1108
1109	Delegate& m_delegate;
1110	unsigned m_backReferenceLimit;
1111	ErrorCode m_errorCode { ErrorCode::NoError };
1112	const CharType* m_data;
1113	unsigned m_size;
1114	unsigned m_index { `0` };
1115	bool m_isUnicode;
1116	unsigned m_parenthesesNestingDepth { `0` };
1117	HashSet<String> m_captureGroupNames;
1118
1119	// Derived by empirical testing of compile time in PCRE and WREC.
1120	static constexpr unsigned MAX_PATTERN_SIZE = `1024` * `1024`;
1121	};
1122
1123	/*
1124	* Yarr::parse():
1125	*
1126	* The parse method is passed a pattern to be parsed and a delegate upon which
1127	* callbacks will be made to record the parsed tokens forming the regex.
1128	* Yarr::parse() returns null on success, or a const C string providing an error
1129	* message where a parse error occurs.
1130	*
1131	* The Delegate must implement the following interface:
1132	*
1133	* void assertionBOL();
1134	* void assertionEOL();
1135	* void assertionWordBoundary(bool invert);
1136	*
1137	* void atomPatternCharacter(UChar32 ch);
1138	* void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
1139	* void atomCharacterClassBegin(bool invert)
1140	* void atomCharacterClassAtom(UChar32 ch)
1141	* void atomCharacterClassRange(UChar32 begin, UChar32 end)
1142	* void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
1143	* void atomCharacterClassEnd()
1144	* void atomParenthesesSubpatternBegin(bool capture = true, Optional<String> groupName);
1145	* void atomParentheticalAssertionBegin(bool invert = false);
1146	* void atomParenthesesEnd();
1147	* void atomBackReference(unsigned subpatternId);
1148	* void atomNamedBackReference(const String& subpatternName);
1149	* bool isValidNamedForwardReference(const String& subpatternName);
1150	* void atomNamedForwardReference(const String& subpatternName);
1151	*
1152	* void quantifyAtom(unsigned min, unsigned max, bool greedy);
1153	*
1154	* void disjunction();
1155	*
1156	* The regular expression is described by a sequence of assertion() and atom()
1157	* callbacks to the delegate, describing the terms in the regular expression.
1158	* Following an atom a quantifyAtom() call may occur to indicate that the previous
1159	* atom should be quantified. In the case of atoms described across multiple
1160	* calls (parentheses and character classes) the call to quantifyAtom() will come
1161	* after the call to the atomEnd() method, never after atomBegin().
1162	*
1163	* Character classes may either be described by a single call to
1164	* atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
1165	* In the latter case, ...Begin() will be called, followed by a sequence of
1166	* calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
1167	*
1168	* Sequences of atoms and assertions are broken into alternatives via calls to
1169	* disjunction(). Assertions, atoms, and disjunctions emitted between calls to
1170	* atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
1171	* atomParenthesesBegin() is passed a subpatternId. In the case of a regular
1172	* capturing subpattern, this will be the subpatternId associated with these
1173	* parentheses, and will also by definition be the lowest subpatternId of these
1174	* parentheses and of any nested paretheses. The atomParenthesesEnd() method
1175	* is passed the subpatternId of the last capturing subexpression nested within
1176	* these paretheses. In the case of a capturing subpattern with no nested
1177	* capturing subpatterns, the same subpatternId will be passed to the begin and
1178	* end functions. In the case of non-capturing subpatterns the subpatternId
1179	* passed to the begin method is also the first possible subpatternId that might
1180	* be nested within these paretheses. If a set of non-capturing parentheses does
1181	* not contain any capturing subpatterns, then the subpatternId passed to begin
1182	* will be greater than the subpatternId passed to end.
1183	*/
1184
1185	template<class Delegate>
1186	ErrorCode parse(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit = quantifyInfinite)
1187	{
1188	if (pattern.is8Bit())
1189	return Parser<Delegate, LChar>(delegate, pattern, isUnicode, backReferenceLimit).parse();
1190	return Parser<Delegate, UChar>(delegate, pattern, isUnicode, backReferenceLimit).parse();
1191	}
1192
1193	} } // namespace JSC::Yarr
1194

Browse the source code of jsc/Source/JavaScriptCore/yarr/YarrParser.h