YarrPattern.cpp source code [jsc/Source/JavaScriptCore/yarr/YarrPattern.cpp]

1	/*
2	* Copyright (C) 2009, 2013-2016 Apple Inc. All rights reserved.
3	* Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
4	*
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions
7	* are met:
8	* 1. Redistributions of source code must retain the above copyright
9	* notice, this list of conditions and the following disclaimer.
10	* 2. Redistributions in binary form must reproduce the above copyright
11	* notice, this list of conditions and the following disclaimer in the
12	* documentation and/or other materials provided with the distribution.
13	*
14	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25	*/
26
27	#include "config.h"
28	#include "YarrPattern.h"
29
30	#include "Options.h"
31	#include "Yarr.h"
32	#include "YarrCanonicalize.h"
33	#include "YarrParser.h"
34	#include <wtf/DataLog.h>
35	#include <wtf/Optional.h>
36	#include <wtf/StackPointer.h>
37	#include <wtf/Threading.h>
38	#include <wtf/Vector.h>
39
40	namespace JSC { namespace Yarr {
41
42	#include "RegExpJitTables.h"
43
44	class CharacterClassConstructor {
45	public:
46	CharacterClassConstructor(bool isCaseInsensitive, CanonicalMode canonicalMode)
47	: m_isCaseInsensitive(isCaseInsensitive)
48	, m_anyCharacter(false)
49	, m_characterWidths(CharacterClassWidths::Unknown)
50	, m_canonicalMode(canonicalMode)
51	{
52	}
53
54	void reset()
55	{
56	m_matches.clear();
57	m_ranges.clear();
58	m_matchesUnicode.clear();
59	m_rangesUnicode.clear();
60	m_anyCharacter = false;
61	m_characterWidths = CharacterClassWidths::Unknown;
62	}
63
64	void append(const CharacterClass* other)
65	{
66	for (size_t i = `0`; i < other->m_matches.size(); ++i)
67	addSorted(m_matches, other->m_matches [i]);
68	for (size_t i = `0`; i < other->m_ranges.size(); ++i)
69	addSortedRange(m_ranges, other->m_ranges [i].begin, other->m_ranges [i].end);
70	for (size_t i = `0`; i < other->m_matchesUnicode.size(); ++i)
71	addSorted(m_matchesUnicode, other->m_matchesUnicode [i]);
72	for (size_t i = `0`; i < other->m_rangesUnicode.size(); ++i)
73	addSortedRange(m_rangesUnicode, other->m_rangesUnicode [i].begin, other->m_rangesUnicode [i].end);
74	}
75
76	void appendInverted(const CharacterClass* other)
77	{
78	auto addSortedInverted = [&](UChar32 min, UChar32 max,
79	const Vector<UChar32>& srcMatches, const Vector<CharacterRange>& srcRanges,
80	Vector<UChar32>& destMatches, Vector<CharacterRange>& destRanges) {
81
82	auto addSortedMatchOrRange = [&](UChar32 lo, UChar32 hiPlusOne) {
83	if (lo < hiPlusOne) {
84	if (lo + `1` == hiPlusOne)
85	addSorted(destMatches, lo);
86	else
87	addSortedRange(destRanges, lo, hiPlusOne - `1`);
88	}
89	};
90
91	UChar32 lo = min;
92	size_t matchesIndex = `0`;
93	size_t rangesIndex = `0`;
94	bool matchesRemaining = matchesIndex < srcMatches.size();
95	bool rangesRemaining = rangesIndex < srcRanges.size();
96
97	if (!matchesRemaining && !rangesRemaining) {
98	addSortedMatchOrRange(min, max + `1`);
99	return;
100	}
101
102	while (matchesRemaining \|\| rangesRemaining) {
103	UChar32 hiPlusOne;
104	UChar32 nextLo;
105
106	if (matchesRemaining
107	&& (!rangesRemaining \|\| srcMatches [matchesIndex] < srcRanges [rangesIndex].begin)) {
108	hiPlusOne = srcMatches [matchesIndex];
109	nextLo = hiPlusOne + `1`;
110	++matchesIndex;
111	matchesRemaining = matchesIndex < srcMatches.size();
112	} else {
113	hiPlusOne = srcRanges [rangesIndex].begin;
114	nextLo = srcRanges [rangesIndex].end + `1`;
115	++rangesIndex;
116	rangesRemaining = rangesIndex < srcRanges.size();
117	}
118
119	addSortedMatchOrRange(lo, hiPlusOne);
120
121	lo = nextLo;
122	}
123
124	addSortedMatchOrRange(lo, max + `1`);
125	};
126
127	addSortedInverted(`0`, `0x7f`, other->m_matches, other->m_ranges, m_matches, m_ranges);
128	addSortedInverted(`0x80`, `0x10ffff`, other->m_matchesUnicode, other->m_rangesUnicode, m_matchesUnicode, m_rangesUnicode);
129	}
130
131	void putChar(UChar32 ch)
132	{
133	if (!m_isCaseInsensitive) {
134	addSorted(ch);
135	return;
136	}
137
138	if (m_canonicalMode == CanonicalMode::UCS2 && isASCII(ch)) {
139	// Handle ASCII cases.
140	if (isASCIIAlpha(ch)) {
141	addSorted(m_matches, toASCIIUpper(ch));
142	addSorted(m_matches, toASCIILower(ch));
143	} else
144	addSorted(m_matches, ch);
145	return;
146	}
147
148	// Add multiple matches, if necessary.
149	const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_canonicalMode);
150	if (info->type == CanonicalizeUnique)
151	addSorted(ch);
152	else
153	putUnicodeIgnoreCase(ch, info);
154	}
155
156	void putUnicodeIgnoreCase(UChar32 ch, const CanonicalizationRange* info)
157	{
158	ASSERT(m_isCaseInsensitive);
159	ASSERT(ch >= info->begin && ch <= info->end);
160	ASSERT(info->type != CanonicalizeUnique);
161	if (info->type == CanonicalizeSet) {
162	for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set)
163	addSorted(ch);
164	} else {
165	addSorted(ch);
166	addSorted(getCanonicalPair(info, ch));
167	}
168	}
169
170	void putRange(UChar32 lo, UChar32 hi)
171	{
172	if (isASCII(lo)) {
173	char asciiLo = lo;
174	char asciiHi = std::min(hi, (UChar32)`0x7f`);
175	addSortedRange(m_ranges, lo, asciiHi);
176
177	if (m_isCaseInsensitive) {
178	if ((asciiLo <= `'Z'`) && (asciiHi >= `'A'`))
179	addSortedRange(m_ranges, std::max(asciiLo, `'A'`)+(`'a'`-`'A'`), std::min(asciiHi, `'Z'`)+(`'a'`-`'A'`));
180	if ((asciiLo <= `'z'`) && (asciiHi >= `'a'`))
181	addSortedRange(m_ranges, std::max(asciiLo, `'a'`)+(`'A'`-`'a'`), std::min(asciiHi, `'z'`)+(`'A'`-`'a'`));
182	}
183	}
184	if (isASCII(hi))
185	return;
186
187	lo = std::max(lo, (UChar32)`0x80`);
188	addSortedRange(m_rangesUnicode, lo, hi);
189
190	if (!m_isCaseInsensitive)
191	return;
192
193	const CanonicalizationRange* info = canonicalRangeInfoFor(lo, m_canonicalMode);
194	while (true) {
195	// Handle the range [lo .. end]
196	UChar32 end = std::min<UChar32>(info->end, hi);
197
198	switch (info->type) {
199	case CanonicalizeUnique:
200	// Nothing to do - no canonical equivalents.
201	break;
202	case CanonicalizeSet: {
203	UChar ch;
204	for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set)
205	addSorted(m_matchesUnicode, ch);
206	break;
207	}
208	case CanonicalizeRangeLo:
209	addSortedRange(m_rangesUnicode, lo + info->value, end + info->value);
210	break;
211	case CanonicalizeRangeHi:
212	addSortedRange(m_rangesUnicode, lo - info->value, end - info->value);
213	break;
214	case CanonicalizeAlternatingAligned:
215	// Use addSortedRange since there is likely an abutting range to combine with.
216	if (lo & `1`)
217	addSortedRange(m_rangesUnicode, lo - `1`, lo - `1`);
218	if (!(end & `1`))
219	addSortedRange(m_rangesUnicode, end + `1`, end + `1`);
220	break;
221	case CanonicalizeAlternatingUnaligned:
222	// Use addSortedRange since there is likely an abutting range to combine with.
223	if (!(lo & `1`))
224	addSortedRange(m_rangesUnicode, lo - `1`, lo - `1`);
225	if (end & `1`)
226	addSortedRange(m_rangesUnicode, end + `1`, end + `1`);
227	break;
228	}
229
230	if (hi == end)
231	return;
232
233	++info;
234	lo = info->begin;
235	};
236
237	}
238
239	std::unique_ptr<CharacterClass> charClass()
240	{
241	coalesceTables();
242
243	auto characterClass = makeUnique<CharacterClass>();
244
245	characterClass ->m_matches.swap(m_matches);
246	characterClass ->m_ranges.swap(m_ranges);
247	characterClass ->m_matchesUnicode.swap(m_matchesUnicode);
248	characterClass ->m_rangesUnicode.swap(m_rangesUnicode);
249	characterClass ->m_anyCharacter = anyCharacter();
250	characterClass ->m_characterWidths = characterWidths();
251
252	m_anyCharacter = false;
253	m_characterWidths = CharacterClassWidths::Unknown;
254
255	return characterClass;
256	}
257
258	private:
259	void addSorted(UChar32 ch)
260	{
261	addSorted(isASCII(ch) ? m_matches : m_matchesUnicode, ch);
262	}
263
264	void addSorted(Vector<UChar32>& matches, UChar32 ch)
265	{
266	unsigned pos = `0`;
267	unsigned range = matches.size();
268
269	m_characterWidths \|= (U_IS_BMP(ch) ? CharacterClassWidths::HasBMPChars : CharacterClassWidths::HasNonBMPChars);
270
271	// binary chop, find position to insert char.
272	while (range) {
273	unsigned index = range >> `1`;
274
275	int val = matches [pos+index] - ch;
276	if (!val)
277	return;
278	else if (val > `0`) {
279	if (val == `1`) {
280	UChar32 lo = ch;
281	UChar32 hi = ch + `1`;
282	matches.remove(pos + index);
283	if (pos + index > `0` && matches [pos + index - `1`] == ch - `1`) {
284	lo = ch - `1`;
285	matches.remove(pos + index - `1`);
286	}
287	addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
288	return;
289	}
290	range = index;
291	} else {
292	if (val == -`1`) {
293	UChar32 lo = ch - `1`;
294	UChar32 hi = ch;
295	matches.remove(pos + index);
296	if (pos + index + `1` < matches.size() && matches [pos + index + `1`] == ch + `1`) {
297	hi = ch + `1`;
298	matches.remove(pos + index + `1`);
299	}
300	addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
301	return;
302	}
303	pos += (index+`1`);
304	range -= (index+`1`);
305	}
306	}
307
308	if (pos == matches.size())
309	matches.append(ch);
310	else
311	matches.insert(pos, ch);
312	}
313
314	void addSortedRange(Vector<CharacterRange>& ranges, UChar32 lo, UChar32 hi)
315	{
316	size_t end = ranges.size();
317
318	if (U_IS_BMP(lo))
319	m_characterWidths \|= CharacterClassWidths::HasBMPChars;
320	if (!U_IS_BMP(hi))
321	m_characterWidths \|= CharacterClassWidths::HasNonBMPChars;
322
323	// Simple linear scan - I doubt there are that many ranges anyway...
324	// feel free to fix this with something faster (eg binary chop).
325	for (size_t i = `0`; i < end; ++i) {
326	// does the new range fall before the current position in the array
327	if (hi < ranges [i].begin) {
328	// Concatenate appending ranges.
329	if (hi == (ranges [i].begin - `1`)) {
330	ranges [i].begin = lo;
331	return;
332	}
333	ranges.insert(i, CharacterRange (lo, hi));
334	return;
335	}
336	// Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
337	// If the new range start at or before the end of the last range, then the overlap (if it starts one after the
338	// end of the last range they concatenate, which is just as good.
339	if (lo <= (ranges [i].end + `1`)) {
340	// found an intersect! we'll replace this entry in the array.
341	ranges [i].begin = std::min(ranges [i].begin, lo);
342	ranges [i].end = std::max(ranges [i].end, hi);
343
344	mergeRangesFrom(ranges, i);
345	return;
346	}
347	}
348
349	// CharacterRange comes after all existing ranges.
350	ranges.append(CharacterRange (lo, hi));
351	}
352
353	void mergeRangesFrom(Vector<CharacterRange>& ranges, size_t index)
354	{
355	unsigned next = index + `1`;
356
357	// each iteration of the loop we will either remove something from the list, or break out of the loop.
358	while (next < ranges.size()) {
359	if (ranges [next].begin <= (ranges [index].end + `1`)) {
360	// the next entry now overlaps / concatenates with this one.
361	ranges [index].end = std::max(ranges [index].end, ranges [next].end);
362	ranges.remove(next);
363	} else
364	break;
365	}
366
367	}
368
369	void coalesceTables()
370	{
371	auto coalesceMatchesAndRanges = [&](Vector<UChar32>& matches, Vector<CharacterRange>& ranges) {
372
373	size_t matchesIndex = `0`;
374	size_t rangesIndex = `0`;
375
376	while (matchesIndex < matches.size() && rangesIndex < ranges.size()) {
377	while (matchesIndex < matches.size() && matches [matchesIndex] < ranges [rangesIndex].begin - `1`)
378	matchesIndex++;
379
380	if (matchesIndex < matches.size() && matches [matchesIndex] == ranges [rangesIndex].begin - `1`) {
381	ranges [rangesIndex].begin = matches [matchesIndex];
382	matches.remove(matchesIndex);
383	}
384
385	while (matchesIndex < matches.size() && matches [matchesIndex] < ranges [rangesIndex].end + `1`)
386	matchesIndex++;
387
388	if (matchesIndex < matches.size()) {
389	if (matches [matchesIndex] == ranges [rangesIndex].end + `1`) {
390	ranges [rangesIndex].end = matches [matchesIndex];
391	matches.remove(matchesIndex);
392
393	mergeRangesFrom(ranges, rangesIndex);
394	} else
395	matchesIndex++;
396	}
397	}
398	};
399
400	coalesceMatchesAndRanges(m_matches, m_ranges);
401	coalesceMatchesAndRanges(m_matchesUnicode, m_rangesUnicode);
402
403	if (!m_matches.size() && !m_matchesUnicode.size()
404	&& m_ranges.size() == `1` && m_rangesUnicode.size() == `1`
405	&& m_ranges [`0`].begin == `0` && m_ranges [`0`].end == `0x7f`
406	&& m_rangesUnicode [`0`].begin == `0x80` && m_rangesUnicode [`0`].end == `0x10ffff`)
407	m_anyCharacter = true;
408	}
409
410	bool hasNonBMPCharacters()
411	{
412	return m_characterWidths & CharacterClassWidths::HasNonBMPChars;
413	}
414
415	CharacterClassWidths characterWidths()
416	{
417	return m_characterWidths;
418	}
419
420	bool anyCharacter()
421	{
422	return m_anyCharacter;
423	}
424
425	bool m_isCaseInsensitive : `1`;
426	bool m_anyCharacter : `1`;
427	CharacterClassWidths m_characterWidths;
428
429	CanonicalMode m_canonicalMode;
430
431	Vector<UChar32> m_matches;
432	Vector<CharacterRange> m_ranges;
433	Vector<UChar32> m_matchesUnicode;
434	Vector<CharacterRange> m_rangesUnicode;
435	};
436
437	class YarrPatternConstructor {
438	public:
439	YarrPatternConstructor(YarrPattern& pattern, void* stackLimit)
440	: m_pattern(pattern)
441	, m_characterClassConstructor (pattern.ignoreCase(), pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2)
442	, m_stackLimit(stackLimit)
443	{
444	auto body = makeUnique<PatternDisjunction>();
445	m_pattern.m_body = body.get();
446	m_alternative = body ->addNewAlternative();
447	m_pattern.m_disjunctions.append(WTFMove(body));
448	}
449
450	~YarrPatternConstructor()
451	{
452	}
453
454	void resetForReparsing()
455	{
456	m_pattern.resetForReparsing();
457	m_characterClassConstructor.reset();
458
459	auto body = makeUnique<PatternDisjunction>();
460	m_pattern.m_body = body.get();
461	m_alternative = body ->addNewAlternative();
462	m_pattern.m_disjunctions.append(WTFMove(body));
463	}
464
465	void saveUnmatchedNamedForwardReferences()
466	{
467	m_unmatchedNamedForwardReferences.shrink(`0`);
468
469	for (auto& entry : m_pattern.m_namedForwardReferences) {
470	if (!m_pattern.m_captureGroupNames.contains(entry))
471	m_unmatchedNamedForwardReferences.append(entry);
472	}
473	}
474
475	void assertionBOL()
476	{
477	if (!m_alternative->m_terms.size() && !m_invertParentheticalAssertion) {
478	m_alternative->m_startsWithBOL = true;
479	m_alternative->m_containsBOL = true;
480	m_pattern.m_containsBOL = true;
481	}
482	m_alternative->m_terms.append(PatternTerm::BOL());
483	}
484	void assertionEOL()
485	{
486	m_alternative->m_terms.append(PatternTerm::EOL());
487	}
488	void assertionWordBoundary(bool invert)
489	{
490	m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
491	}
492
493	void atomPatternCharacter(UChar32 ch)
494	{
495	// We handle case-insensitive checking of unicode characters which do have both
496	// cases by handling them as if they were defined using a CharacterClass.
497	if (!m_pattern.ignoreCase() \|\| (isASCII(ch) && !m_pattern.unicode())) {
498	m_alternative->m_terms.append(PatternTerm (ch));
499	return;
500	}
501
502	const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2);
503	if (info->type == CanonicalizeUnique) {
504	m_alternative->m_terms.append(PatternTerm (ch));
505	return;
506	}
507
508	m_characterClassConstructor.putUnicodeIgnoreCase(ch, info);
509	auto newCharacterClass = m_characterClassConstructor.charClass();
510	m_alternative->m_terms.append(PatternTerm (newCharacterClass.get(), false));
511	m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
512	}
513
514	void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
515	{
516	switch (classID) {
517	case BuiltInCharacterClassID::DigitClassID:
518	m_alternative->m_terms.append(PatternTerm (m_pattern.digitsCharacterClass(), invert));
519	break;
520	case BuiltInCharacterClassID::SpaceClassID:
521	m_alternative->m_terms.append(PatternTerm (m_pattern.spacesCharacterClass(), invert));
522	break;
523	case BuiltInCharacterClassID::WordClassID:
524	if (m_pattern.unicode() && m_pattern.ignoreCase())
525	m_alternative->m_terms.append(PatternTerm (m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), invert));
526	else
527	m_alternative->m_terms.append(PatternTerm (m_pattern.wordcharCharacterClass(), invert));
528	break;
529	case BuiltInCharacterClassID::DotClassID:
530	ASSERT(!invert);
531	if (m_pattern.dotAll())
532	m_alternative->m_terms.append(PatternTerm (m_pattern.anyCharacterClass(), false));
533	else
534	m_alternative->m_terms.append(PatternTerm (m_pattern.newlineCharacterClass(), true));
535	break;
536	default:
537	m_alternative->m_terms.append(PatternTerm (m_pattern.unicodeCharacterClassFor(classID), invert));
538	break;
539	}
540	}
541
542	void atomCharacterClassBegin(bool invert = false)
543	{
544	m_invertCharacterClass = invert;
545	}
546
547	void atomCharacterClassAtom(UChar32 ch)
548	{
549	m_characterClassConstructor.putChar(ch);
550	}
551
552	void atomCharacterClassRange(UChar32 begin, UChar32 end)
553	{
554	m_characterClassConstructor.putRange(begin, end);
555	}
556
557	void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
558	{
559	ASSERT(classID != BuiltInCharacterClassID::DotClassID);
560
561	switch (classID) {
562	case BuiltInCharacterClassID::DigitClassID:
563	m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
564	break;
565
566	case BuiltInCharacterClassID::SpaceClassID:
567	m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
568	break;
569
570	case BuiltInCharacterClassID::WordClassID:
571	if (m_pattern.unicode() && m_pattern.ignoreCase())
572	m_characterClassConstructor.append(invert ? m_pattern.nonwordUnicodeIgnoreCaseCharCharacterClass() : m_pattern.wordUnicodeIgnoreCaseCharCharacterClass());
573	else
574	m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
575	break;
576
577	default:
578	if (!invert)
579	m_characterClassConstructor.append(m_pattern.unicodeCharacterClassFor(classID));
580	else
581	m_characterClassConstructor.appendInverted(m_pattern.unicodeCharacterClassFor(classID));
582	}
583	}
584
585	void atomCharacterClassEnd()
586	{
587	auto newCharacterClass = m_characterClassConstructor.charClass();
588
589	if (!m_invertCharacterClass && newCharacterClass.get()->m_anyCharacter) {
590	m_alternative->m_terms.append(PatternTerm (m_pattern.anyCharacterClass(), false));
591	return;
592	}
593	m_alternative->m_terms.append(PatternTerm (newCharacterClass.get(), m_invertCharacterClass));
594	m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
595	}
596
597	void atomParenthesesSubpatternBegin(bool capture = true, Optional<String> optGroupName = WTF::nullopt)
598	{
599	unsigned subpatternId = m_pattern.m_numSubpatterns + `1`;
600	if (capture) {
601	m_pattern.m_numSubpatterns++;
602	if (optGroupName) {
603	while (m_pattern.m_captureGroupNames.size() < subpatternId)
604	m_pattern.m_captureGroupNames.append(String ());
605	m_pattern.m_captureGroupNames.append(optGroupName.value());
606	m_pattern.m_namedGroupToParenIndex.add(optGroupName.value(), subpatternId);
607	}
608	} else
609	ASSERT(!optGroupName);
610
611	auto parenthesesDisjunction = makeUnique<PatternDisjunction>(m_alternative);
612	m_alternative->m_terms.append(PatternTerm (PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, false));
613	m_alternative = parenthesesDisjunction ->addNewAlternative();
614	m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction));
615	}
616
617	void atomParentheticalAssertionBegin(bool invert = false)
618	{
619	auto parenthesesDisjunction = makeUnique<PatternDisjunction>(m_alternative);
620	m_alternative->m_terms.append(PatternTerm (PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + `1`, parenthesesDisjunction.get(), false, invert));
621	m_alternative = parenthesesDisjunction ->addNewAlternative();
622	m_invertParentheticalAssertion = invert;
623	m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction));
624	}
625
626	void atomParenthesesEnd()
627	{
628	ASSERT(m_alternative->m_parent);
629	ASSERT(m_alternative->m_parent->m_parent);
630
631	PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent;
632	m_alternative = m_alternative->m_parent->m_parent;
633
634	PatternTerm& lastTerm = m_alternative->lastTerm();
635
636	unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size();
637	unsigned numBOLAnchoredAlts = `0`;
638
639	for (unsigned i = `0`; i < numParenAlternatives; i++) {
640	// Bubble up BOL flags
641	if (parenthesesDisjunction->m_alternatives [i]->m_startsWithBOL)
642	numBOLAnchoredAlts++;
643	}
644
645	if (numBOLAnchoredAlts) {
646	m_alternative->m_containsBOL = true;
647	// If all the alternatives in parens start with BOL, then so does this one
648	if (numBOLAnchoredAlts == numParenAlternatives)
649	m_alternative->m_startsWithBOL = true;
650	}
651
652	lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
653	m_invertParentheticalAssertion = false;
654	}
655
656	void atomBackReference(unsigned subpatternId)
657	{
658	ASSERT(subpatternId);
659	m_pattern.m_containsBackreferences = true;
660	m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
661
662	if (subpatternId > m_pattern.m_numSubpatterns) {
663	m_alternative->m_terms.append(PatternTerm::ForwardReference());
664	return;
665	}
666
667	PatternAlternative* currentAlternative = m_alternative;
668	ASSERT(currentAlternative);
669
670	// Note to self: if we waited until the AST was baked, we could also remove forwards refs
671	while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
672	PatternTerm& term = currentAlternative->lastTerm();
673	ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) \|\| (term.type == PatternTerm::TypeParentheticalAssertion));
674
675	if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) {
676	m_alternative->m_terms.append(PatternTerm::ForwardReference());
677	return;
678	}
679	}
680
681	m_alternative->m_terms.append(PatternTerm (subpatternId));
682	}
683
684	void atomNamedBackReference(const String& subpatternName)
685	{
686	ASSERT(m_pattern.m_namedGroupToParenIndex.find(subpatternName) != m_pattern.m_namedGroupToParenIndex.end());
687	atomBackReference(m_pattern.m_namedGroupToParenIndex.get(subpatternName));
688	}
689
690	bool isValidNamedForwardReference(const String& subpatternName)
691	{
692	return !m_unmatchedNamedForwardReferences.contains(subpatternName);
693	}
694
695	void atomNamedForwardReference(const String& subpatternName)
696	{
697	m_pattern.m_namedForwardReferences.appendIfNotContains(subpatternName);
698	m_alternative->m_terms.append(PatternTerm::ForwardReference());
699	}
700
701	// deep copy the argument disjunction. If filterStartsWithBOL is true,
702	// skip alternatives with m_startsWithBOL set true.
703	PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
704	{
705	if (UNLIKELY(!isSafeToRecurse())) {
706	m_error = ErrorCode::PatternTooLarge;
707	return `0`;
708	}
709
710	std::unique_ptr<PatternDisjunction> newDisjunction;
711	for (unsigned alt = `0`; alt < disjunction->m_alternatives.size(); ++alt) {
712	PatternAlternative* alternative = disjunction->m_alternatives [alt].get();
713	if (!filterStartsWithBOL \|\| !alternative->m_startsWithBOL) {
714	if (!newDisjunction) {
715	newDisjunction = makeUnique<PatternDisjunction>();
716	newDisjunction ->m_parent = disjunction->m_parent;
717	}
718	PatternAlternative* newAlternative = newDisjunction ->addNewAlternative();
719	newAlternative->m_terms.reserveInitialCapacity(alternative->m_terms.size());
720	for (unsigned i = `0`; i < alternative->m_terms.size(); ++i)
721	newAlternative->m_terms.append(copyTerm(alternative->m_terms [i], filterStartsWithBOL));
722	}
723	}
724
725	if (hasError(error())) {
726	newDisjunction = `0`;
727	return `0`;
728	}
729
730	if (!newDisjunction)
731	return `0`;
732
733	PatternDisjunction* copiedDisjunction = newDisjunction.get();
734	m_pattern.m_disjunctions.append(WTFMove(newDisjunction));
735	return copiedDisjunction;
736	}
737
738	PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false)
739	{
740	if (UNLIKELY(!isSafeToRecurse())) {
741	m_error = ErrorCode::PatternTooLarge;
742	return PatternTerm (term);
743	}
744
745	if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
746	return PatternTerm (term);
747
748	PatternTerm termCopy = term;
749	termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL);
750	m_pattern.m_hasCopiedParenSubexpressions = true;
751	return termCopy;
752	}
753
754	void quantifyAtom(unsigned min, unsigned max, bool greedy)
755	{
756	ASSERT(min <= max);
757	ASSERT(m_alternative->m_terms.size());
758
759	if (!max) {
760	m_alternative->removeLastTerm();
761	return;
762	}
763
764	PatternTerm& term = m_alternative->lastTerm();
765	ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
766	ASSERT(term.quantityMinCount == `1` && term.quantityMaxCount == `1` && term.quantityType == QuantifierFixedCount);
767
768	if (term.type == PatternTerm::TypeParentheticalAssertion) {
769	// If an assertion is quantified with a minimum count of zero, it can simply be removed.
770	// This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never
771	// results in any input being consumed, however the continuation passed to the assertion
772	// (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will
773	// reject all zero length matches (see step 2.1). A match from the continuation of the
774	// expression will still be accepted regardless (via steps 8a and 11) - the upshot of all
775	// this is that matches from the assertion are not required, and won't be accepted anyway,
776	// so no need to ever run it.
777	if (!min)
778	m_alternative->removeLastTerm();
779	// We never need to run an assertion more than once. Subsequent interations will be run
780	// with the same start index (since assertions are non-capturing) and the same captures
781	// (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the
782	// same result and captures. If the first match succeeds then the subsequent (min - 1)
783	// matches will too. Any additional optional matches will fail (on the same basis as the
784	// minimum zero quantified assertions, above), but this will still result in a match.
785	return;
786	}
787
788	if (min == max)
789	term.quantify(min, max, QuantifierFixedCount);
790	else if (!min \|\| (term.type == PatternTerm::TypeParenthesesSubpattern && m_pattern.m_hasCopiedParenSubexpressions))
791	term.quantify(min, max, greedy ? QuantifierGreedy : QuantifierNonGreedy);
792	else {
793	term.quantify(min, min, QuantifierFixedCount);
794	m_alternative->m_terms.append(copyTerm(term));
795	// NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
796	m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
797	if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
798	m_alternative->lastTerm().parentheses.isCopy = true;
799	}
800	}
801
802	void disjunction()
803	{
804	m_alternative = m_alternative->m_parent->addNewAlternative();
805	}
806
807	ErrorCode setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition, unsigned& newCallFrameSize) WARN_UNUSED_RETURN
808	{
809	if (UNLIKELY(!isSafeToRecurse()))
810	return ErrorCode::TooManyDisjunctions;
811
812	ErrorCode error = ErrorCode::NoError;
813	alternative->m_hasFixedSize = true;
814	Checked<unsigned, RecordOverflow> currentInputPosition = initialInputPosition;
815
816	for (unsigned i = `0`; i < alternative->m_terms.size(); ++i) {
817	PatternTerm& term = alternative->m_terms [i];
818
819	switch (term.type) {
820	case PatternTerm::TypeAssertionBOL:
821	case PatternTerm::TypeAssertionEOL:
822	case PatternTerm::TypeAssertionWordBoundary:
823	term.inputPosition = currentInputPosition.unsafeGet();
824	break;
825
826	case PatternTerm::TypeBackReference:
827	term.inputPosition = currentInputPosition.unsafeGet();
828	term.frameLocation = currentCallFrameSize;
829	currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference;
830	alternative->m_hasFixedSize = false;
831	break;
832
833	case PatternTerm::TypeForwardReference:
834	break;
835
836	case PatternTerm::TypePatternCharacter:
837	term.inputPosition = currentInputPosition.unsafeGet();
838	if (term.quantityType != QuantifierFixedCount) {
839	term.frameLocation = currentCallFrameSize;
840	currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
841	alternative->m_hasFixedSize = false;
842	} else if (m_pattern.unicode()) {
843	Checked<unsigned, RecordOverflow> tempCount = term.quantityMaxCount;
844	tempCount *= U16_LENGTH(term.patternCharacter);
845	if (tempCount.hasOverflowed())
846	return ErrorCode::OffsetTooLarge;
847	currentInputPosition += tempCount;
848	} else
849	currentInputPosition += term.quantityMaxCount;
850	break;
851
852	case PatternTerm::TypeCharacterClass:
853	term.inputPosition = currentInputPosition.unsafeGet();
854	if (term.quantityType != QuantifierFixedCount) {
855	term.frameLocation = currentCallFrameSize;
856	currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
857	alternative->m_hasFixedSize = false;
858	} else if (m_pattern.unicode()) {
859	term.frameLocation = currentCallFrameSize;
860	currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
861	if (term.characterClass->hasOneCharacterSize() && !term.invert()) {
862	Checked<unsigned, RecordOverflow> tempCount = term.quantityMaxCount;
863	tempCount *= term.characterClass->hasNonBMPCharacters() ? `2` : `1`;
864	if (tempCount.hasOverflowed())
865	return ErrorCode::OffsetTooLarge;
866	currentInputPosition += tempCount;
867	} else {
868	currentInputPosition += term.quantityMaxCount;
869	alternative->m_hasFixedSize = false;
870	}
871	} else
872	currentInputPosition += term.quantityMaxCount;
873	break;
874
875	case PatternTerm::TypeParenthesesSubpattern:
876	// Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
877	term.frameLocation = currentCallFrameSize;
878	if (term.quantityMaxCount == `1` && !term.parentheses.isCopy) {
879	currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
880	error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
881	if (hasError(error))
882	return error;
883	// If quantity is fixed, then pre-check its minimum size.
884	if (term.quantityType == QuantifierFixedCount)
885	currentInputPosition += term.parentheses.disjunction->m_minimumSize;
886	term.inputPosition = currentInputPosition.unsafeGet();
887	} else if (term.parentheses.isTerminal) {
888	currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
889	error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
890	if (hasError(error))
891	return error;
892	term.inputPosition = currentInputPosition.unsafeGet();
893	} else {
894	term.inputPosition = currentInputPosition.unsafeGet();
895	currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
896	error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
897	if (hasError(error))
898	return error;
899	}
// Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
901	alternative->m_hasFixedSize = false;
902	break;
903
904	case PatternTerm::TypeParentheticalAssertion:
905	term.inputPosition = currentInputPosition.unsafeGet();
906	term.frameLocation = currentCallFrameSize;
907	error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet(), currentCallFrameSize);
908	if (hasError(error))
909	return error;
910	break;
911
912	case PatternTerm::TypeDotStarEnclosure:
913	ASSERT(!m_pattern.m_saveInitialStartValue);
914	alternative->m_hasFixedSize = false;
915	term.inputPosition = initialInputPosition;
916	m_pattern.m_initialStartValueFrameLocation = currentCallFrameSize;
917	currentCallFrameSize += YarrStackSpaceForDotStarEnclosure;
918	m_pattern.m_saveInitialStartValue = true;
919	break;
920	}
921	if (currentInputPosition.hasOverflowed())
922	return ErrorCode::OffsetTooLarge;
923	}
924
925	alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet();
926	newCallFrameSize = currentCallFrameSize;
927	return error;
928	}
929
930	ErrorCode setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition, unsigned& callFrameSize)
931	{
932	if (UNLIKELY(!isSafeToRecurse()))
933	return ErrorCode::TooManyDisjunctions;
934
935	if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > `1`))
936	initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative;
937
938	unsigned minimumInputSize = UINT_MAX;
939	unsigned maximumCallFrameSize = `0`;
940	bool hasFixedSize = true;
941	ErrorCode error = ErrorCode::NoError;
942
943	for (unsigned alt = `0`; alt < disjunction->m_alternatives.size(); ++alt) {
944	PatternAlternative* alternative = disjunction->m_alternatives [alt].get();
945	unsigned currentAlternativeCallFrameSize;
946	error = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition, currentAlternativeCallFrameSize);
947	if (hasError(error))
948	return error;
949	minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize);
950	maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize);
951	hasFixedSize &= alternative->m_hasFixedSize;
952	if (alternative->m_minimumSize > INT_MAX)
953	m_pattern.m_containsUnsignedLengthPattern = true;
954	}
955
956	ASSERT(minimumInputSize != UINT_MAX);
957	ASSERT(maximumCallFrameSize >= initialCallFrameSize);
958
959	disjunction->m_hasFixedSize = hasFixedSize;
960	disjunction->m_minimumSize = minimumInputSize;
961	disjunction->m_callFrameSize = maximumCallFrameSize;
962	callFrameSize = maximumCallFrameSize;
963	return error;
964	}
965
966	ErrorCode setupOffsets()
967	{
968	// FIXME: Yarr should not use the stack to handle subpatterns (rdar://problem/26436314).
969	unsigned ignoredCallFrameSize;
970	return setupDisjunctionOffsets(m_pattern.m_body, `0`, `0`, ignoredCallFrameSize);
971	}
972
// This optimization identifies sets of parentheses that we will never need to backtrack.
// In these cases we do not need to store state from prior iterations.
// We can presently avoid backtracking for:
//   * where the parens are at the end of the regular expression (last term in any of the
//     alternatives of the main body disjunction).
//   * where the parens are non-capturing, and quantified unbounded greedy (*).
//   * where the parens do not contain any capturing subpatterns.
980	void checkForTerminalParentheses()
981	{
982	// This check is much too crude; should be just checking whether the candidate
983	// node contains nested capturing subpatterns, not the whole expression!
984	if (m_pattern.m_numSubpatterns)
985	return;
986
987	Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
988	for (size_t i = `0`; i < alternatives.size(); ++i) {
989	Vector<PatternTerm>& terms = alternatives [i]->m_terms;
990	if (terms.size()) {
991	PatternTerm& term = terms.last();
992	if (term.type == PatternTerm::TypeParenthesesSubpattern
993	&& term.quantityType == QuantifierGreedy
994	&& term.quantityMinCount == `0`
995	&& term.quantityMaxCount == quantifyInfinite
996	&& !term.capture())
997	term.parentheses.isTerminal = true;
998	}
999	}
1000	}
1001
1002	void optimizeBOL()
1003	{
1004	// Look for expressions containing beginning of line (^) anchoring and unroll them.
1005	// e.g. /^a\|^b\|c/ becomes /^a\|^b\|c/ which is executed once followed by /c/ which loops
1006	// This code relies on the parsing code tagging alternatives with m_containsBOL and
1007	// m_startsWithBOL and rolling those up to containing alternatives.
1008	// At this point, this is only valid for non-multiline expressions.
1009	PatternDisjunction* disjunction = m_pattern.m_body;
1010
1011	if (!m_pattern.m_containsBOL \|\| m_pattern.multiline())
1012	return;
1013
1014	PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true);
1015
1016	// Set alternatives in disjunction to "onceThrough"
1017	for (unsigned alt = `0`; alt < disjunction->m_alternatives.size(); ++alt)
1018	disjunction->m_alternatives [alt]->setOnceThrough();
1019
1020	if (loopDisjunction) {
1021	// Move alternatives from loopDisjunction to disjunction
1022	for (unsigned alt = `0`; alt < loopDisjunction->m_alternatives.size(); ++alt)
1023	disjunction->m_alternatives.append(loopDisjunction->m_alternatives [alt].release());
1024
1025	loopDisjunction->m_alternatives.clear();
1026	}
1027	}
1028
1029	bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t endIndex)
1030	{
1031	Vector<PatternTerm>& terms = alternative->m_terms;
1032
1033	ASSERT(endIndex <= terms.size());
1034	for (size_t termIndex = firstTermIndex; termIndex < endIndex; ++termIndex) {
1035	PatternTerm& term = terms [termIndex];
1036
1037	if (term.m_capture)
1038	return true;
1039
1040	if (term.type == PatternTerm::TypeParenthesesSubpattern) {
1041	PatternDisjunction* nestedDisjunction = term.parentheses.disjunction;
1042	for (unsigned alt = `0`; alt < nestedDisjunction->m_alternatives.size(); ++alt) {
1043	if (containsCapturingTerms(nestedDisjunction->m_alternatives [alt].get(), `0`, nestedDisjunction->m_alternatives [alt]->m_terms.size()))
1044	return true;
1045	}
1046	}
1047	}
1048
1049	return false;
1050	}
1051
// This optimization identifies alternatives in the form of
// [^].*[?]<expression>.*[$] for expressions that don't have any
// capturing terms. The alternative is changed to <expression>
// followed by processing of the dot stars to find and adjust the
// beginning and the end of the match.
1057	void optimizeDotStarWrappedExpressions()
1058	{
1059	Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
1060	if (alternatives.size() != `1`)
1061	return;
1062
1063	CharacterClass* dotCharacterClass = m_pattern.dotAll() ? m_pattern.anyCharacterClass() : m_pattern.newlineCharacterClass();
1064	PatternAlternative* alternative = alternatives [`0`].get();
1065	Vector<PatternTerm>& terms = alternative->m_terms;
1066	if (terms.size() >= `3`) {
1067	bool startsWithBOL = false;
1068	bool endsWithEOL = false;
1069	size_t termIndex, firstExpressionTerm;
1070
1071	termIndex = `0`;
1072	if (terms [termIndex].type == PatternTerm::TypeAssertionBOL) {
1073	startsWithBOL = true;
1074	++termIndex;
1075	}
1076
1077	PatternTerm& firstNonAnchorTerm = terms [termIndex];
1078	if (firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass
1079	\|\| firstNonAnchorTerm.characterClass != dotCharacterClass
1080	\|\| firstNonAnchorTerm.quantityMinCount
1081	\|\| firstNonAnchorTerm.quantityMaxCount != quantifyInfinite)
1082	return;
1083
1084	firstExpressionTerm = termIndex + `1`;
1085
1086	termIndex = terms.size() - `1`;
1087	if (terms [termIndex].type == PatternTerm::TypeAssertionEOL) {
1088	endsWithEOL = true;
1089	--termIndex;
1090	}
1091
1092	PatternTerm& lastNonAnchorTerm = terms [termIndex];
1093	if (lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass
1094	\|\| lastNonAnchorTerm.characterClass != dotCharacterClass
1095	\|\| lastNonAnchorTerm.quantityType != QuantifierGreedy
1096	\|\| lastNonAnchorTerm.quantityMinCount
1097	\|\| lastNonAnchorTerm.quantityMaxCount != quantifyInfinite)
1098	return;
1099
1100	size_t endIndex = termIndex;
1101	if (firstExpressionTerm >= endIndex)
1102	return;
1103
1104	if (!containsCapturingTerms(alternative, firstExpressionTerm, endIndex)) {
1105	for (termIndex = terms.size() - `1`; termIndex >= endIndex; --termIndex)
1106	terms.remove(termIndex);
1107
1108	for (termIndex = firstExpressionTerm; termIndex > `0`; --termIndex)
1109	terms.remove(termIndex - `1`);
1110
1111	terms.append(PatternTerm (startsWithBOL, endsWithEOL));
1112
1113	m_pattern.m_containsBOL = false;
1114	}
1115	}
1116	}
1117
1118	ErrorCode error() { return m_error; }
1119
1120	private:
1121	bool isSafeToRecurse() const
1122	{
1123	if (!m_stackLimit)
1124	return true;
1125	int8_t* curr = reinterpret_cast<int8_t*>(currentStackPointer());
1126	int8_t* limit = reinterpret_cast<int8_t*>(m_stackLimit);
1127	return curr >= limit;
1128	}
1129
1130	YarrPattern& m_pattern;
1131	PatternAlternative* m_alternative;
1132	CharacterClassConstructor m_characterClassConstructor;
1133	Vector<String> m_unmatchedNamedForwardReferences;
1134	void* m_stackLimit;
1135	ErrorCode m_error { ErrorCode::NoError };
1136	bool m_invertCharacterClass;
1137	bool m_invertParentheticalAssertion { false };
1138	};
1139
1140	ErrorCode YarrPattern::compile(const String& patternString, void* stackLimit)
1141	{
1142	YarrPatternConstructor constructor(*this, stackLimit);
1143
1144	{
1145	ErrorCode error = parse(constructor, patternString, unicode());
1146	if (hasError(error))
1147	return error;
1148	}
1149
1150	// If the pattern contains illegal backreferences reset & reparse.
1151	// Quoting Netscape's "What's new in JavaScript 1.2",
1152	// "Note: if the number of left parentheses is less than the number specified
1153	// in \#, the \# is taken as an octal escape as described in the next row."
1154	if (containsIllegalBackReference() \|\| containsIllegalNamedForwardReferences()) {
1155	if (unicode())
1156	return ErrorCode::InvalidBackreference;
1157
1158	unsigned numSubpatterns = m_numSubpatterns;
1159
1160	constructor.saveUnmatchedNamedForwardReferences();
1161	constructor.resetForReparsing();
1162	ErrorCode error = parse(constructor, patternString, unicode(), numSubpatterns);
1163	ASSERT_UNUSED(error, !hasError(error));
1164	ASSERT(numSubpatterns == m_numSubpatterns);
1165	}
1166
1167	constructor.checkForTerminalParentheses();
1168	constructor.optimizeDotStarWrappedExpressions();
1169	constructor.optimizeBOL();
1170
1171	if (hasError(constructor.error()))
1172	return constructor.error();
1173
1174	{
1175	ErrorCode error = constructor.setupOffsets();
1176	if (hasError(error))
1177	return error;
1178	}
1179
1180	if (Options::dumpCompiledRegExpPatterns())
1181	dumpPattern(patternString);
1182
1183	return ErrorCode::NoError;
1184	}
1185
1186	YarrPattern::YarrPattern(const String& pattern, OptionSet<Flags> flags, ErrorCode& error, void* stackLimit)
1187	: m_containsBackreferences(false)
1188	, m_containsBOL(false)
1189	, m_containsUnsignedLengthPattern(false)
1190	, m_hasCopiedParenSubexpressions(false)
1191	, m_saveInitialStartValue(false)
1192	, m_flags (flags)
1193	{
1194	ASSERT(m_flags != Flags::DeletedValue);
1195	error = compile(pattern, stackLimit);
1196	}
1197
1198	void indentForNestingLevel(PrintStream& out, unsigned nestingDepth)
1199	{
1200	out.print(" ");
1201	for (; nestingDepth; --nestingDepth)
1202	out.print(" ");
1203	}
1204
1205	void dumpUChar32(PrintStream& out, UChar32 c)
1206	{
1207	if (c >= `' '`&& c <= `0xff`)
1208	out.printf("'%c'", static_cast<char>(c));
1209	else
1210	out.printf("0x%04x", c);
1211	}
1212
1213	void dumpCharacterClass(PrintStream& out, YarrPattern* pattern, CharacterClass* characterClass)
1214	{
1215	if (characterClass == pattern->anyCharacterClass())
1216	out.print("<any character>");
1217	else if (characterClass == pattern->newlineCharacterClass())
1218	out.print("<newline>");
1219	else if (characterClass == pattern->digitsCharacterClass())
1220	out.print("<digits>");
1221	else if (characterClass == pattern->spacesCharacterClass())
1222	out.print("<whitespace>");
1223	else if (characterClass == pattern->wordcharCharacterClass())
1224	out.print("<word>");
1225	else if (characterClass == pattern->wordUnicodeIgnoreCaseCharCharacterClass())
1226	out.print("<unicode word ignore case>");
1227	else if (characterClass == pattern->nondigitsCharacterClass())
1228	out.print("<non-digits>");
1229	else if (characterClass == pattern->nonspacesCharacterClass())
1230	out.print("<non-whitespace>");
1231	else if (characterClass == pattern->nonwordcharCharacterClass())
1232	out.print("<non-word>");
1233	else if (characterClass == pattern->nonwordUnicodeIgnoreCaseCharCharacterClass())
1234	out.print("<unicode non-word ignore case>");
1235	else {
1236	bool needMatchesRangesSeperator = false;
1237
1238	auto dumpMatches = [&] (const char* prefix, Vector<UChar32> matches) {
1239	size_t matchesSize = matches.size();
1240	if (matchesSize) {
1241	if (needMatchesRangesSeperator)
1242	out.print(",");
1243	needMatchesRangesSeperator = true;
1244
1245	out.print(prefix, ":(");
1246	for (size_t i = `0`; i < matchesSize; ++i) {
1247	if (i)
1248	out.print(",");
1249	dumpUChar32(out, matches [i]);
1250	}
1251	out.print(")");
1252	}
1253	};
1254
1255	auto dumpRanges = [&] (const char* prefix, Vector<CharacterRange> ranges) {
1256	size_t rangeSize = ranges.size();
1257	if (rangeSize) {
1258	if (needMatchesRangesSeperator)
1259	out.print(",");
1260	needMatchesRangesSeperator = true;
1261
1262	out.print(prefix, " ranges:(");
1263	for (size_t i = `0`; i < rangeSize; ++i) {
1264	if (i)
1265	out.print(",");
1266	CharacterRange range = ranges [i];
1267	out.print("(");
1268	dumpUChar32(out, range.begin);
1269	out.print("..");
1270	dumpUChar32(out, range.end);
1271	out.print(")");
1272	}
1273	out.print(")");
1274	}
1275	};
1276
1277	out.print("[");
1278	dumpMatches ("ASCII", characterClass->m_matches);
1279	dumpRanges ("ASCII", characterClass->m_ranges);
1280	dumpMatches ("Unicode", characterClass->m_matchesUnicode);
1281	dumpRanges ("Unicode", characterClass->m_rangesUnicode);
1282	out.print("]");
1283	}
1284	}
1285
1286	void PatternAlternative::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth)
1287	{
1288	out.print("minimum size: ", m_minimumSize);
1289	if (m_hasFixedSize)
1290	out.print(",fixed size");
1291	if (m_onceThrough)
1292	out.print(",once through");
1293	if (m_startsWithBOL)
1294	out.print(",starts with ^");
1295	if (m_containsBOL)
1296	out.print(",contains ^");
1297	out.print("\n");
1298
1299	for (size_t i = `0`; i < m_terms.size(); ++i)
1300	m_terms [i].dump(out, thisPattern, nestingDepth);
1301	}
1302
1303	void PatternTerm::dumpQuantifier(PrintStream& out)
1304	{
1305	if (quantityType == QuantifierFixedCount && quantityMinCount == `1` && quantityMaxCount == `1`)
1306	return;
1307	out.print(" {", quantityMinCount.unsafeGet());
1308	if (quantityMinCount != quantityMaxCount) {
1309	if (quantityMaxCount == UINT_MAX)
1310	out.print(",...");
1311	else
1312	out.print(",", quantityMaxCount.unsafeGet());
1313	}
1314	out.print("}");
1315	if (quantityType == QuantifierGreedy)
1316	out.print(" greedy");
1317	else if (quantityType == QuantifierNonGreedy)
1318	out.print(" non-greedy");
1319	}
1320
1321	void PatternTerm::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth)
1322	{
1323	indentForNestingLevel(out, nestingDepth);
1324
1325	if (type != TypeParenthesesSubpattern && type != TypeParentheticalAssertion) {
1326	if (invert())
1327	out.print("not ");
1328	}
1329
1330	switch (type) {
1331	case TypeAssertionBOL:
1332	out.println("BOL");
1333	break;
1334	case TypeAssertionEOL:
1335	out.println("EOL");
1336	break;
1337	case TypeAssertionWordBoundary:
1338	out.println("word boundary");
1339	break;
1340	case TypePatternCharacter:
1341	out.printf("character ");
1342	out.printf("inputPosition %u ", inputPosition);
1343	if (thisPattern->ignoreCase() && isASCIIAlpha(patternCharacter)) {
1344	dumpUChar32(out, toASCIIUpper(patternCharacter));
1345	out.print("/");
1346	dumpUChar32(out, toASCIILower(patternCharacter));
1347	} else
1348	dumpUChar32(out, patternCharacter);
1349	dumpQuantifier(out);
1350	if (quantityType != QuantifierFixedCount)
1351	out.print(",frame location ", frameLocation);
1352	out.println();
1353	break;
1354	case TypeCharacterClass:
1355	out.print("character class ");
1356	out.printf("inputPosition %u ", inputPosition);
1357	dumpCharacterClass(out, thisPattern, characterClass);
1358	dumpQuantifier(out);
1359	if (quantityType != QuantifierFixedCount \|\| thisPattern->unicode())
1360	out.print(",frame location ", frameLocation);
1361	out.println();
1362	break;
1363	case TypeBackReference:
1364	out.print("back reference to subpattern #", backReferenceSubpatternId);
1365	out.println(",frame location ", frameLocation);
1366	break;
1367	case TypeForwardReference:
1368	out.println("forward reference");
1369	break;
1370	case TypeParenthesesSubpattern:
1371	if (m_capture)
1372	out.print("captured ");
1373	else
1374	out.print("non-captured ");
1375
1376	FALLTHROUGH;
1377	case TypeParentheticalAssertion:
1378	if (m_invert)
1379	out.print("inverted ");
1380
1381	if (type == TypeParenthesesSubpattern)
1382	out.print("subpattern");
1383	else if (type == TypeParentheticalAssertion)
1384	out.print("assertion");
1385
1386	if (m_capture)
1387	out.print(" #", parentheses.subpatternId);
1388
1389	dumpQuantifier(out);
1390
1391	if (parentheses.isCopy)
1392	out.print(",copy");
1393
1394	if (parentheses.isTerminal)
1395	out.print(",terminal");
1396
1397	out.println(",frame location ", frameLocation);
1398
1399	if (parentheses.disjunction->m_alternatives.size() > `1`) {
1400	indentForNestingLevel(out, nestingDepth + `1`);
1401	unsigned alternativeFrameLocation = frameLocation;
1402	if (quantityMaxCount == `1` && !parentheses.isCopy)
1403	alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
1404	else if (parentheses.isTerminal)
1405	alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
1406	else
1407	alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParentheses;
1408	out.println("alternative list,frame location ", alternativeFrameLocation);
1409	}
1410
1411	parentheses.disjunction->dump(out, thisPattern, nestingDepth + `1`);
1412	break;
1413	case TypeDotStarEnclosure:
1414	out.println(".* enclosure,frame location ", thisPattern->m_initialStartValueFrameLocation);
1415	break;
1416	}
1417	}
1418
1419	void PatternDisjunction::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth = `0`)
1420	{
1421	unsigned alternativeCount = m_alternatives.size();
1422	for (unsigned i = `0`; i < alternativeCount; ++i) {
1423	indentForNestingLevel(out, nestingDepth);
1424	if (alternativeCount > `1`)
1425	out.print("alternative #", i, ": ");
1426	m_alternatives [i].get()->dump(out, thisPattern, nestingDepth + (alternativeCount > `1`));
1427	}
1428	}
1429
1430	void YarrPattern::dumpPatternString(PrintStream& out, const String& patternString)
1431	{
1432	out.print("/", patternString, "/");
1433
1434	if (global())
1435	out.print("g");
1436	if (ignoreCase())
1437	out.print("i");
1438	if (multiline())
1439	out.print("m");
1440	if (unicode())
1441	out.print("u");
1442	if (sticky())
1443	out.print("y");
1444	}
1445
1446	void YarrPattern::dumpPattern(const String& patternString)
1447	{
1448	dumpPattern(WTF::dataFile(), patternString);
1449	}
1450
1451	void YarrPattern::dumpPattern(PrintStream& out, const String& patternString)
1452	{
1453	out.print("RegExp pattern for ");
1454	dumpPatternString(out, patternString);
1455
1456	if (m_flags) {
1457	bool printSeparator = false;
1458	out.print(" (");
1459	if (global()) {
1460	out.print("global");
1461	printSeparator = true;
1462	}
1463	if (ignoreCase()) {
1464	if (printSeparator)
1465	out.print("\|");
1466	out.print("ignore case");
1467	printSeparator = true;
1468	}
1469	if (multiline()) {
1470	if (printSeparator)
1471	out.print("\|");
1472	out.print("multiline");
1473	printSeparator = true;
1474	}
1475	if (unicode()) {
1476	if (printSeparator)
1477	out.print("\|");
1478	out.print("unicode");
1479	printSeparator = true;
1480	}
1481	if (sticky()) {
1482	if (printSeparator)
1483	out.print("\|");
1484	out.print("sticky");
1485	}
1486	out.print(")");
1487	}
1488	out.print(":\n");
1489	if (m_body->m_callFrameSize)
1490	out.print(" callframe size: ", m_body->m_callFrameSize, "\n");
1491	m_body->dump(out, this);
1492	}
1493
1494	std::unique_ptr<CharacterClass> anycharCreate()
1495	{
1496	auto characterClass = makeUnique<CharacterClass>();
1497	characterClass ->m_ranges.append(CharacterRange (`0x00`, `0x7f`));
1498	characterClass ->m_rangesUnicode.append(CharacterRange (`0x0080`, `0x10ffff`));
1499	characterClass ->m_characterWidths = CharacterClassWidths::HasBothBMPAndNonBMP;
1500	characterClass ->m_anyCharacter = true;
1501	return characterClass;
1502	}
1503
1504	} } // namespace JSC::Yarr
1505

Browse the source code of jsc/Source/JavaScriptCore/yarr/YarrPattern.cpp