YarrPattern.cpp source code [webcore/Source/JavaScriptCore/yarr/YarrPattern.cpp]

1	/*
2	* Copyright (C) 2009, 2013-2016 Apple Inc. All rights reserved.
3	* Copyright (C) 2010 Peter Varga ([email protected]), University of Szeged
4	*
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions
7	* are met:
8	* 1. Redistributions of source code must retain the above copyright
9	* notice, this list of conditions and the following disclaimer.
10	* 2. Redistributions in binary form must reproduce the above copyright
11	* notice, this list of conditions and the following disclaimer in the
12	* documentation and/or other materials provided with the distribution.
13	*
14	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25	*/
26
27	#include "config.h"
28	#include "YarrPattern.h"
29
30	#include "Options.h"
31	#include "Yarr.h"
32	#include "YarrCanonicalize.h"
33	#include "YarrParser.h"
34	#include <wtf/DataLog.h>
35	#include <wtf/Optional.h>
36	#include <wtf/StackPointer.h>
37	#include <wtf/Threading.h>
38	#include <wtf/Vector.h>
39
40	namespace JSC { namespace Yarr {
41
42	#include "RegExpJitTables.h"
43
44	class CharacterClassConstructor {
45	public:
46	CharacterClassConstructor(bool isCaseInsensitive, CanonicalMode canonicalMode)
47	: m_isCaseInsensitive(isCaseInsensitive)
48	, m_anyCharacter(false)
49	, m_characterWidths(CharacterClassWidths::Unknown)
50	, m_canonicalMode(canonicalMode)
51	{
52	}
53
54	void reset()
55	{
56	m_matches.clear();
57	m_ranges.clear();
58	m_matchesUnicode.clear();
59	m_rangesUnicode.clear();
60	m_anyCharacter = false;
61	m_characterWidths = CharacterClassWidths::Unknown;
62	}
63
64	void append(const CharacterClass* other)
65	{
66	for (size_t i = `0`; i < other->m_matches.size(); ++i)
67	addSorted(m_matches, other->m_matches [i]);
68	for (size_t i = `0`; i < other->m_ranges.size(); ++i)
69	addSortedRange(m_ranges, other->m_ranges [i].begin, other->m_ranges [i].end);
70	for (size_t i = `0`; i < other->m_matchesUnicode.size(); ++i)
71	addSorted(m_matchesUnicode, other->m_matchesUnicode [i]);
72	for (size_t i = `0`; i < other->m_rangesUnicode.size(); ++i)
73	addSortedRange(m_rangesUnicode, other->m_rangesUnicode [i].begin, other->m_rangesUnicode [i].end);
74	}
75
76	void appendInverted(const CharacterClass* other)
77	{
78	auto addSortedInverted = [&](UChar32 min, UChar32 max,
79	const Vector<UChar32>& srcMatches, const Vector<CharacterRange>& srcRanges,
80	Vector<UChar32>& destMatches, Vector<CharacterRange>& destRanges) {
81
82	auto addSortedMatchOrRange = [&](UChar32 lo, UChar32 hiPlusOne) {
83	if (lo < hiPlusOne) {
84	if (lo + `1` == hiPlusOne)
85	addSorted(destMatches, lo);
86	else
87	addSortedRange(destRanges, lo, hiPlusOne - `1`);
88	}
89	};
90
91	UChar32 lo = min;
92	size_t matchesIndex = `0`;
93	size_t rangesIndex = `0`;
94	bool matchesRemaining = matchesIndex < srcMatches.size();
95	bool rangesRemaining = rangesIndex < srcRanges.size();
96
97	if (!matchesRemaining && !rangesRemaining) {
98	addSortedMatchOrRange(min, max + `1`);
99	return;
100	}
101
102	while (matchesRemaining \|\| rangesRemaining) {
103	UChar32 hiPlusOne;
104	UChar32 nextLo;
105
106	if (matchesRemaining
107	&& (!rangesRemaining \|\| srcMatches [matchesIndex] < srcRanges [rangesIndex].begin)) {
108	hiPlusOne = srcMatches [matchesIndex];
109	nextLo = hiPlusOne + `1`;
110	++matchesIndex;
111	matchesRemaining = matchesIndex < srcMatches.size();
112	} else {
113	hiPlusOne = srcRanges [rangesIndex].begin;
114	nextLo = srcRanges [rangesIndex].end + `1`;
115	++rangesIndex;
116	rangesRemaining = rangesIndex < srcRanges.size();
117	}
118
119	addSortedMatchOrRange(lo, hiPlusOne);
120
121	lo = nextLo;
122	}
123
124	addSortedMatchOrRange(lo, max + `1`);
125	};
126
127	addSortedInverted(`0`, `0x7f`, other->m_matches, other->m_ranges, m_matches, m_ranges);
128	addSortedInverted(`0x80`, `0x10ffff`, other->m_matchesUnicode, other->m_rangesUnicode, m_matchesUnicode, m_rangesUnicode);
129	}
130
131	void putChar(UChar32 ch)
132	{
133	if (!m_isCaseInsensitive) {
134	addSorted(ch);
135	return;
136	}
137
138	if (m_canonicalMode == CanonicalMode::UCS2 && isASCII(ch)) {
139	// Handle ASCII cases.
140	if (isASCIIAlpha(ch)) {
141	addSorted(m_matches, toASCIIUpper(ch));
142	addSorted(m_matches, toASCIILower(ch));
143	} else
144	addSorted(m_matches, ch);
145	return;
146	}
147
148	// Add multiple matches, if necessary.
149	const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_canonicalMode);
150	if (info->type == CanonicalizeUnique)
151	addSorted(ch);
152	else
153	putUnicodeIgnoreCase(ch, info);
154	}
155
156	void putUnicodeIgnoreCase(UChar32 ch, const CanonicalizationRange* info)
157	{
158	ASSERT(m_isCaseInsensitive);
159	ASSERT(ch >= info->begin && ch <= info->end);
160	ASSERT(info->type != CanonicalizeUnique);
161	if (info->type == CanonicalizeSet) {
162	for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set)
163	addSorted(ch);
164	} else {
165	addSorted(ch);
166	addSorted(getCanonicalPair(info, ch));
167	}
168	}
169
170	void putRange(UChar32 lo, UChar32 hi)
171	{
172	if (isASCII(lo)) {
173	char asciiLo = lo;
174	char asciiHi = std::min(hi, (UChar32)`0x7f`);
175	addSortedRange(m_ranges, lo, asciiHi);
176
177	if (m_isCaseInsensitive) {
178	if ((asciiLo <= `'Z'`) && (asciiHi >= `'A'`))
179	addSortedRange(m_ranges, std::max(asciiLo, `'A'`)+(`'a'`-`'A'`), std::min(asciiHi, `'Z'`)+(`'a'`-`'A'`));
180	if ((asciiLo <= `'z'`) && (asciiHi >= `'a'`))
181	addSortedRange(m_ranges, std::max(asciiLo, `'a'`)+(`'A'`-`'a'`), std::min(asciiHi, `'z'`)+(`'A'`-`'a'`));
182	}
183	}
184	if (isASCII(hi))
185	return;
186
187	lo = std::max(lo, (UChar32)`0x80`);
188	addSortedRange(m_rangesUnicode, lo, hi);
189
190	if (!m_isCaseInsensitive)
191	return;
192
193	const CanonicalizationRange* info = canonicalRangeInfoFor(lo, m_canonicalMode);
194	while (true) {
195	// Handle the range [lo .. end]
196	UChar32 end = std::min<UChar32>(info->end, hi);
197
198	switch (info->type) {
199	case CanonicalizeUnique:
200	// Nothing to do - no canonical equivalents.
201	break;
202	case CanonicalizeSet: {
203	UChar ch;
204	for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set)
205	addSorted(m_matchesUnicode, ch);
206	break;
207	}
208	case CanonicalizeRangeLo:
209	addSortedRange(m_rangesUnicode, lo + info->value, end + info->value);
210	break;
211	case CanonicalizeRangeHi:
212	addSortedRange(m_rangesUnicode, lo - info->value, end - info->value);
213	break;
214	case CanonicalizeAlternatingAligned:
215	// Use addSortedRange since there is likely an abutting range to combine with.
216	if (lo & `1`)
217	addSortedRange(m_rangesUnicode, lo - `1`, lo - `1`);
218	if (!(end & `1`))
219	addSortedRange(m_rangesUnicode, end + `1`, end + `1`);
220	break;
221	case CanonicalizeAlternatingUnaligned:
222	// Use addSortedRange since there is likely an abutting range to combine with.
223	if (!(lo & `1`))
224	addSortedRange(m_rangesUnicode, lo - `1`, lo - `1`);
225	if (end & `1`)
226	addSortedRange(m_rangesUnicode, end + `1`, end + `1`);
227	break;
228	}
229
230	if (hi == end)
231	return;
232
233	++info;
234	lo = info->begin;
235	};
236
237	}
238
239	std::unique_ptr<CharacterClass> charClass()
240	{
241	coalesceTables();
242
243	auto characterClass = std::make_unique<CharacterClass>();
244
245	characterClass ->m_matches.swap(m_matches);
246	characterClass ->m_ranges.swap(m_ranges);
247	characterClass ->m_matchesUnicode.swap(m_matchesUnicode);
248	characterClass ->m_rangesUnicode.swap(m_rangesUnicode);
249	characterClass ->m_anyCharacter = anyCharacter();
250	characterClass ->m_characterWidths = characterWidths();
251
252	m_anyCharacter = false;
253	m_characterWidths = CharacterClassWidths::Unknown;
254
255	return characterClass;
256	}
257
258	private:
259	void addSorted(UChar32 ch)
260	{
261	addSorted(isASCII(ch) ? m_matches : m_matchesUnicode, ch);
262	}
263
264	void addSorted(Vector<UChar32>& matches, UChar32 ch)
265	{
266	unsigned pos = `0`;
267	unsigned range = matches.size();
268
269	m_characterWidths \|= (U_IS_BMP(ch) ? CharacterClassWidths::HasBMPChars : CharacterClassWidths::HasNonBMPChars);
270
271	// binary chop, find position to insert char.
272	while (range) {
273	unsigned index = range >> `1`;
274
275	int val = matches [pos+index] - ch;
276	if (!val)
277	return;
278	else if (val > `0`) {
279	if (val == `1`) {
280	UChar32 lo = ch;
281	UChar32 hi = ch + `1`;
282	matches.remove(pos + index);
283	if (pos + index > `0` && matches [pos + index - `1`] == ch - `1`) {
284	lo = ch - `1`;
285	matches.remove(pos + index - `1`);
286	}
287	addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
288	return;
289	}
290	range = index;
291	} else {
292	if (val == -`1`) {
293	UChar32 lo = ch - `1`;
294	UChar32 hi = ch;
295	matches.remove(pos + index);
296	if (pos + index + `1` < matches.size() && matches [pos + index + `1`] == ch + `1`) {
297	hi = ch + `1`;
298	matches.remove(pos + index + `1`);
299	}
300	addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
301	return;
302	}
303	pos += (index+`1`);
304	range -= (index+`1`);
305	}
306	}
307
308	if (pos == matches.size())
309	matches.append(ch);
310	else
311	matches.insert(pos, ch);
312	}
313
314	void addSortedRange(Vector<CharacterRange>& ranges, UChar32 lo, UChar32 hi)
315	{
316	size_t end = ranges.size();
317
318	if (U_IS_BMP(lo))
319	m_characterWidths \|= CharacterClassWidths::HasBMPChars;
320	if (!U_IS_BMP(hi))
321	m_characterWidths \|= CharacterClassWidths::HasNonBMPChars;
322
323	// Simple linear scan - I doubt there are that many ranges anyway...
324	// feel free to fix this with something faster (eg binary chop).
325	for (size_t i = `0`; i < end; ++i) {
326	// does the new range fall before the current position in the array
327	if (hi < ranges [i].begin) {
328	// Concatenate appending ranges.
329	if (hi == (ranges [i].begin - `1`)) {
330	ranges [i].begin = lo;
331	return;
332	}
333	ranges.insert(i, CharacterRange (lo, hi));
334	return;
335	}
336	// Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
337	// If the new range start at or before the end of the last range, then the overlap (if it starts one after the
338	// end of the last range they concatenate, which is just as good.
339	if (lo <= (ranges [i].end + `1`)) {
340	// found an intersect! we'll replace this entry in the array.
341	ranges [i].begin = std::min(ranges [i].begin, lo);
342	ranges [i].end = std::max(ranges [i].end, hi);
343
344	mergeRangesFrom(ranges, i);
345	return;
346	}
347	}
348
349	// CharacterRange comes after all existing ranges.
350	ranges.append(CharacterRange (lo, hi));
351	}
352
353	void mergeRangesFrom(Vector<CharacterRange>& ranges, size_t index)
354	{
355	unsigned next = index + `1`;
356
357	// each iteration of the loop we will either remove something from the list, or break out of the loop.
358	while (next < ranges.size()) {
359	if (ranges [next].begin <= (ranges [index].end + `1`)) {
360	// the next entry now overlaps / concatenates with this one.
361	ranges [index].end = std::max(ranges [index].end, ranges [next].end);
362	ranges.remove(next);
363	} else
364	break;
365	}
366
367	}
368
369	void coalesceTables()
370	{
371	auto coalesceMatchesAndRanges = [&](Vector<UChar32>& matches, Vector<CharacterRange>& ranges) {
372
373	size_t matchesIndex = `0`;
374	size_t rangesIndex = `0`;
375
376	while (matchesIndex < matches.size() && rangesIndex < ranges.size()) {
377	while (matchesIndex < matches.size() && matches [matchesIndex] < ranges [rangesIndex].begin - `1`)
378	matchesIndex++;
379
380	if (matchesIndex < matches.size() && matches [matchesIndex] == ranges [rangesIndex].begin - `1`) {
381	ranges [rangesIndex].begin = matches [matchesIndex];
382	matches.remove(matchesIndex);
383	}
384
385	while (matchesIndex < matches.size() && matches [matchesIndex] < ranges [rangesIndex].end + `1`)
386	matchesIndex++;
387
388	if (matchesIndex < matches.size()) {
389	if (matches [matchesIndex] == ranges [rangesIndex].end + `1`) {
390	ranges [rangesIndex].end = matches [matchesIndex];
391	matches.remove(matchesIndex);
392
393	mergeRangesFrom(ranges, rangesIndex);
394	} else
395	matchesIndex++;
396	}
397	}
398	};
399
400	coalesceMatchesAndRanges(m_matches, m_ranges);
401	coalesceMatchesAndRanges(m_matchesUnicode, m_rangesUnicode);
402
403	if (!m_matches.size() && !m_matchesUnicode.size()
404	&& m_ranges.size() == `1` && m_rangesUnicode.size() == `1`
405	&& m_ranges [`0`].begin == `0` && m_ranges [`0`].end == `0x7f`
406	&& m_rangesUnicode [`0`].begin == `0x80` && m_rangesUnicode [`0`].end == `0x10ffff`)
407	m_anyCharacter = true;
408	}
409
410	bool hasNonBMPCharacters()
411	{
412	return m_characterWidths & CharacterClassWidths::HasNonBMPChars;
413	}
414
415	CharacterClassWidths characterWidths()
416	{
417	return m_characterWidths;
418	}
419
420	bool anyCharacter()
421	{
422	return m_anyCharacter;
423	}
424
425	bool m_isCaseInsensitive : `1`;
426	bool m_anyCharacter : `1`;
427	CharacterClassWidths m_characterWidths;
428
429	CanonicalMode m_canonicalMode;
430
431	Vector<UChar32> m_matches;
432	Vector<CharacterRange> m_ranges;
433	Vector<UChar32> m_matchesUnicode;
434	Vector<CharacterRange> m_rangesUnicode;
435	};
436
437	class YarrPatternConstructor {
438	public:
439	YarrPatternConstructor(YarrPattern& pattern, void* stackLimit)
440	: m_pattern(pattern)
441	, m_characterClassConstructor (pattern.ignoreCase(), pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2)
442	, m_stackLimit(stackLimit)
443	{
444	auto body = std::make_unique<PatternDisjunction>();
445	m_pattern.m_body = body.get();
446	m_alternative = body ->addNewAlternative();
447	m_pattern.m_disjunctions.append(WTFMove(body));
448	}
449
450	~YarrPatternConstructor()
451	{
452	}
453
454	void resetForReparsing()
455	{
456	m_pattern.resetForReparsing();
457	m_characterClassConstructor.reset();
458
459	auto body = std::make_unique<PatternDisjunction>();
460	m_pattern.m_body = body.get();
461	m_alternative = body ->addNewAlternative();
462	m_pattern.m_disjunctions.append(WTFMove(body));
463	}
464
465	void saveUnmatchedNamedForwardReferences()
466	{
467	m_unmatchedNamedForwardReferences.shrink(`0`);
468
469	for (auto& entry : m_pattern.m_namedForwardReferences) {
470	if (!m_pattern.m_captureGroupNames.contains(entry))
471	m_unmatchedNamedForwardReferences.append(entry);
472	}
473	}
474
475	void assertionBOL()
476	{
477	if (!m_alternative->m_terms.size() && !m_invertParentheticalAssertion) {
478	m_alternative->m_startsWithBOL = true;
479	m_alternative->m_containsBOL = true;
480	m_pattern.m_containsBOL = true;
481	}
482	m_alternative->m_terms.append(PatternTerm::BOL());
483	}
484	void assertionEOL()
485	{
486	m_alternative->m_terms.append(PatternTerm::EOL());
487	}
488	void assertionWordBoundary(bool invert)
489	{
490	m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
491	}
492
493	void atomPatternCharacter(UChar32 ch)
494	{
495	// We handle case-insensitive checking of unicode characters which do have both
496	// cases by handling them as if they were defined using a CharacterClass.
497	if (!m_pattern.ignoreCase() \|\| (isASCII(ch) && !m_pattern.unicode())) {
498	m_alternative->m_terms.append(PatternTerm (ch));
499	return;
500	}
501
502	const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2);
503	if (info->type == CanonicalizeUnique) {
504	m_alternative->m_terms.append(PatternTerm (ch));
505	return;
506	}
507
508	m_characterClassConstructor.putUnicodeIgnoreCase(ch, info);
509	auto newCharacterClass = m_characterClassConstructor.charClass();
510	m_alternative->m_terms.append(PatternTerm (newCharacterClass.get(), false));
511	m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
512	}
513
514	void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
515	{
516	switch (classID) {
517	case BuiltInCharacterClassID::DigitClassID:
518	m_alternative->m_terms.append(PatternTerm (m_pattern.digitsCharacterClass(), invert));
519	break;
520	case BuiltInCharacterClassID::SpaceClassID:
521	m_alternative->m_terms.append(PatternTerm (m_pattern.spacesCharacterClass(), invert));
522	break;
523	case BuiltInCharacterClassID::WordClassID:
524	if (m_pattern.unicode() && m_pattern.ignoreCase())
525	m_alternative->m_terms.append(PatternTerm (m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), invert));
526	else
527	m_alternative->m_terms.append(PatternTerm (m_pattern.wordcharCharacterClass(), invert));
528	break;
529	case BuiltInCharacterClassID::DotClassID:
530	ASSERT(!invert);
531	if (m_pattern.dotAll())
532	m_alternative->m_terms.append(PatternTerm (m_pattern.anyCharacterClass(), false));
533	else
534	m_alternative->m_terms.append(PatternTerm (m_pattern.newlineCharacterClass(), true));
535	break;
536	default:
537	m_alternative->m_terms.append(PatternTerm (m_pattern.unicodeCharacterClassFor(classID), invert));
538	break;
539	}
540	}
541
542	void atomCharacterClassBegin(bool invert = false)
543	{
544	m_invertCharacterClass = invert;
545	}
546
547	void atomCharacterClassAtom(UChar32 ch)
548	{
549	m_characterClassConstructor.putChar(ch);
550	}
551
552	void atomCharacterClassRange(UChar32 begin, UChar32 end)
553	{
554	m_characterClassConstructor.putRange(begin, end);
555	}
556
557	void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
558	{
559	ASSERT(classID != BuiltInCharacterClassID::DotClassID);
560
561	switch (classID) {
562	case BuiltInCharacterClassID::DigitClassID:
563	m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
564	break;
565
566	case BuiltInCharacterClassID::SpaceClassID:
567	m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
568	break;
569
570	case BuiltInCharacterClassID::WordClassID:
571	if (m_pattern.unicode() && m_pattern.ignoreCase())
572	m_characterClassConstructor.append(invert ? m_pattern.nonwordUnicodeIgnoreCaseCharCharacterClass() : m_pattern.wordUnicodeIgnoreCaseCharCharacterClass());
573	else
574	m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
575	break;
576
577	default:
578	if (!invert)
579	m_characterClassConstructor.append(m_pattern.unicodeCharacterClassFor(classID));
580	else
581	m_characterClassConstructor.appendInverted(m_pattern.unicodeCharacterClassFor(classID));
582	}
583	}
584
585	void atomCharacterClassEnd()
586	{
587	auto newCharacterClass = m_characterClassConstructor.charClass();
588
589	if (!m_invertCharacterClass && newCharacterClass.get()->m_anyCharacter) {
590	m_alternative->m_terms.append(PatternTerm (m_pattern.anyCharacterClass(), false));
591	return;
592	}
593	m_alternative->m_terms.append(PatternTerm (newCharacterClass.get(), m_invertCharacterClass));
594	m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
595	}
596
597	void atomParenthesesSubpatternBegin(bool capture = true, Optional<String> optGroupName = WTF::nullopt)
598	{
599	unsigned subpatternId = m_pattern.m_numSubpatterns + `1`;
600	if (capture) {
601	m_pattern.m_numSubpatterns++;
602	if (optGroupName) {
603	while (m_pattern.m_captureGroupNames.size() < subpatternId)
604	m_pattern.m_captureGroupNames.append(String ());
605	m_pattern.m_captureGroupNames.append(optGroupName.value());
606	m_pattern.m_namedGroupToParenIndex.add(optGroupName.value(), subpatternId);
607	}
608	} else
609	ASSERT(!optGroupName);
610
611	auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative);
612	m_alternative->m_terms.append(PatternTerm (PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, false));
613	m_alternative = parenthesesDisjunction ->addNewAlternative();
614	m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction));
615	}
616
617	void atomParentheticalAssertionBegin(bool invert = false)
618	{
619	auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative);
620	m_alternative->m_terms.append(PatternTerm (PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + `1`, parenthesesDisjunction.get(), false, invert));
621	m_alternative = parenthesesDisjunction ->addNewAlternative();
622	m_invertParentheticalAssertion = invert;
623	m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction));
624	}
625
626	void atomParenthesesEnd()
627	{
628	ASSERT(m_alternative->m_parent);
629	ASSERT(m_alternative->m_parent->m_parent);
630
631	PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent;
632	m_alternative = m_alternative->m_parent->m_parent;
633
634	PatternTerm& lastTerm = m_alternative->lastTerm();
635
636	unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size();
637	unsigned numBOLAnchoredAlts = `0`;
638
639	for (unsigned i = `0`; i < numParenAlternatives; i++) {
640	// Bubble up BOL flags
641	if (parenthesesDisjunction->m_alternatives [i]->m_startsWithBOL)
642	numBOLAnchoredAlts++;
643	}
644
645	if (numBOLAnchoredAlts) {
646	m_alternative->m_containsBOL = true;
647	// If all the alternatives in parens start with BOL, then so does this one
648	if (numBOLAnchoredAlts == numParenAlternatives)
649	m_alternative->m_startsWithBOL = true;
650	}
651
652	lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
653	m_invertParentheticalAssertion = false;
654	}
655
656	void atomBackReference(unsigned subpatternId)
657	{
658	ASSERT(subpatternId);
659	m_pattern.m_containsBackreferences = true;
660	m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
661
662	if (subpatternId > m_pattern.m_numSubpatterns) {
663	m_alternative->m_terms.append(PatternTerm::ForwardReference());
664	return;
665	}
666
667	PatternAlternative* currentAlternative = m_alternative;
668	ASSERT(currentAlternative);
669
670	// Note to self: if we waited until the AST was baked, we could also remove forwards refs
671	while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
672	PatternTerm& term = currentAlternative->lastTerm();
673	ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) \|\| (term.type == PatternTerm::TypeParentheticalAssertion));
674
675	if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) {
676	m_alternative->m_terms.append(PatternTerm::ForwardReference());
677	return;
678	}
679	}
680
681	m_alternative->m_terms.append(PatternTerm (subpatternId));
682	}
683
684	void atomNamedBackReference(const String& subpatternName)
685	{
686	ASSERT(m_pattern.m_namedGroupToParenIndex.find(subpatternName) != m_pattern.m_namedGroupToParenIndex.end());
687	atomBackReference(m_pattern.m_namedGroupToParenIndex.get(subpatternName));
688	}
689
690	bool isValidNamedForwardReference(const String& subpatternName)
691	{
692	return !m_unmatchedNamedForwardReferences.contains(subpatternName);
693	}
694
695	void atomNamedForwardReference(const String& subpatternName)
696	{
697	m_pattern.m_namedForwardReferences.appendIfNotContains(subpatternName);
698	m_alternative->m_terms.append(PatternTerm::ForwardReference());
699	}
700
701	// deep copy the argument disjunction. If filterStartsWithBOL is true,
702	// skip alternatives with m_startsWithBOL set true.
703	PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
704	{
705	std::unique_ptr<PatternDisjunction> newDisjunction;
706	for (unsigned alt = `0`; alt < disjunction->m_alternatives.size(); ++alt) {
707	PatternAlternative* alternative = disjunction->m_alternatives [alt].get();
708	if (!filterStartsWithBOL \|\| !alternative->m_startsWithBOL) {
709	if (!newDisjunction) {
710	newDisjunction = std::make_unique<PatternDisjunction>();
711	newDisjunction ->m_parent = disjunction->m_parent;
712	}
713	PatternAlternative* newAlternative = newDisjunction ->addNewAlternative();
714	newAlternative->m_terms.reserveInitialCapacity(alternative->m_terms.size());
715	for (unsigned i = `0`; i < alternative->m_terms.size(); ++i)
716	newAlternative->m_terms.append(copyTerm(alternative->m_terms [i], filterStartsWithBOL));
717	}
718	}
719
720	if (!newDisjunction)
721	return `0`;
722
723	PatternDisjunction* copiedDisjunction = newDisjunction.get();
724	m_pattern.m_disjunctions.append(WTFMove(newDisjunction));
725	return copiedDisjunction;
726	}
727
728	PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false)
729	{
730	if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
731	return PatternTerm (term);
732
733	PatternTerm termCopy = term;
734	termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL);
735	m_pattern.m_hasCopiedParenSubexpressions = true;
736	return termCopy;
737	}
738
739	void quantifyAtom(unsigned min, unsigned max, bool greedy)
740	{
741	ASSERT(min <= max);
742	ASSERT(m_alternative->m_terms.size());
743
744	if (!max) {
745	m_alternative->removeLastTerm();
746	return;
747	}
748
749	PatternTerm& term = m_alternative->lastTerm();
750	ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
751	ASSERT(term.quantityMinCount == `1` && term.quantityMaxCount == `1` && term.quantityType == QuantifierFixedCount);
752
753	if (term.type == PatternTerm::TypeParentheticalAssertion) {
754	// If an assertion is quantified with a minimum count of zero, it can simply be removed.
755	// This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never
756	// results in any input being consumed, however the continuation passed to the assertion
757	// (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will
758	// reject all zero length matches (see step 2.1). A match from the continuation of the
759	// expression will still be accepted regardless (via steps 8a and 11) - the upshot of all
760	// this is that matches from the assertion are not required, and won't be accepted anyway,
761	// so no need to ever run it.
762	if (!min)
763	m_alternative->removeLastTerm();
764	// We never need to run an assertion more than once. Subsequent interations will be run
765	// with the same start index (since assertions are non-capturing) and the same captures
766	// (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the
767	// same result and captures. If the first match succeeds then the subsequent (min - 1)
768	// matches will too. Any additional optional matches will fail (on the same basis as the
769	// minimum zero quantified assertions, above), but this will still result in a match.
770	return;
771	}
772
773	if (min == max)
774	term.quantify(min, max, QuantifierFixedCount);
775	else if (!min \|\| (term.type == PatternTerm::TypeParenthesesSubpattern && m_pattern.m_hasCopiedParenSubexpressions))
776	term.quantify(min, max, greedy ? QuantifierGreedy : QuantifierNonGreedy);
777	else {
778	term.quantify(min, min, QuantifierFixedCount);
779	m_alternative->m_terms.append(copyTerm(term));
780	// NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
781	m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
782	if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
783	m_alternative->lastTerm().parentheses.isCopy = true;
784	}
785	}
786
787	void disjunction()
788	{
789	m_alternative = m_alternative->m_parent->addNewAlternative();
790	}
791
792	ErrorCode setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition, unsigned& newCallFrameSize) WARN_UNUSED_RETURN
793	{
794	if (UNLIKELY(!isSafeToRecurse()))
795	return ErrorCode::TooManyDisjunctions;
796
797	ErrorCode error = ErrorCode::NoError;
798	alternative->m_hasFixedSize = true;
799	Checked<unsigned, RecordOverflow> currentInputPosition = initialInputPosition;
800
801	for (unsigned i = `0`; i < alternative->m_terms.size(); ++i) {
802	PatternTerm& term = alternative->m_terms [i];
803
804	switch (term.type) {
805	case PatternTerm::TypeAssertionBOL:
806	case PatternTerm::TypeAssertionEOL:
807	case PatternTerm::TypeAssertionWordBoundary:
808	term.inputPosition = currentInputPosition.unsafeGet();
809	break;
810
811	case PatternTerm::TypeBackReference:
812	term.inputPosition = currentInputPosition.unsafeGet();
813	term.frameLocation = currentCallFrameSize;
814	currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference;
815	alternative->m_hasFixedSize = false;
816	break;
817
818	case PatternTerm::TypeForwardReference:
819	break;
820
821	case PatternTerm::TypePatternCharacter:
822	term.inputPosition = currentInputPosition.unsafeGet();
823	if (term.quantityType != QuantifierFixedCount) {
824	term.frameLocation = currentCallFrameSize;
825	currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
826	alternative->m_hasFixedSize = false;
827	} else if (m_pattern.unicode()) {
828	Checked<unsigned, RecordOverflow> tempCount = term.quantityMaxCount;
829	tempCount *= U16_LENGTH(term.patternCharacter);
830	if (tempCount.hasOverflowed())
831	return ErrorCode::OffsetTooLarge;
832	currentInputPosition += tempCount;
833	} else
834	currentInputPosition += term.quantityMaxCount;
835	break;
836
837	case PatternTerm::TypeCharacterClass:
838	term.inputPosition = currentInputPosition.unsafeGet();
839	if (term.quantityType != QuantifierFixedCount) {
840	term.frameLocation = currentCallFrameSize;
841	currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
842	alternative->m_hasFixedSize = false;
843	} else if (m_pattern.unicode()) {
844	term.frameLocation = currentCallFrameSize;
845	currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
846	if (term.characterClass->hasOneCharacterSize() && !term.invert()) {
847	Checked<unsigned, RecordOverflow> tempCount = term.quantityMaxCount;
848	tempCount *= term.characterClass->hasNonBMPCharacters() ? `2` : `1`;
849	if (tempCount.hasOverflowed())
850	return ErrorCode::OffsetTooLarge;
851	currentInputPosition += tempCount;
852	} else {
853	currentInputPosition += term.quantityMaxCount;
854	alternative->m_hasFixedSize = false;
855	}
856	} else
857	currentInputPosition += term.quantityMaxCount;
858	break;
859
860	case PatternTerm::TypeParenthesesSubpattern:
861	// Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
862	term.frameLocation = currentCallFrameSize;
863	if (term.quantityMaxCount == `1` && !term.parentheses.isCopy) {
864	currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
865	error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
866	if (hasError(error))
867	return error;
868	// If quantity is fixed, then pre-check its minimum size.
869	if (term.quantityType == QuantifierFixedCount)
870	currentInputPosition += term.parentheses.disjunction->m_minimumSize;
871	term.inputPosition = currentInputPosition.unsafeGet();
872	} else if (term.parentheses.isTerminal) {
873	currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
874	error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
875	if (hasError(error))
876	return error;
877	term.inputPosition = currentInputPosition.unsafeGet();
878	} else {
879	term.inputPosition = currentInputPosition.unsafeGet();
880	currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
881	error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
882	if (hasError(error))
883	return error;
884	}
885	// Fixed count of 1 could be accepted, if they have a fixed size AND* if all alternatives are of the same length.*
886	alternative->m_hasFixedSize = false;
887	break;
888
889	case PatternTerm::TypeParentheticalAssertion:
890	term.inputPosition = currentInputPosition.unsafeGet();
891	term.frameLocation = currentCallFrameSize;
892	error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet(), currentCallFrameSize);
893	if (hasError(error))
894	return error;
895	break;
896
897	case PatternTerm::TypeDotStarEnclosure:
898	ASSERT(!m_pattern.m_saveInitialStartValue);
899	alternative->m_hasFixedSize = false;
900	term.inputPosition = initialInputPosition;
901	m_pattern.m_initialStartValueFrameLocation = currentCallFrameSize;
902	currentCallFrameSize += YarrStackSpaceForDotStarEnclosure;
903	m_pattern.m_saveInitialStartValue = true;
904	break;
905	}
906	if (currentInputPosition.hasOverflowed())
907	return ErrorCode::OffsetTooLarge;
908	}
909
910	alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet();
911	newCallFrameSize = currentCallFrameSize;
912	return error;
913	}
914
915	ErrorCode setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition, unsigned& callFrameSize)
916	{
917	if (UNLIKELY(!isSafeToRecurse()))
918	return ErrorCode::TooManyDisjunctions;
919
920	if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > `1`))
921	initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative;
922
923	unsigned minimumInputSize = UINT_MAX;
924	unsigned maximumCallFrameSize = `0`;
925	bool hasFixedSize = true;
926	ErrorCode error = ErrorCode::NoError;
927
928	for (unsigned alt = `0`; alt < disjunction->m_alternatives.size(); ++alt) {
929	PatternAlternative* alternative = disjunction->m_alternatives [alt].get();
930	unsigned currentAlternativeCallFrameSize;
931	error = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition, currentAlternativeCallFrameSize);
932	if (hasError(error))
933	return error;
934	minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize);
935	maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize);
936	hasFixedSize &= alternative->m_hasFixedSize;
937	if (alternative->m_minimumSize > INT_MAX)
938	m_pattern.m_containsUnsignedLengthPattern = true;
939	}
940
941	ASSERT(minimumInputSize != UINT_MAX);
942	ASSERT(maximumCallFrameSize >= initialCallFrameSize);
943
944	disjunction->m_hasFixedSize = hasFixedSize;
945	disjunction->m_minimumSize = minimumInputSize;
946	disjunction->m_callFrameSize = maximumCallFrameSize;
947	callFrameSize = maximumCallFrameSize;
948	return error;
949	}
950
951	ErrorCode setupOffsets()
952	{
953	// FIXME: Yarr should not use the stack to handle subpatterns (rdar://problem/26436314).
954	unsigned ignoredCallFrameSize;
955	return setupDisjunctionOffsets(m_pattern.m_body, `0`, `0`, ignoredCallFrameSize);
956	}
957
958	// This optimization identifies sets of parentheses that we will never need to backtrack.
959	// In these cases we do not need to store state from prior iterations.
960	// We can presently avoid backtracking for:
961	// where the parens are at the end of the regular expression (last term in any of the*
962	// alternatives of the main body disjunction).
963	// where the parens are non-capturing, and quantified unbounded greedy ().
964	// where the parens do not contain any capturing subpatterns.*
965	void checkForTerminalParentheses()
966	{
967	// This check is much too crude; should be just checking whether the candidate
968	// node contains nested capturing subpatterns, not the whole expression!
969	if (m_pattern.m_numSubpatterns)
970	return;
971
972	Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
973	for (size_t i = `0`; i < alternatives.size(); ++i) {
974	Vector<PatternTerm>& terms = alternatives [i]->m_terms;
975	if (terms.size()) {
976	PatternTerm& term = terms.last();
977	if (term.type == PatternTerm::TypeParenthesesSubpattern
978	&& term.quantityType == QuantifierGreedy
979	&& term.quantityMinCount == `0`
980	&& term.quantityMaxCount == quantifyInfinite
981	&& !term.capture())
982	term.parentheses.isTerminal = true;
983	}
984	}
985	}
986
987	void optimizeBOL()
988	{
989	// Look for expressions containing beginning of line (^) anchoring and unroll them.
990	// e.g. /^a\|^b\|c/ becomes /^a\|^b\|c/ which is executed once followed by /c/ which loops
991	// This code relies on the parsing code tagging alternatives with m_containsBOL and
992	// m_startsWithBOL and rolling those up to containing alternatives.
993	// At this point, this is only valid for non-multiline expressions.
994	PatternDisjunction* disjunction = m_pattern.m_body;
995
996	if (!m_pattern.m_containsBOL \|\| m_pattern.multiline())
997	return;
998
999	PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true);
1000
1001	// Set alternatives in disjunction to "onceThrough"
1002	for (unsigned alt = `0`; alt < disjunction->m_alternatives.size(); ++alt)
1003	disjunction->m_alternatives [alt]->setOnceThrough();
1004
1005	if (loopDisjunction) {
1006	// Move alternatives from loopDisjunction to disjunction
1007	for (unsigned alt = `0`; alt < loopDisjunction->m_alternatives.size(); ++alt)
1008	disjunction->m_alternatives.append(loopDisjunction->m_alternatives [alt].release());
1009
1010	loopDisjunction->m_alternatives.clear();
1011	}
1012	}
1013
1014	bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t endIndex)
1015	{
1016	Vector<PatternTerm>& terms = alternative->m_terms;
1017
1018	ASSERT(endIndex <= terms.size());
1019	for (size_t termIndex = firstTermIndex; termIndex < endIndex; ++termIndex) {
1020	PatternTerm& term = terms [termIndex];
1021
1022	if (term.m_capture)
1023	return true;
1024
1025	if (term.type == PatternTerm::TypeParenthesesSubpattern) {
1026	PatternDisjunction* nestedDisjunction = term.parentheses.disjunction;
1027	for (unsigned alt = `0`; alt < nestedDisjunction->m_alternatives.size(); ++alt) {
1028	if (containsCapturingTerms(nestedDisjunction->m_alternatives [alt].get(), `0`, nestedDisjunction->m_alternatives [alt]->m_terms.size()))
1029	return true;
1030	}
1031	}
1032	}
1033
1034	return false;
1035	}
1036
1037	// This optimization identifies alternatives in the form of
1038	// [^].[?]<expression>.[$] for expressions that don't have any
1039	// capturing terms. The alternative is changed to <expression>
1040	// followed by processing of the dot stars to find and adjust the
1041	// beginning and the end of the match.
1042	void optimizeDotStarWrappedExpressions()
1043	{
1044	Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
1045	if (alternatives.size() != `1`)
1046	return;
1047
1048	CharacterClass* dotCharacterClass = m_pattern.dotAll() ? m_pattern.anyCharacterClass() : m_pattern.newlineCharacterClass();
1049	PatternAlternative* alternative = alternatives [`0`].get();
1050	Vector<PatternTerm>& terms = alternative->m_terms;
1051	if (terms.size() >= `3`) {
1052	bool startsWithBOL = false;
1053	bool endsWithEOL = false;
1054	size_t termIndex, firstExpressionTerm;
1055
1056	termIndex = `0`;
1057	if (terms [termIndex].type == PatternTerm::TypeAssertionBOL) {
1058	startsWithBOL = true;
1059	++termIndex;
1060	}
1061
1062	PatternTerm& firstNonAnchorTerm = terms [termIndex];
1063	if (firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass
1064	\|\| firstNonAnchorTerm.characterClass != dotCharacterClass
1065	\|\| firstNonAnchorTerm.quantityMinCount
1066	\|\| firstNonAnchorTerm.quantityMaxCount != quantifyInfinite)
1067	return;
1068
1069	firstExpressionTerm = termIndex + `1`;
1070
1071	termIndex = terms.size() - `1`;
1072	if (terms [termIndex].type == PatternTerm::TypeAssertionEOL) {
1073	endsWithEOL = true;
1074	--termIndex;
1075	}
1076
1077	PatternTerm& lastNonAnchorTerm = terms [termIndex];
1078	if (lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass
1079	\|\| lastNonAnchorTerm.characterClass != dotCharacterClass
1080	\|\| lastNonAnchorTerm.quantityType != QuantifierGreedy
1081	\|\| lastNonAnchorTerm.quantityMinCount
1082	\|\| lastNonAnchorTerm.quantityMaxCount != quantifyInfinite)
1083	return;
1084
1085	size_t endIndex = termIndex;
1086	if (firstExpressionTerm >= endIndex)
1087	return;
1088
1089	if (!containsCapturingTerms(alternative, firstExpressionTerm, endIndex)) {
1090	for (termIndex = terms.size() - `1`; termIndex >= endIndex; --termIndex)
1091	terms.remove(termIndex);
1092
1093	for (termIndex = firstExpressionTerm; termIndex > `0`; --termIndex)
1094	terms.remove(termIndex - `1`);
1095
1096	terms.append(PatternTerm (startsWithBOL, endsWithEOL));
1097
1098	m_pattern.m_containsBOL = false;
1099	}
1100	}
1101	}
1102
1103	private:
1104	bool isSafeToRecurse() const
1105	{
1106	if (!m_stackLimit)
1107	return true;
1108	ASSERT(Thread::current().stack().isGrowingDownward());
1109	int8_t* curr = reinterpret_cast<int8_t*>(currentStackPointer());
1110	int8_t* limit = reinterpret_cast<int8_t*>(m_stackLimit);
1111	return curr >= limit;
1112	}
1113
1114	YarrPattern& m_pattern;
1115	PatternAlternative* m_alternative;
1116	CharacterClassConstructor m_characterClassConstructor;
1117	Vector<String> m_unmatchedNamedForwardReferences;
1118	void* m_stackLimit;
1119	bool m_invertCharacterClass;
1120	bool m_invertParentheticalAssertion { false };
1121	};
1122
1123	ErrorCode YarrPattern::compile(const String& patternString, void* stackLimit)
1124	{
1125	YarrPatternConstructor constructor(*this, stackLimit);
1126
1127	{
1128	ErrorCode error = parse(constructor, patternString, unicode());
1129	if (hasError(error))
1130	return error;
1131	}
1132
1133	// If the pattern contains illegal backreferences reset & reparse.
1134	// Quoting Netscape's "What's new in JavaScript 1.2",
1135	// "Note: if the number of left parentheses is less than the number specified
1136	// in \#, the \# is taken as an octal escape as described in the next row."
1137	if (containsIllegalBackReference() \|\| containsIllegalNamedForwardReferences()) {
1138	if (unicode())
1139	return ErrorCode::InvalidBackreference;
1140
1141	unsigned numSubpatterns = m_numSubpatterns;
1142
1143	constructor.saveUnmatchedNamedForwardReferences();
1144	constructor.resetForReparsing();
1145	ErrorCode error = parse(constructor, patternString, unicode(), numSubpatterns);
1146	ASSERT_UNUSED(error, !hasError(error));
1147	ASSERT(numSubpatterns == m_numSubpatterns);
1148	}
1149
1150	constructor.checkForTerminalParentheses();
1151	constructor.optimizeDotStarWrappedExpressions();
1152	constructor.optimizeBOL();
1153
1154	{
1155	ErrorCode error = constructor.setupOffsets();
1156	if (hasError(error))
1157	return error;
1158	}
1159
1160	if (Options::dumpCompiledRegExpPatterns())
1161	dumpPattern(patternString);
1162
1163	return ErrorCode::NoError;
1164	}
1165
1166	YarrPattern::YarrPattern(const String& pattern, OptionSet<Flags> flags, ErrorCode& error, void* stackLimit)
1167	: m_containsBackreferences(false)
1168	, m_containsBOL(false)
1169	, m_containsUnsignedLengthPattern(false)
1170	, m_hasCopiedParenSubexpressions(false)
1171	, m_saveInitialStartValue(false)
1172	, m_flags (flags)
1173	{
1174	ASSERT(m_flags != Flags::DeletedValue);
1175	error = compile(pattern, stackLimit);
1176	}
1177
1178	void indentForNestingLevel(PrintStream& out, unsigned nestingDepth)
1179	{
1180	out.print(" ");
1181	for (; nestingDepth; --nestingDepth)
1182	out.print(" ");
1183	}
1184
1185	void dumpUChar32(PrintStream& out, UChar32 c)
1186	{
1187	if (c >= `' '`&& c <= `0xff`)
1188	out.printf("'%c'", static_cast<char>(c));
1189	else
1190	out.printf("0x%04x", c);
1191	}
1192
1193	void dumpCharacterClass(PrintStream& out, YarrPattern* pattern, CharacterClass* characterClass)
1194	{
1195	if (characterClass == pattern->anyCharacterClass())
1196	out.print("<any character>");
1197	else if (characterClass == pattern->newlineCharacterClass())
1198	out.print("<newline>");
1199	else if (characterClass == pattern->digitsCharacterClass())
1200	out.print("<digits>");
1201	else if (characterClass == pattern->spacesCharacterClass())
1202	out.print("<whitespace>");
1203	else if (characterClass == pattern->wordcharCharacterClass())
1204	out.print("<word>");
1205	else if (characterClass == pattern->wordUnicodeIgnoreCaseCharCharacterClass())
1206	out.print("<unicode word ignore case>");
1207	else if (characterClass == pattern->nondigitsCharacterClass())
1208	out.print("<non-digits>");
1209	else if (characterClass == pattern->nonspacesCharacterClass())
1210	out.print("<non-whitespace>");
1211	else if (characterClass == pattern->nonwordcharCharacterClass())
1212	out.print("<non-word>");
1213	else if (characterClass == pattern->nonwordUnicodeIgnoreCaseCharCharacterClass())
1214	out.print("<unicode non-word ignore case>");
1215	else {
1216	bool needMatchesRangesSeperator = false;
1217
1218	auto dumpMatches = [&] (const char* prefix, Vector<UChar32> matches) {
1219	size_t matchesSize = matches.size();
1220	if (matchesSize) {
1221	if (needMatchesRangesSeperator)
1222	out.print(",");
1223	needMatchesRangesSeperator = true;
1224
1225	out.print(prefix, ":(");
1226	for (size_t i = `0`; i < matchesSize; ++i) {
1227	if (i)
1228	out.print(",");
1229	dumpUChar32(out, matches [i]);
1230	}
1231	out.print(")");
1232	}
1233	};
1234
1235	auto dumpRanges = [&] (const char* prefix, Vector<CharacterRange> ranges) {
1236	size_t rangeSize = ranges.size();
1237	if (rangeSize) {
1238	if (needMatchesRangesSeperator)
1239	out.print(",");
1240	needMatchesRangesSeperator = true;
1241
1242	out.print(prefix, " ranges:(");
1243	for (size_t i = `0`; i < rangeSize; ++i) {
1244	if (i)
1245	out.print(",");
1246	CharacterRange range = ranges [i];
1247	out.print("(");
1248	dumpUChar32(out, range.begin);
1249	out.print("..");
1250	dumpUChar32(out, range.end);
1251	out.print(")");
1252	}
1253	out.print(")");
1254	}
1255	};
1256
1257	out.print("[");
1258	dumpMatches ("ASCII", characterClass->m_matches);
1259	dumpRanges ("ASCII", characterClass->m_ranges);
1260	dumpMatches ("Unicode", characterClass->m_matchesUnicode);
1261	dumpRanges ("Unicode", characterClass->m_rangesUnicode);
1262	out.print("]");
1263	}
1264	}
1265
1266	void PatternAlternative::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth)
1267	{
1268	out.print("minimum size: ", m_minimumSize);
1269	if (m_hasFixedSize)
1270	out.print(",fixed size");
1271	if (m_onceThrough)
1272	out.print(",once through");
1273	if (m_startsWithBOL)
1274	out.print(",starts with ^");
1275	if (m_containsBOL)
1276	out.print(",contains ^");
1277	out.print("\n");
1278
1279	for (size_t i = `0`; i < m_terms.size(); ++i)
1280	m_terms [i].dump(out, thisPattern, nestingDepth);
1281	}
1282
1283	void PatternTerm::dumpQuantifier(PrintStream& out)
1284	{
1285	if (quantityType == QuantifierFixedCount && quantityMinCount == `1` && quantityMaxCount == `1`)
1286	return;
1287	out.print(" {", quantityMinCount.unsafeGet());
1288	if (quantityMinCount != quantityMaxCount) {
1289	if (quantityMaxCount == UINT_MAX)
1290	out.print(",...");
1291	else
1292	out.print(",", quantityMaxCount.unsafeGet());
1293	}
1294	out.print("}");
1295	if (quantityType == QuantifierGreedy)
1296	out.print(" greedy");
1297	else if (quantityType == QuantifierNonGreedy)
1298	out.print(" non-greedy");
1299	}
1300
1301	void PatternTerm::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth)
1302	{
1303	indentForNestingLevel(out, nestingDepth);
1304
1305	if (type != TypeParenthesesSubpattern && type != TypeParentheticalAssertion) {
1306	if (invert())
1307	out.print("not ");
1308	}
1309
1310	switch (type) {
1311	case TypeAssertionBOL:
1312	out.println("BOL");
1313	break;
1314	case TypeAssertionEOL:
1315	out.println("EOL");
1316	break;
1317	case TypeAssertionWordBoundary:
1318	out.println("word boundary");
1319	break;
1320	case TypePatternCharacter:
1321	out.printf("character ");
1322	out.printf("inputPosition %u ", inputPosition);
1323	if (thisPattern->ignoreCase() && isASCIIAlpha(patternCharacter)) {
1324	dumpUChar32(out, toASCIIUpper(patternCharacter));
1325	out.print("/");
1326	dumpUChar32(out, toASCIILower(patternCharacter));
1327	} else
1328	dumpUChar32(out, patternCharacter);
1329	dumpQuantifier(out);
1330	if (quantityType != QuantifierFixedCount)
1331	out.print(",frame location ", frameLocation);
1332	out.println();
1333	break;
1334	case TypeCharacterClass:
1335	out.print("character class ");
1336	out.printf("inputPosition %u ", inputPosition);
1337	dumpCharacterClass(out, thisPattern, characterClass);
1338	dumpQuantifier(out);
1339	if (quantityType != QuantifierFixedCount \|\| thisPattern->unicode())
1340	out.print(",frame location ", frameLocation);
1341	out.println();
1342	break;
1343	case TypeBackReference:
1344	out.print("back reference to subpattern #", backReferenceSubpatternId);
1345	out.println(",frame location ", frameLocation);
1346	break;
1347	case TypeForwardReference:
1348	out.println("forward reference");
1349	break;
1350	case TypeParenthesesSubpattern:
1351	if (m_capture)
1352	out.print("captured ");
1353	else
1354	out.print("non-captured ");
1355
1356	FALLTHROUGH;
1357	case TypeParentheticalAssertion:
1358	if (m_invert)
1359	out.print("inverted ");
1360
1361	if (type == TypeParenthesesSubpattern)
1362	out.print("subpattern");
1363	else if (type == TypeParentheticalAssertion)
1364	out.print("assertion");
1365
1366	if (m_capture)
1367	out.print(" #", parentheses.subpatternId);
1368
1369	dumpQuantifier(out);
1370
1371	if (parentheses.isCopy)
1372	out.print(",copy");
1373
1374	if (parentheses.isTerminal)
1375	out.print(",terminal");
1376
1377	out.println(",frame location ", frameLocation);
1378
1379	if (parentheses.disjunction->m_alternatives.size() > `1`) {
1380	indentForNestingLevel(out, nestingDepth + `1`);
1381	unsigned alternativeFrameLocation = frameLocation;
1382	if (quantityMaxCount == `1` && !parentheses.isCopy)
1383	alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
1384	else if (parentheses.isTerminal)
1385	alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
1386	else
1387	alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParentheses;
1388	out.println("alternative list,frame location ", alternativeFrameLocation);
1389	}
1390
1391	parentheses.disjunction->dump(out, thisPattern, nestingDepth + `1`);
1392	break;
1393	case TypeDotStarEnclosure:
1394	out.println(".* enclosure,frame location ", thisPattern->m_initialStartValueFrameLocation);
1395	break;
1396	}
1397	}
1398
1399	void PatternDisjunction::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth = `0`)
1400	{
1401	unsigned alternativeCount = m_alternatives.size();
1402	for (unsigned i = `0`; i < alternativeCount; ++i) {
1403	indentForNestingLevel(out, nestingDepth);
1404	if (alternativeCount > `1`)
1405	out.print("alternative #", i, ": ");
1406	m_alternatives [i].get()->dump(out, thisPattern, nestingDepth + (alternativeCount > `1`));
1407	}
1408	}
1409
1410	void YarrPattern::dumpPatternString(PrintStream& out, const String& patternString)
1411	{
1412	out.print("/", patternString, "/");
1413
1414	if (global())
1415	out.print("g");
1416	if (ignoreCase())
1417	out.print("i");
1418	if (multiline())
1419	out.print("m");
1420	if (unicode())
1421	out.print("u");
1422	if (sticky())
1423	out.print("y");
1424	}
1425
1426	void YarrPattern::dumpPattern(const String& patternString)
1427	{
1428	dumpPattern(WTF::dataFile(), patternString);
1429	}
1430
1431	void YarrPattern::dumpPattern(PrintStream& out, const String& patternString)
1432	{
1433	out.print("RegExp pattern for ");
1434	dumpPatternString(out, patternString);
1435
1436	if (m_flags) {
1437	bool printSeperator = false;
1438	out.print(" (");
1439	if (global()) {
1440	out.print("global");
1441	printSeperator = true;
1442	}
1443	if (ignoreCase()) {
1444	if (printSeperator)
1445	out.print("\|");
1446	out.print("ignore case");
1447	printSeperator = true;
1448	}
1449	if (multiline()) {
1450	if (printSeperator)
1451	out.print("\|");
1452	out.print("multiline");
1453	printSeperator = true;
1454	}
1455	if (unicode()) {
1456	if (printSeperator)
1457	out.print("\|");
1458	out.print("unicode");
1459	printSeperator = true;
1460	}
1461	if (sticky()) {
1462	if (printSeperator)
1463	out.print("\|");
1464	out.print("sticky");
1465	printSeperator = true;
1466	}
1467	out.print(")");
1468	}
1469	out.print(":\n");
1470	if (m_body->m_callFrameSize)
1471	out.print(" callframe size: ", m_body->m_callFrameSize, "\n");
1472	m_body->dump(out, this);
1473	}
1474
1475	std::unique_ptr<CharacterClass> anycharCreate()
1476	{
1477	auto characterClass = std::make_unique<CharacterClass>();
1478	characterClass ->m_ranges.append(CharacterRange (`0x00`, `0x7f`));
1479	characterClass ->m_rangesUnicode.append(CharacterRange (`0x0080`, `0x10ffff`));
1480	characterClass ->m_characterWidths = CharacterClassWidths::HasBothBMPAndNonBMP;
1481	characterClass ->m_anyCharacter = true;
1482	return characterClass;
1483	}
1484
1485	} } // namespace JSC::Yarr
1486

Browse the source code of webcore/Source/JavaScriptCore/yarr/YarrPattern.cpp