1/*
2 * Copyright (C) 2009, 2013-2017 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga ([email protected]), University of Szeged
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#pragma once
28
29#include "YarrErrorCode.h"
30#include "YarrFlags.h"
31#include "YarrUnicodeProperties.h"
32#include <wtf/CheckedArithmetic.h>
33#include <wtf/HashMap.h>
34#include <wtf/OptionSet.h>
35#include <wtf/PrintStream.h>
36#include <wtf/Vector.h>
37#include <wtf/text/StringHash.h>
38
39namespace JSC { namespace Yarr {
40
41struct YarrPattern;
42struct PatternDisjunction;
43
44struct CharacterRange {
45 UChar32 begin { 0 };
46 UChar32 end { 0x10ffff };
47
48 CharacterRange(UChar32 begin, UChar32 end)
49 : begin(begin)
50 , end(end)
51 {
52 }
53};
54
55enum struct CharacterClassWidths : unsigned char {
56 Unknown = 0x0,
57 HasBMPChars = 0x1,
58 HasNonBMPChars = 0x2,
59 HasBothBMPAndNonBMP = HasBMPChars | HasNonBMPChars
60};
61
62inline CharacterClassWidths operator|(CharacterClassWidths lhs, CharacterClassWidths rhs)
63{
64 return static_cast<CharacterClassWidths>(static_cast<unsigned>(lhs) | static_cast<unsigned>(rhs));
65}
66
67inline bool operator&(CharacterClassWidths lhs, CharacterClassWidths rhs)
68{
69 return static_cast<unsigned>(lhs) & static_cast<unsigned>(rhs);
70}
71
72inline CharacterClassWidths& operator|=(CharacterClassWidths& lhs, CharacterClassWidths rhs)
73{
74 lhs = lhs | rhs;
75 return lhs;
76}
77
78struct CharacterClass {
79 WTF_MAKE_FAST_ALLOCATED;
80public:
81 // All CharacterClass instances have to have the full set of matches and ranges,
82 // they may have an optional m_table for faster lookups (which must match the
83 // specified matches and ranges)
84 CharacterClass()
85 : m_table(0)
86 , m_characterWidths(CharacterClassWidths::Unknown)
87 , m_anyCharacter(false)
88 {
89 }
90 CharacterClass(const char* table, bool inverted)
91 : m_table(table)
92 , m_characterWidths(CharacterClassWidths::Unknown)
93 , m_tableInverted(inverted)
94 , m_anyCharacter(false)
95 {
96 }
97 CharacterClass(std::initializer_list<UChar32> matches, std::initializer_list<CharacterRange> ranges, std::initializer_list<UChar32> matchesUnicode, std::initializer_list<CharacterRange> rangesUnicode, CharacterClassWidths widths)
98 : m_matches(matches)
99 , m_ranges(ranges)
100 , m_matchesUnicode(matchesUnicode)
101 , m_rangesUnicode(rangesUnicode)
102 , m_table(0)
103 , m_characterWidths(widths)
104 , m_tableInverted(false)
105 , m_anyCharacter(false)
106 {
107 }
108
109 bool hasNonBMPCharacters() { return m_characterWidths & CharacterClassWidths::HasNonBMPChars; }
110
111 bool hasOneCharacterSize() { return m_characterWidths == CharacterClassWidths::HasBMPChars || m_characterWidths == CharacterClassWidths::HasNonBMPChars; }
112 bool hasOnlyNonBMPCharacters() { return m_characterWidths == CharacterClassWidths::HasNonBMPChars; }
113
114 Vector<UChar32> m_matches;
115 Vector<CharacterRange> m_ranges;
116 Vector<UChar32> m_matchesUnicode;
117 Vector<CharacterRange> m_rangesUnicode;
118
119 const char* m_table;
120 CharacterClassWidths m_characterWidths;
121 bool m_tableInverted : 1;
122 bool m_anyCharacter : 1;
123};
124
125enum QuantifierType : uint8_t {
126 QuantifierFixedCount,
127 QuantifierGreedy,
128 QuantifierNonGreedy,
129};
130
131struct PatternTerm {
132 enum Type : uint8_t {
133 TypeAssertionBOL,
134 TypeAssertionEOL,
135 TypeAssertionWordBoundary,
136 TypePatternCharacter,
137 TypeCharacterClass,
138 TypeBackReference,
139 TypeForwardReference,
140 TypeParenthesesSubpattern,
141 TypeParentheticalAssertion,
142 TypeDotStarEnclosure,
143 } type;
144 bool m_capture :1;
145 bool m_invert :1;
146 QuantifierType quantityType;
147 Checked<unsigned> quantityMinCount;
148 Checked<unsigned> quantityMaxCount;
149 union {
150 UChar32 patternCharacter;
151 CharacterClass* characterClass;
152 unsigned backReferenceSubpatternId;
153 struct {
154 PatternDisjunction* disjunction;
155 unsigned subpatternId;
156 unsigned lastSubpatternId;
157 bool isCopy;
158 bool isTerminal;
159 } parentheses;
160 struct {
161 bool bolAnchor : 1;
162 bool eolAnchor : 1;
163 } anchors;
164 };
165 unsigned inputPosition;
166 unsigned frameLocation;
167
168 PatternTerm(UChar32 ch)
169 : type(PatternTerm::TypePatternCharacter)
170 , m_capture(false)
171 , m_invert(false)
172 {
173 patternCharacter = ch;
174 quantityType = QuantifierFixedCount;
175 quantityMinCount = quantityMaxCount = 1;
176 }
177
178 PatternTerm(CharacterClass* charClass, bool invert)
179 : type(PatternTerm::TypeCharacterClass)
180 , m_capture(false)
181 , m_invert(invert)
182 {
183 characterClass = charClass;
184 quantityType = QuantifierFixedCount;
185 quantityMinCount = quantityMaxCount = 1;
186 }
187
188 PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false)
189 : type(type)
190 , m_capture(capture)
191 , m_invert(invert)
192 {
193 parentheses.disjunction = disjunction;
194 parentheses.subpatternId = subpatternId;
195 parentheses.isCopy = false;
196 parentheses.isTerminal = false;
197 quantityType = QuantifierFixedCount;
198 quantityMinCount = quantityMaxCount = 1;
199 }
200
201 PatternTerm(Type type, bool invert = false)
202 : type(type)
203 , m_capture(false)
204 , m_invert(invert)
205 {
206 quantityType = QuantifierFixedCount;
207 quantityMinCount = quantityMaxCount = 1;
208 }
209
210 PatternTerm(unsigned spatternId)
211 : type(TypeBackReference)
212 , m_capture(false)
213 , m_invert(false)
214 {
215 backReferenceSubpatternId = spatternId;
216 quantityType = QuantifierFixedCount;
217 quantityMinCount = quantityMaxCount = 1;
218 }
219
220 PatternTerm(bool bolAnchor, bool eolAnchor)
221 : type(TypeDotStarEnclosure)
222 , m_capture(false)
223 , m_invert(false)
224 {
225 anchors.bolAnchor = bolAnchor;
226 anchors.eolAnchor = eolAnchor;
227 quantityType = QuantifierFixedCount;
228 quantityMinCount = quantityMaxCount = 1;
229 }
230
231 static PatternTerm ForwardReference()
232 {
233 return PatternTerm(TypeForwardReference);
234 }
235
236 static PatternTerm BOL()
237 {
238 return PatternTerm(TypeAssertionBOL);
239 }
240
241 static PatternTerm EOL()
242 {
243 return PatternTerm(TypeAssertionEOL);
244 }
245
246 static PatternTerm WordBoundary(bool invert)
247 {
248 return PatternTerm(TypeAssertionWordBoundary, invert);
249 }
250
251 bool invert() const
252 {
253 return m_invert;
254 }
255
256 bool capture()
257 {
258 return m_capture;
259 }
260
261 bool isFixedWidthCharacterClass() const
262 {
263 return type == TypeCharacterClass && characterClass->hasOneCharacterSize() && !invert();
264 }
265
266 bool containsAnyCaptures()
267 {
268 ASSERT(this->type == TypeParenthesesSubpattern);
269 return parentheses.lastSubpatternId >= parentheses.subpatternId;
270 }
271
272 void quantify(unsigned count, QuantifierType type)
273 {
274 quantityMinCount = 0;
275 quantityMaxCount = count;
276 quantityType = type;
277 }
278
279 void quantify(unsigned minCount, unsigned maxCount, QuantifierType type)
280 {
281 // Currently only Parentheses can specify a non-zero min with a different max.
282 ASSERT(this->type == TypeParenthesesSubpattern || !minCount || minCount == maxCount);
283 ASSERT(minCount <= maxCount);
284 quantityMinCount = minCount;
285 quantityMaxCount = maxCount;
286 quantityType = type;
287 }
288
289 void dumpQuantifier(PrintStream&);
290 void dump(PrintStream&, YarrPattern*, unsigned);
291};
292
293struct PatternAlternative {
294 WTF_MAKE_FAST_ALLOCATED;
295public:
296 PatternAlternative(PatternDisjunction* disjunction)
297 : m_parent(disjunction)
298 , m_onceThrough(false)
299 , m_hasFixedSize(false)
300 , m_startsWithBOL(false)
301 , m_containsBOL(false)
302 {
303 }
304
305 PatternTerm& lastTerm()
306 {
307 ASSERT(m_terms.size());
308 return m_terms[m_terms.size() - 1];
309 }
310
311 void removeLastTerm()
312 {
313 ASSERT(m_terms.size());
314 m_terms.shrink(m_terms.size() - 1);
315 }
316
317 void setOnceThrough()
318 {
319 m_onceThrough = true;
320 }
321
322 bool onceThrough()
323 {
324 return m_onceThrough;
325 }
326
327 void dump(PrintStream&, YarrPattern*, unsigned);
328
329 Vector<PatternTerm> m_terms;
330 PatternDisjunction* m_parent;
331 unsigned m_minimumSize;
332 bool m_onceThrough : 1;
333 bool m_hasFixedSize : 1;
334 bool m_startsWithBOL : 1;
335 bool m_containsBOL : 1;
336};
337
338struct PatternDisjunction {
339 WTF_MAKE_FAST_ALLOCATED;
340public:
341 PatternDisjunction(PatternAlternative* parent = 0)
342 : m_parent(parent)
343 , m_hasFixedSize(false)
344 {
345 }
346
347 PatternAlternative* addNewAlternative()
348 {
349 m_alternatives.append(makeUnique<PatternAlternative>(this));
350 return static_cast<PatternAlternative*>(m_alternatives.last().get());
351 }
352
353 void dump(PrintStream&, YarrPattern*, unsigned);
354
355 Vector<std::unique_ptr<PatternAlternative>> m_alternatives;
356 PatternAlternative* m_parent;
357 unsigned m_minimumSize;
358 unsigned m_callFrameSize;
359 bool m_hasFixedSize;
360};
361
362// You probably don't want to be calling these functions directly
363// (please to be calling newlineCharacterClass() et al on your
364// friendly neighborhood YarrPattern instance to get nicely
365// cached copies).
366
367std::unique_ptr<CharacterClass> anycharCreate();
368std::unique_ptr<CharacterClass> newlineCreate();
369std::unique_ptr<CharacterClass> digitsCreate();
370std::unique_ptr<CharacterClass> spacesCreate();
371std::unique_ptr<CharacterClass> wordcharCreate();
372std::unique_ptr<CharacterClass> wordUnicodeIgnoreCaseCharCreate();
373std::unique_ptr<CharacterClass> nondigitsCreate();
374std::unique_ptr<CharacterClass> nonspacesCreate();
375std::unique_ptr<CharacterClass> nonwordcharCreate();
376std::unique_ptr<CharacterClass> nonwordUnicodeIgnoreCaseCharCreate();
377
378struct TermChain {
379 TermChain(PatternTerm term)
380 : term(term)
381 {}
382
383 PatternTerm term;
384 Vector<TermChain> hotTerms;
385};
386
387
388struct YarrPattern {
389 JS_EXPORT_PRIVATE YarrPattern(const String& pattern, OptionSet<Flags>, ErrorCode&, void* stackLimit = nullptr);
390
391 void resetForReparsing()
392 {
393 m_numSubpatterns = 0;
394 m_maxBackReference = 0;
395 m_initialStartValueFrameLocation = 0;
396
397 m_containsBackreferences = false;
398 m_containsBOL = false;
399 m_containsUnsignedLengthPattern = false;
400 m_hasCopiedParenSubexpressions = false;
401 m_saveInitialStartValue = false;
402
403 anycharCached = nullptr;
404 newlineCached = nullptr;
405 digitsCached = nullptr;
406 spacesCached = nullptr;
407 wordcharCached = nullptr;
408 wordUnicodeIgnoreCaseCharCached = nullptr;
409 nondigitsCached = nullptr;
410 nonspacesCached = nullptr;
411 nonwordcharCached = nullptr;
412 nonwordUnicodeIgnoreCasecharCached = nullptr;
413 unicodePropertiesCached.clear();
414
415 m_disjunctions.clear();
416 m_userCharacterClasses.clear();
417 m_captureGroupNames.shrink(0);
418 m_namedForwardReferences.shrink(0);
419 }
420
421 bool containsIllegalBackReference()
422 {
423 return m_maxBackReference > m_numSubpatterns;
424 }
425
426 bool containsIllegalNamedForwardReferences()
427 {
428 if (m_namedForwardReferences.isEmpty())
429 return false;
430
431 for (auto& entry : m_namedForwardReferences) {
432 if (!m_captureGroupNames.contains(entry))
433 return true;
434 }
435
436 return false;
437 }
438
439 bool containsUnsignedLengthPattern()
440 {
441 return m_containsUnsignedLengthPattern;
442 }
443
444 CharacterClass* anyCharacterClass()
445 {
446 if (!anycharCached) {
447 m_userCharacterClasses.append(anycharCreate());
448 anycharCached = m_userCharacterClasses.last().get();
449 }
450 return anycharCached;
451 }
452 CharacterClass* newlineCharacterClass()
453 {
454 if (!newlineCached) {
455 m_userCharacterClasses.append(newlineCreate());
456 newlineCached = m_userCharacterClasses.last().get();
457 }
458 return newlineCached;
459 }
460 CharacterClass* digitsCharacterClass()
461 {
462 if (!digitsCached) {
463 m_userCharacterClasses.append(digitsCreate());
464 digitsCached = m_userCharacterClasses.last().get();
465 }
466 return digitsCached;
467 }
468 CharacterClass* spacesCharacterClass()
469 {
470 if (!spacesCached) {
471 m_userCharacterClasses.append(spacesCreate());
472 spacesCached = m_userCharacterClasses.last().get();
473 }
474 return spacesCached;
475 }
476 CharacterClass* wordcharCharacterClass()
477 {
478 if (!wordcharCached) {
479 m_userCharacterClasses.append(wordcharCreate());
480 wordcharCached = m_userCharacterClasses.last().get();
481 }
482 return wordcharCached;
483 }
484 CharacterClass* wordUnicodeIgnoreCaseCharCharacterClass()
485 {
486 if (!wordUnicodeIgnoreCaseCharCached) {
487 m_userCharacterClasses.append(wordUnicodeIgnoreCaseCharCreate());
488 wordUnicodeIgnoreCaseCharCached = m_userCharacterClasses.last().get();
489 }
490 return wordUnicodeIgnoreCaseCharCached;
491 }
492 CharacterClass* nondigitsCharacterClass()
493 {
494 if (!nondigitsCached) {
495 m_userCharacterClasses.append(nondigitsCreate());
496 nondigitsCached = m_userCharacterClasses.last().get();
497 }
498 return nondigitsCached;
499 }
500 CharacterClass* nonspacesCharacterClass()
501 {
502 if (!nonspacesCached) {
503 m_userCharacterClasses.append(nonspacesCreate());
504 nonspacesCached = m_userCharacterClasses.last().get();
505 }
506 return nonspacesCached;
507 }
508 CharacterClass* nonwordcharCharacterClass()
509 {
510 if (!nonwordcharCached) {
511 m_userCharacterClasses.append(nonwordcharCreate());
512 nonwordcharCached = m_userCharacterClasses.last().get();
513 }
514 return nonwordcharCached;
515 }
516 CharacterClass* nonwordUnicodeIgnoreCaseCharCharacterClass()
517 {
518 if (!nonwordUnicodeIgnoreCasecharCached) {
519 m_userCharacterClasses.append(nonwordUnicodeIgnoreCaseCharCreate());
520 nonwordUnicodeIgnoreCasecharCached = m_userCharacterClasses.last().get();
521 }
522 return nonwordUnicodeIgnoreCasecharCached;
523 }
524 CharacterClass* unicodeCharacterClassFor(BuiltInCharacterClassID unicodeClassID)
525 {
526 ASSERT(unicodeClassID >= BuiltInCharacterClassID::BaseUnicodePropertyID);
527
528 unsigned classID = static_cast<unsigned>(unicodeClassID);
529
530 if (unicodePropertiesCached.find(classID) == unicodePropertiesCached.end()) {
531 m_userCharacterClasses.append(createUnicodeCharacterClassFor(unicodeClassID));
532 CharacterClass* result = m_userCharacterClasses.last().get();
533 unicodePropertiesCached.add(classID, result);
534 return result;
535 }
536
537 return unicodePropertiesCached.get(classID);
538 }
539
540 void dumpPatternString(PrintStream& out, const String& patternString);
541 void dumpPattern(const String& pattern);
542 void dumpPattern(PrintStream& out, const String& pattern);
543
544 bool global() const { return m_flags.contains(Flags::Global); }
545 bool ignoreCase() const { return m_flags.contains(Flags::IgnoreCase); }
546 bool multiline() const { return m_flags.contains(Flags::Multiline); }
547 bool sticky() const { return m_flags.contains(Flags::Sticky); }
548 bool unicode() const { return m_flags.contains(Flags::Unicode); }
549 bool dotAll() const { return m_flags.contains(Flags::DotAll); }
550
551 bool m_containsBackreferences : 1;
552 bool m_containsBOL : 1;
553 bool m_containsUnsignedLengthPattern : 1;
554 bool m_hasCopiedParenSubexpressions : 1;
555 bool m_saveInitialStartValue : 1;
556 OptionSet<Flags> m_flags;
557 unsigned m_numSubpatterns { 0 };
558 unsigned m_maxBackReference { 0 };
559 unsigned m_initialStartValueFrameLocation { 0 };
560 PatternDisjunction* m_body;
561 Vector<std::unique_ptr<PatternDisjunction>, 4> m_disjunctions;
562 Vector<std::unique_ptr<CharacterClass>> m_userCharacterClasses;
563 Vector<String> m_captureGroupNames;
564 Vector<String> m_namedForwardReferences;
565 HashMap<String, unsigned> m_namedGroupToParenIndex;
566
567private:
568 ErrorCode compile(const String& patternString, void* stackLimit);
569
570 CharacterClass* anycharCached { nullptr };
571 CharacterClass* newlineCached { nullptr };
572 CharacterClass* digitsCached { nullptr };
573 CharacterClass* spacesCached { nullptr };
574 CharacterClass* wordcharCached { nullptr };
575 CharacterClass* wordUnicodeIgnoreCaseCharCached { nullptr };
576 CharacterClass* nondigitsCached { nullptr };
577 CharacterClass* nonspacesCached { nullptr };
578 CharacterClass* nonwordcharCached { nullptr };
579 CharacterClass* nonwordUnicodeIgnoreCasecharCached { nullptr };
580 HashMap<unsigned, CharacterClass*> unicodePropertiesCached;
581};
582
583 void indentForNestingLevel(PrintStream&, unsigned);
584 void dumpUChar32(PrintStream&, UChar32);
585 void dumpCharacterClass(PrintStream&, YarrPattern*, CharacterClass*);
586
587 struct BackTrackInfoPatternCharacter {
588 uintptr_t begin; // Only needed for unicode patterns
589 uintptr_t matchAmount;
590
591 static unsigned beginIndex() { return offsetof(BackTrackInfoPatternCharacter, begin) / sizeof(uintptr_t); }
592 static unsigned matchAmountIndex() { return offsetof(BackTrackInfoPatternCharacter, matchAmount) / sizeof(uintptr_t); }
593 };
594
595 struct BackTrackInfoCharacterClass {
596 uintptr_t begin; // Only needed for unicode patterns
597 uintptr_t matchAmount;
598
599 static unsigned beginIndex() { return offsetof(BackTrackInfoCharacterClass, begin) / sizeof(uintptr_t); }
600 static unsigned matchAmountIndex() { return offsetof(BackTrackInfoCharacterClass, matchAmount) / sizeof(uintptr_t); }
601 };
602
603 struct BackTrackInfoBackReference {
604 uintptr_t begin; // Not really needed for greedy quantifiers.
605 uintptr_t matchAmount; // Not really needed for fixed quantifiers.
606
607 static unsigned beginIndex() { return offsetof(BackTrackInfoBackReference, begin) / sizeof(uintptr_t); }
608 static unsigned matchAmountIndex() { return offsetof(BackTrackInfoBackReference, matchAmount) / sizeof(uintptr_t); }
609 };
610
611 struct BackTrackInfoAlternative {
612 union {
613 uintptr_t offset;
614 };
615 };
616
617 struct BackTrackInfoParentheticalAssertion {
618 uintptr_t begin;
619
620 static unsigned beginIndex() { return offsetof(BackTrackInfoParentheticalAssertion, begin) / sizeof(uintptr_t); }
621 };
622
623 struct BackTrackInfoParenthesesOnce {
624 uintptr_t begin;
625 uintptr_t returnAddress;
626
627 static unsigned beginIndex() { return offsetof(BackTrackInfoParenthesesOnce, begin) / sizeof(uintptr_t); }
628 static unsigned returnAddressIndex() { return offsetof(BackTrackInfoParenthesesOnce, returnAddress) / sizeof(uintptr_t); }
629 };
630
631 struct BackTrackInfoParenthesesTerminal {
632 uintptr_t begin;
633
634 static unsigned beginIndex() { return offsetof(BackTrackInfoParenthesesTerminal, begin) / sizeof(uintptr_t); }
635 };
636
637 struct BackTrackInfoParentheses {
638 uintptr_t begin;
639 uintptr_t returnAddress;
640 uintptr_t matchAmount;
641 uintptr_t parenContextHead;
642
643 static unsigned beginIndex() { return offsetof(BackTrackInfoParentheses, begin) / sizeof(uintptr_t); }
644 static unsigned returnAddressIndex() { return offsetof(BackTrackInfoParentheses, returnAddress) / sizeof(uintptr_t); }
645 static unsigned matchAmountIndex() { return offsetof(BackTrackInfoParentheses, matchAmount) / sizeof(uintptr_t); }
646 static unsigned parenContextHeadIndex() { return offsetof(BackTrackInfoParentheses, parenContextHead) / sizeof(uintptr_t); }
647 };
648
649} } // namespace JSC::Yarr
650