1/*
2 * Copyright (C) 2009, 2013-2017 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga ([email protected]), University of Szeged
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#pragma once
28
29#include "YarrErrorCode.h"
30#include "YarrFlags.h"
31#include "YarrUnicodeProperties.h"
32#include <wtf/CheckedArithmetic.h>
33#include <wtf/HashMap.h>
34#include <wtf/OptionSet.h>
35#include <wtf/PrintStream.h>
36#include <wtf/Vector.h>
37#include <wtf/text/StringHash.h>
38
39namespace JSC { namespace Yarr {
40
41struct YarrPattern;
42struct PatternDisjunction;
43
44struct CharacterRange {
45 UChar32 begin { 0 };
46 UChar32 end { 0x10ffff };
47
48 CharacterRange(UChar32 begin, UChar32 end)
49 : begin(begin)
50 , end(end)
51 {
52 }
53};
54
55enum struct CharacterClassWidths : unsigned char {
56 Unknown = 0x0,
57 HasBMPChars = 0x1,
58 HasNonBMPChars = 0x2,
59 HasBothBMPAndNonBMP = HasBMPChars | HasNonBMPChars
60};
61
62inline CharacterClassWidths operator|(CharacterClassWidths lhs, CharacterClassWidths rhs)
63{
64 return static_cast<CharacterClassWidths>(static_cast<unsigned>(lhs) | static_cast<unsigned>(rhs));
65}
66
67inline bool operator&(CharacterClassWidths lhs, CharacterClassWidths rhs)
68{
69 return static_cast<unsigned>(lhs) & static_cast<unsigned>(rhs);
70}
71
72inline CharacterClassWidths& operator|=(CharacterClassWidths& lhs, CharacterClassWidths rhs)
73{
74 lhs = lhs | rhs;
75 return lhs;
76}
77
78struct CharacterClass {
79 WTF_MAKE_FAST_ALLOCATED;
80public:
81 // All CharacterClass instances have to have the full set of matches and ranges,
82 // they may have an optional m_table for faster lookups (which must match the
83 // specified matches and ranges)
84 CharacterClass()
85 : m_table(0)
86 , m_characterWidths(CharacterClassWidths::Unknown)
87 , m_anyCharacter(false)
88 {
89 }
90 CharacterClass(const char* table, bool inverted)
91 : m_table(table)
92 , m_characterWidths(CharacterClassWidths::Unknown)
93 , m_tableInverted(inverted)
94 , m_anyCharacter(false)
95 {
96 }
97 CharacterClass(std::initializer_list<UChar32> matches, std::initializer_list<CharacterRange> ranges, std::initializer_list<UChar32> matchesUnicode, std::initializer_list<CharacterRange> rangesUnicode, CharacterClassWidths widths)
98 : m_matches(matches)
99 , m_ranges(ranges)
100 , m_matchesUnicode(matchesUnicode)
101 , m_rangesUnicode(rangesUnicode)
102 , m_table(0)
103 , m_characterWidths(widths)
104 , m_tableInverted(false)
105 , m_anyCharacter(false)
106 {
107 }
108
109 bool hasNonBMPCharacters() { return m_characterWidths & CharacterClassWidths::HasNonBMPChars; }
110
111 bool hasOneCharacterSize() { return m_characterWidths == CharacterClassWidths::HasBMPChars || m_characterWidths == CharacterClassWidths::HasNonBMPChars; }
112 bool hasOnlyNonBMPCharacters() { return m_characterWidths == CharacterClassWidths::HasNonBMPChars; }
113
114 Vector<UChar32> m_matches;
115 Vector<CharacterRange> m_ranges;
116 Vector<UChar32> m_matchesUnicode;
117 Vector<CharacterRange> m_rangesUnicode;
118
119 const char* m_table;
120 CharacterClassWidths m_characterWidths;
121 bool m_tableInverted : 1;
122 bool m_anyCharacter : 1;
123};
124
125enum QuantifierType : uint8_t {
126 QuantifierFixedCount,
127 QuantifierGreedy,
128 QuantifierNonGreedy,
129};
130
131struct PatternTerm {
132 enum Type : uint8_t {
133 TypeAssertionBOL,
134 TypeAssertionEOL,
135 TypeAssertionWordBoundary,
136 TypePatternCharacter,
137 TypeCharacterClass,
138 TypeBackReference,
139 TypeForwardReference,
140 TypeParenthesesSubpattern,
141 TypeParentheticalAssertion,
142 TypeDotStarEnclosure,
143 } type;
144 bool m_capture :1;
145 bool m_invert :1;
146 QuantifierType quantityType;
147 Checked<unsigned> quantityMinCount;
148 Checked<unsigned> quantityMaxCount;
149 union {
150 UChar32 patternCharacter;
151 CharacterClass* characterClass;
152 unsigned backReferenceSubpatternId;
153 struct {
154 PatternDisjunction* disjunction;
155 unsigned subpatternId;
156 unsigned lastSubpatternId;
157 bool isCopy;
158 bool isTerminal;
159 } parentheses;
160 struct {
161 bool bolAnchor : 1;
162 bool eolAnchor : 1;
163 } anchors;
164 };
165 unsigned inputPosition;
166 unsigned frameLocation;
167
168 PatternTerm(UChar32 ch)
169 : type(PatternTerm::TypePatternCharacter)
170 , m_capture(false)
171 , m_invert(false)
172 {
173 patternCharacter = ch;
174 quantityType = QuantifierFixedCount;
175 quantityMinCount = quantityMaxCount = 1;
176 }
177
178 PatternTerm(CharacterClass* charClass, bool invert)
179 : type(PatternTerm::TypeCharacterClass)
180 , m_capture(false)
181 , m_invert(invert)
182 {
183 characterClass = charClass;
184 quantityType = QuantifierFixedCount;
185 quantityMinCount = quantityMaxCount = 1;
186 }
187
188 PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false)
189 : type(type)
190 , m_capture(capture)
191 , m_invert(invert)
192 {
193 parentheses.disjunction = disjunction;
194 parentheses.subpatternId = subpatternId;
195 parentheses.isCopy = false;
196 parentheses.isTerminal = false;
197 quantityType = QuantifierFixedCount;
198 quantityMinCount = quantityMaxCount = 1;
199 }
200
201 PatternTerm(Type type, bool invert = false)
202 : type(type)
203 , m_capture(false)
204 , m_invert(invert)
205 {
206 quantityType = QuantifierFixedCount;
207 quantityMinCount = quantityMaxCount = 1;
208 }
209
210 PatternTerm(unsigned spatternId)
211 : type(TypeBackReference)
212 , m_capture(false)
213 , m_invert(false)
214 {
215 backReferenceSubpatternId = spatternId;
216 quantityType = QuantifierFixedCount;
217 quantityMinCount = quantityMaxCount = 1;
218 }
219
220 PatternTerm(bool bolAnchor, bool eolAnchor)
221 : type(TypeDotStarEnclosure)
222 , m_capture(false)
223 , m_invert(false)
224 {
225 anchors.bolAnchor = bolAnchor;
226 anchors.eolAnchor = eolAnchor;
227 quantityType = QuantifierFixedCount;
228 quantityMinCount = quantityMaxCount = 1;
229 }
230
231 static PatternTerm ForwardReference()
232 {
233 return PatternTerm(TypeForwardReference);
234 }
235
236 static PatternTerm BOL()
237 {
238 return PatternTerm(TypeAssertionBOL);
239 }
240
241 static PatternTerm EOL()
242 {
243 return PatternTerm(TypeAssertionEOL);
244 }
245
246 static PatternTerm WordBoundary(bool invert)
247 {
248 return PatternTerm(TypeAssertionWordBoundary, invert);
249 }
250
251 bool invert() const
252 {
253 return m_invert;
254 }
255
256 bool capture()
257 {
258 return m_capture;
259 }
260
261 bool containsAnyCaptures()
262 {
263 ASSERT(this->type == TypeParenthesesSubpattern);
264 return parentheses.lastSubpatternId >= parentheses.subpatternId;
265 }
266
267 void quantify(unsigned count, QuantifierType type)
268 {
269 quantityMinCount = 0;
270 quantityMaxCount = count;
271 quantityType = type;
272 }
273
274 void quantify(unsigned minCount, unsigned maxCount, QuantifierType type)
275 {
276 // Currently only Parentheses can specify a non-zero min with a different max.
277 ASSERT(this->type == TypeParenthesesSubpattern || !minCount || minCount == maxCount);
278 ASSERT(minCount <= maxCount);
279 quantityMinCount = minCount;
280 quantityMaxCount = maxCount;
281 quantityType = type;
282 }
283
284 void dumpQuantifier(PrintStream&);
285 void dump(PrintStream&, YarrPattern*, unsigned);
286};
287
288struct PatternAlternative {
289 WTF_MAKE_FAST_ALLOCATED;
290public:
291 PatternAlternative(PatternDisjunction* disjunction)
292 : m_parent(disjunction)
293 , m_onceThrough(false)
294 , m_hasFixedSize(false)
295 , m_startsWithBOL(false)
296 , m_containsBOL(false)
297 {
298 }
299
300 PatternTerm& lastTerm()
301 {
302 ASSERT(m_terms.size());
303 return m_terms[m_terms.size() - 1];
304 }
305
306 void removeLastTerm()
307 {
308 ASSERT(m_terms.size());
309 m_terms.shrink(m_terms.size() - 1);
310 }
311
312 void setOnceThrough()
313 {
314 m_onceThrough = true;
315 }
316
317 bool onceThrough()
318 {
319 return m_onceThrough;
320 }
321
322 void dump(PrintStream&, YarrPattern*, unsigned);
323
324 Vector<PatternTerm> m_terms;
325 PatternDisjunction* m_parent;
326 unsigned m_minimumSize;
327 bool m_onceThrough : 1;
328 bool m_hasFixedSize : 1;
329 bool m_startsWithBOL : 1;
330 bool m_containsBOL : 1;
331};
332
333struct PatternDisjunction {
334 WTF_MAKE_FAST_ALLOCATED;
335public:
336 PatternDisjunction(PatternAlternative* parent = 0)
337 : m_parent(parent)
338 , m_hasFixedSize(false)
339 {
340 }
341
342 PatternAlternative* addNewAlternative()
343 {
344 m_alternatives.append(std::make_unique<PatternAlternative>(this));
345 return static_cast<PatternAlternative*>(m_alternatives.last().get());
346 }
347
348 void dump(PrintStream&, YarrPattern*, unsigned);
349
350 Vector<std::unique_ptr<PatternAlternative>> m_alternatives;
351 PatternAlternative* m_parent;
352 unsigned m_minimumSize;
353 unsigned m_callFrameSize;
354 bool m_hasFixedSize;
355};
356
357// You probably don't want to be calling these functions directly
358// (please to be calling newlineCharacterClass() et al on your
359// friendly neighborhood YarrPattern instance to get nicely
360// cached copies).
361
362std::unique_ptr<CharacterClass> anycharCreate();
363std::unique_ptr<CharacterClass> newlineCreate();
364std::unique_ptr<CharacterClass> digitsCreate();
365std::unique_ptr<CharacterClass> spacesCreate();
366std::unique_ptr<CharacterClass> wordcharCreate();
367std::unique_ptr<CharacterClass> wordUnicodeIgnoreCaseCharCreate();
368std::unique_ptr<CharacterClass> nondigitsCreate();
369std::unique_ptr<CharacterClass> nonspacesCreate();
370std::unique_ptr<CharacterClass> nonwordcharCreate();
371std::unique_ptr<CharacterClass> nonwordUnicodeIgnoreCaseCharCreate();
372
373struct TermChain {
374 TermChain(PatternTerm term)
375 : term(term)
376 {}
377
378 PatternTerm term;
379 Vector<TermChain> hotTerms;
380};
381
382
383struct YarrPattern {
384 JS_EXPORT_PRIVATE YarrPattern(const String& pattern, OptionSet<Flags>, ErrorCode&, void* stackLimit = nullptr);
385
386 void resetForReparsing()
387 {
388 m_numSubpatterns = 0;
389 m_maxBackReference = 0;
390 m_initialStartValueFrameLocation = 0;
391
392 m_containsBackreferences = false;
393 m_containsBOL = false;
394 m_containsUnsignedLengthPattern = false;
395 m_hasCopiedParenSubexpressions = false;
396 m_saveInitialStartValue = false;
397
398 anycharCached = nullptr;
399 newlineCached = nullptr;
400 digitsCached = nullptr;
401 spacesCached = nullptr;
402 wordcharCached = nullptr;
403 wordUnicodeIgnoreCaseCharCached = nullptr;
404 nondigitsCached = nullptr;
405 nonspacesCached = nullptr;
406 nonwordcharCached = nullptr;
407 nonwordUnicodeIgnoreCasecharCached = nullptr;
408 unicodePropertiesCached.clear();
409
410 m_disjunctions.clear();
411 m_userCharacterClasses.clear();
412 m_captureGroupNames.shrink(0);
413 m_namedForwardReferences.shrink(0);
414 }
415
416 bool containsIllegalBackReference()
417 {
418 return m_maxBackReference > m_numSubpatterns;
419 }
420
421 bool containsIllegalNamedForwardReferences()
422 {
423 if (m_namedForwardReferences.isEmpty())
424 return false;
425
426 for (auto& entry : m_namedForwardReferences) {
427 if (!m_captureGroupNames.contains(entry))
428 return true;
429 }
430
431 return false;
432 }
433
434 bool containsUnsignedLengthPattern()
435 {
436 return m_containsUnsignedLengthPattern;
437 }
438
439 CharacterClass* anyCharacterClass()
440 {
441 if (!anycharCached) {
442 m_userCharacterClasses.append(anycharCreate());
443 anycharCached = m_userCharacterClasses.last().get();
444 }
445 return anycharCached;
446 }
447 CharacterClass* newlineCharacterClass()
448 {
449 if (!newlineCached) {
450 m_userCharacterClasses.append(newlineCreate());
451 newlineCached = m_userCharacterClasses.last().get();
452 }
453 return newlineCached;
454 }
455 CharacterClass* digitsCharacterClass()
456 {
457 if (!digitsCached) {
458 m_userCharacterClasses.append(digitsCreate());
459 digitsCached = m_userCharacterClasses.last().get();
460 }
461 return digitsCached;
462 }
463 CharacterClass* spacesCharacterClass()
464 {
465 if (!spacesCached) {
466 m_userCharacterClasses.append(spacesCreate());
467 spacesCached = m_userCharacterClasses.last().get();
468 }
469 return spacesCached;
470 }
471 CharacterClass* wordcharCharacterClass()
472 {
473 if (!wordcharCached) {
474 m_userCharacterClasses.append(wordcharCreate());
475 wordcharCached = m_userCharacterClasses.last().get();
476 }
477 return wordcharCached;
478 }
479 CharacterClass* wordUnicodeIgnoreCaseCharCharacterClass()
480 {
481 if (!wordUnicodeIgnoreCaseCharCached) {
482 m_userCharacterClasses.append(wordUnicodeIgnoreCaseCharCreate());
483 wordUnicodeIgnoreCaseCharCached = m_userCharacterClasses.last().get();
484 }
485 return wordUnicodeIgnoreCaseCharCached;
486 }
487 CharacterClass* nondigitsCharacterClass()
488 {
489 if (!nondigitsCached) {
490 m_userCharacterClasses.append(nondigitsCreate());
491 nondigitsCached = m_userCharacterClasses.last().get();
492 }
493 return nondigitsCached;
494 }
495 CharacterClass* nonspacesCharacterClass()
496 {
497 if (!nonspacesCached) {
498 m_userCharacterClasses.append(nonspacesCreate());
499 nonspacesCached = m_userCharacterClasses.last().get();
500 }
501 return nonspacesCached;
502 }
503 CharacterClass* nonwordcharCharacterClass()
504 {
505 if (!nonwordcharCached) {
506 m_userCharacterClasses.append(nonwordcharCreate());
507 nonwordcharCached = m_userCharacterClasses.last().get();
508 }
509 return nonwordcharCached;
510 }
511 CharacterClass* nonwordUnicodeIgnoreCaseCharCharacterClass()
512 {
513 if (!nonwordUnicodeIgnoreCasecharCached) {
514 m_userCharacterClasses.append(nonwordUnicodeIgnoreCaseCharCreate());
515 nonwordUnicodeIgnoreCasecharCached = m_userCharacterClasses.last().get();
516 }
517 return nonwordUnicodeIgnoreCasecharCached;
518 }
519 CharacterClass* unicodeCharacterClassFor(BuiltInCharacterClassID unicodeClassID)
520 {
521 ASSERT(unicodeClassID >= BuiltInCharacterClassID::BaseUnicodePropertyID);
522
523 unsigned classID = static_cast<unsigned>(unicodeClassID);
524
525 if (unicodePropertiesCached.find(classID) == unicodePropertiesCached.end()) {
526 m_userCharacterClasses.append(createUnicodeCharacterClassFor(unicodeClassID));
527 CharacterClass* result = m_userCharacterClasses.last().get();
528 unicodePropertiesCached.add(classID, result);
529 return result;
530 }
531
532 return unicodePropertiesCached.get(classID);
533 }
534
535 void dumpPatternString(PrintStream& out, const String& patternString);
536 void dumpPattern(const String& pattern);
537 void dumpPattern(PrintStream& out, const String& pattern);
538
539 bool global() const { return m_flags.contains(Flags::Global); }
540 bool ignoreCase() const { return m_flags.contains(Flags::IgnoreCase); }
541 bool multiline() const { return m_flags.contains(Flags::Multiline); }
542 bool sticky() const { return m_flags.contains(Flags::Sticky); }
543 bool unicode() const { return m_flags.contains(Flags::Unicode); }
544 bool dotAll() const { return m_flags.contains(Flags::DotAll); }
545
546 bool m_containsBackreferences : 1;
547 bool m_containsBOL : 1;
548 bool m_containsUnsignedLengthPattern : 1;
549 bool m_hasCopiedParenSubexpressions : 1;
550 bool m_saveInitialStartValue : 1;
551 OptionSet<Flags> m_flags;
552 unsigned m_numSubpatterns { 0 };
553 unsigned m_maxBackReference { 0 };
554 unsigned m_initialStartValueFrameLocation { 0 };
555 PatternDisjunction* m_body;
556 Vector<std::unique_ptr<PatternDisjunction>, 4> m_disjunctions;
557 Vector<std::unique_ptr<CharacterClass>> m_userCharacterClasses;
558 Vector<String> m_captureGroupNames;
559 Vector<String> m_namedForwardReferences;
560 HashMap<String, unsigned> m_namedGroupToParenIndex;
561
562private:
563 ErrorCode compile(const String& patternString, void* stackLimit);
564
565 CharacterClass* anycharCached { nullptr };
566 CharacterClass* newlineCached { nullptr };
567 CharacterClass* digitsCached { nullptr };
568 CharacterClass* spacesCached { nullptr };
569 CharacterClass* wordcharCached { nullptr };
570 CharacterClass* wordUnicodeIgnoreCaseCharCached { nullptr };
571 CharacterClass* nondigitsCached { nullptr };
572 CharacterClass* nonspacesCached { nullptr };
573 CharacterClass* nonwordcharCached { nullptr };
574 CharacterClass* nonwordUnicodeIgnoreCasecharCached { nullptr };
575 HashMap<unsigned, CharacterClass*> unicodePropertiesCached;
576};
577
578 void indentForNestingLevel(PrintStream&, unsigned);
579 void dumpUChar32(PrintStream&, UChar32);
580 void dumpCharacterClass(PrintStream&, YarrPattern*, CharacterClass*);
581
582 struct BackTrackInfoPatternCharacter {
583 uintptr_t begin; // Only needed for unicode patterns
584 uintptr_t matchAmount;
585
586 static unsigned beginIndex() { return offsetof(BackTrackInfoPatternCharacter, begin) / sizeof(uintptr_t); }
587 static unsigned matchAmountIndex() { return offsetof(BackTrackInfoPatternCharacter, matchAmount) / sizeof(uintptr_t); }
588 };
589
590 struct BackTrackInfoCharacterClass {
591 uintptr_t begin; // Only needed for unicode patterns
592 uintptr_t matchAmount;
593
594 static unsigned beginIndex() { return offsetof(BackTrackInfoCharacterClass, begin) / sizeof(uintptr_t); }
595 static unsigned matchAmountIndex() { return offsetof(BackTrackInfoCharacterClass, matchAmount) / sizeof(uintptr_t); }
596 };
597
598 struct BackTrackInfoBackReference {
599 uintptr_t begin; // Not really needed for greedy quantifiers.
600 uintptr_t matchAmount; // Not really needed for fixed quantifiers.
601
602 static unsigned beginIndex() { return offsetof(BackTrackInfoBackReference, begin) / sizeof(uintptr_t); }
603 static unsigned matchAmountIndex() { return offsetof(BackTrackInfoBackReference, matchAmount) / sizeof(uintptr_t); }
604 };
605
606 struct BackTrackInfoAlternative {
607 union {
608 uintptr_t offset;
609 };
610 };
611
612 struct BackTrackInfoParentheticalAssertion {
613 uintptr_t begin;
614
615 static unsigned beginIndex() { return offsetof(BackTrackInfoParentheticalAssertion, begin) / sizeof(uintptr_t); }
616 };
617
618 struct BackTrackInfoParenthesesOnce {
619 uintptr_t begin;
620 uintptr_t returnAddress;
621
622 static unsigned beginIndex() { return offsetof(BackTrackInfoParenthesesOnce, begin) / sizeof(uintptr_t); }
623 static unsigned returnAddressIndex() { return offsetof(BackTrackInfoParenthesesOnce, returnAddress) / sizeof(uintptr_t); }
624 };
625
626 struct BackTrackInfoParenthesesTerminal {
627 uintptr_t begin;
628
629 static unsigned beginIndex() { return offsetof(BackTrackInfoParenthesesTerminal, begin) / sizeof(uintptr_t); }
630 };
631
632 struct BackTrackInfoParentheses {
633 uintptr_t begin;
634 uintptr_t returnAddress;
635 uintptr_t matchAmount;
636 uintptr_t parenContextHead;
637
638 static unsigned beginIndex() { return offsetof(BackTrackInfoParentheses, begin) / sizeof(uintptr_t); }
639 static unsigned returnAddressIndex() { return offsetof(BackTrackInfoParentheses, returnAddress) / sizeof(uintptr_t); }
640 static unsigned matchAmountIndex() { return offsetof(BackTrackInfoParentheses, matchAmount) / sizeof(uintptr_t); }
641 static unsigned parenContextHeadIndex() { return offsetof(BackTrackInfoParentheses, parenContextHead) / sizeof(uintptr_t); }
642 };
643
644} } // namespace JSC::Yarr
645