1/*
2 * Copyright (C) 2009, 2010-2012, 2014, 2016 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#pragma once
27
28#include "ConcurrentJSLock.h"
29#include "YarrErrorCode.h"
30#include "YarrFlags.h"
31#include "YarrPattern.h"
32
33namespace WTF {
34class BumpPointerAllocator;
35}
36using WTF::BumpPointerAllocator;
37
38namespace JSC { namespace Yarr {
39
40class ByteDisjunction;
41
42struct ByteTerm {
43 union {
44 struct {
45 union {
46 UChar32 patternCharacter;
47 struct {
48 UChar32 lo;
49 UChar32 hi;
50 } casedCharacter;
51 CharacterClass* characterClass;
52 unsigned subpatternId;
53 };
54 union {
55 ByteDisjunction* parenthesesDisjunction;
56 unsigned parenthesesWidth;
57 };
58 QuantifierType quantityType;
59 unsigned quantityMinCount;
60 unsigned quantityMaxCount;
61 } atom;
62 struct {
63 int next;
64 int end;
65 bool onceThrough;
66 } alternative;
67 struct {
68 bool m_bol : 1;
69 bool m_eol : 1;
70 } anchors;
71 unsigned checkInputCount;
72 };
73 unsigned frameLocation;
74 enum Type : uint8_t {
75 TypeBodyAlternativeBegin,
76 TypeBodyAlternativeDisjunction,
77 TypeBodyAlternativeEnd,
78 TypeAlternativeBegin,
79 TypeAlternativeDisjunction,
80 TypeAlternativeEnd,
81 TypeSubpatternBegin,
82 TypeSubpatternEnd,
83 TypeAssertionBOL,
84 TypeAssertionEOL,
85 TypeAssertionWordBoundary,
86 TypePatternCharacterOnce,
87 TypePatternCharacterFixed,
88 TypePatternCharacterGreedy,
89 TypePatternCharacterNonGreedy,
90 TypePatternCasedCharacterOnce,
91 TypePatternCasedCharacterFixed,
92 TypePatternCasedCharacterGreedy,
93 TypePatternCasedCharacterNonGreedy,
94 TypeCharacterClass,
95 TypeBackReference,
96 TypeParenthesesSubpattern,
97 TypeParenthesesSubpatternOnceBegin,
98 TypeParenthesesSubpatternOnceEnd,
99 TypeParenthesesSubpatternTerminalBegin,
100 TypeParenthesesSubpatternTerminalEnd,
101 TypeParentheticalAssertionBegin,
102 TypeParentheticalAssertionEnd,
103 TypeCheckInput,
104 TypeUncheckInput,
105 TypeDotStarEnclosure,
106 } type;
107 bool m_capture : 1;
108 bool m_invert : 1;
109 unsigned inputPosition;
110
111 ByteTerm(UChar32 ch, unsigned inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
112 : frameLocation(frameLocation)
113 , m_capture(false)
114 , m_invert(false)
115 {
116 atom.patternCharacter = ch;
117 atom.quantityType = quantityType;
118 atom.quantityMinCount = quantityCount.unsafeGet();
119 atom.quantityMaxCount = quantityCount.unsafeGet();
120 inputPosition = inputPos;
121
122 switch (quantityType) {
123 case QuantifierFixedCount:
124 type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed;
125 break;
126 case QuantifierGreedy:
127 type = ByteTerm::TypePatternCharacterGreedy;
128 break;
129 case QuantifierNonGreedy:
130 type = ByteTerm::TypePatternCharacterNonGreedy;
131 break;
132 }
133 }
134
135 ByteTerm(UChar32 lo, UChar32 hi, unsigned inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
136 : frameLocation(frameLocation)
137 , m_capture(false)
138 , m_invert(false)
139 {
140 switch (quantityType) {
141 case QuantifierFixedCount:
142 type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed;
143 break;
144 case QuantifierGreedy:
145 type = ByteTerm::TypePatternCasedCharacterGreedy;
146 break;
147 case QuantifierNonGreedy:
148 type = ByteTerm::TypePatternCasedCharacterNonGreedy;
149 break;
150 }
151
152 atom.casedCharacter.lo = lo;
153 atom.casedCharacter.hi = hi;
154 atom.quantityType = quantityType;
155 atom.quantityMinCount = quantityCount.unsafeGet();
156 atom.quantityMaxCount = quantityCount.unsafeGet();
157 inputPosition = inputPos;
158 }
159
160 ByteTerm(CharacterClass* characterClass, bool invert, unsigned inputPos)
161 : type(ByteTerm::TypeCharacterClass)
162 , m_capture(false)
163 , m_invert(invert)
164 {
165 atom.characterClass = characterClass;
166 atom.quantityType = QuantifierFixedCount;
167 atom.quantityMinCount = 1;
168 atom.quantityMaxCount = 1;
169 inputPosition = inputPos;
170 }
171
172 ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, unsigned inputPos)
173 : type(type)
174 , m_capture(capture)
175 , m_invert(false)
176 {
177 atom.subpatternId = subpatternId;
178 atom.parenthesesDisjunction = parenthesesInfo;
179 atom.quantityType = QuantifierFixedCount;
180 atom.quantityMinCount = 1;
181 atom.quantityMaxCount = 1;
182 inputPosition = inputPos;
183 }
184
185 ByteTerm(Type type, bool invert = false)
186 : type(type)
187 , m_capture(false)
188 , m_invert(invert)
189 {
190 atom.quantityType = QuantifierFixedCount;
191 atom.quantityMinCount = 1;
192 atom.quantityMaxCount = 1;
193 }
194
195 ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, unsigned inputPos)
196 : type(type)
197 , m_capture(capture)
198 , m_invert(invert)
199 {
200 atom.subpatternId = subpatternId;
201 atom.quantityType = QuantifierFixedCount;
202 atom.quantityMinCount = 1;
203 atom.quantityMaxCount = 1;
204 inputPosition = inputPos;
205 }
206
207 static ByteTerm BOL(unsigned inputPos)
208 {
209 ByteTerm term(TypeAssertionBOL);
210 term.inputPosition = inputPos;
211 return term;
212 }
213
214 static ByteTerm CheckInput(Checked<unsigned> count)
215 {
216 ByteTerm term(TypeCheckInput);
217 term.checkInputCount = count.unsafeGet();
218 return term;
219 }
220
221 static ByteTerm UncheckInput(Checked<unsigned> count)
222 {
223 ByteTerm term(TypeUncheckInput);
224 term.checkInputCount = count.unsafeGet();
225 return term;
226 }
227
228 static ByteTerm EOL(unsigned inputPos)
229 {
230 ByteTerm term(TypeAssertionEOL);
231 term.inputPosition = inputPos;
232 return term;
233 }
234
235 static ByteTerm WordBoundary(bool invert, unsigned inputPos)
236 {
237 ByteTerm term(TypeAssertionWordBoundary, invert);
238 term.inputPosition = inputPos;
239 return term;
240 }
241
242 static ByteTerm BackReference(unsigned subpatternId, unsigned inputPos)
243 {
244 return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos);
245 }
246
247 static ByteTerm BodyAlternativeBegin(bool onceThrough)
248 {
249 ByteTerm term(TypeBodyAlternativeBegin);
250 term.alternative.next = 0;
251 term.alternative.end = 0;
252 term.alternative.onceThrough = onceThrough;
253 return term;
254 }
255
256 static ByteTerm BodyAlternativeDisjunction(bool onceThrough)
257 {
258 ByteTerm term(TypeBodyAlternativeDisjunction);
259 term.alternative.next = 0;
260 term.alternative.end = 0;
261 term.alternative.onceThrough = onceThrough;
262 return term;
263 }
264
265 static ByteTerm BodyAlternativeEnd()
266 {
267 ByteTerm term(TypeBodyAlternativeEnd);
268 term.alternative.next = 0;
269 term.alternative.end = 0;
270 term.alternative.onceThrough = false;
271 return term;
272 }
273
274 static ByteTerm AlternativeBegin()
275 {
276 ByteTerm term(TypeAlternativeBegin);
277 term.alternative.next = 0;
278 term.alternative.end = 0;
279 term.alternative.onceThrough = false;
280 return term;
281 }
282
283 static ByteTerm AlternativeDisjunction()
284 {
285 ByteTerm term(TypeAlternativeDisjunction);
286 term.alternative.next = 0;
287 term.alternative.end = 0;
288 term.alternative.onceThrough = false;
289 return term;
290 }
291
292 static ByteTerm AlternativeEnd()
293 {
294 ByteTerm term(TypeAlternativeEnd);
295 term.alternative.next = 0;
296 term.alternative.end = 0;
297 term.alternative.onceThrough = false;
298 return term;
299 }
300
301 static ByteTerm SubpatternBegin()
302 {
303 return ByteTerm(TypeSubpatternBegin);
304 }
305
306 static ByteTerm SubpatternEnd()
307 {
308 return ByteTerm(TypeSubpatternEnd);
309 }
310
311 static ByteTerm DotStarEnclosure(bool bolAnchor, bool eolAnchor)
312 {
313 ByteTerm term(TypeDotStarEnclosure);
314 term.anchors.m_bol = bolAnchor;
315 term.anchors.m_eol = eolAnchor;
316 return term;
317 }
318
319 bool invert()
320 {
321 return m_invert;
322 }
323
324 bool capture()
325 {
326 return m_capture;
327 }
328};
329
330class ByteDisjunction {
331 WTF_MAKE_FAST_ALLOCATED;
332public:
333 ByteDisjunction(unsigned numSubpatterns, unsigned frameSize)
334 : m_numSubpatterns(numSubpatterns)
335 , m_frameSize(frameSize)
336 {
337 }
338
339 size_t estimatedSizeInBytes() const { return terms.capacity() * sizeof(ByteTerm); }
340
341 Vector<ByteTerm> terms;
342 unsigned m_numSubpatterns;
343 unsigned m_frameSize;
344};
345
346struct BytecodePattern {
347 WTF_MAKE_FAST_ALLOCATED;
348public:
349 BytecodePattern(std::unique_ptr<ByteDisjunction> body, Vector<std::unique_ptr<ByteDisjunction>>& parenthesesInfoToAdopt, YarrPattern& pattern, BumpPointerAllocator* allocator, ConcurrentJSLock* lock)
350 : m_body(WTFMove(body))
351 , m_flags(pattern.m_flags)
352 , m_allocator(allocator)
353 , m_lock(lock)
354 {
355 m_body->terms.shrinkToFit();
356
357 newlineCharacterClass = pattern.newlineCharacterClass();
358 if (unicode() && ignoreCase())
359 wordcharCharacterClass = pattern.wordUnicodeIgnoreCaseCharCharacterClass();
360 else
361 wordcharCharacterClass = pattern.wordcharCharacterClass();
362
363 m_allParenthesesInfo.swap(parenthesesInfoToAdopt);
364 m_allParenthesesInfo.shrinkToFit();
365
366 m_userCharacterClasses.swap(pattern.m_userCharacterClasses);
367 m_userCharacterClasses.shrinkToFit();
368 }
369
370 size_t estimatedSizeInBytes() const { return m_body->estimatedSizeInBytes(); }
371
372 bool ignoreCase() const { return m_flags.contains(Flags::IgnoreCase); }
373 bool multiline() const { return m_flags.contains(Flags::Multiline); }
374 bool sticky() const { return m_flags.contains(Flags::Sticky); }
375 bool unicode() const { return m_flags.contains(Flags::Unicode); }
376 bool dotAll() const { return m_flags.contains(Flags::DotAll); }
377
378 std::unique_ptr<ByteDisjunction> m_body;
379 OptionSet<Flags> m_flags;
380 // Each BytecodePattern is associated with a RegExp, each RegExp is associated
381 // with a VM. Cache a pointer to our VM's m_regExpAllocator.
382 BumpPointerAllocator* m_allocator;
383 ConcurrentJSLock* m_lock;
384
385 CharacterClass* newlineCharacterClass;
386 CharacterClass* wordcharCharacterClass;
387
388private:
389 Vector<std::unique_ptr<ByteDisjunction>> m_allParenthesesInfo;
390 Vector<std::unique_ptr<CharacterClass>> m_userCharacterClasses;
391};
392
393JS_EXPORT_PRIVATE std::unique_ptr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*, ErrorCode&, ConcurrentJSLock* = nullptr);
394JS_EXPORT_PRIVATE unsigned interpret(BytecodePattern*, const String& input, unsigned start, unsigned* output);
395unsigned interpret(BytecodePattern*, const LChar* input, unsigned length, unsigned start, unsigned* output);
396unsigned interpret(BytecodePattern*, const UChar* input, unsigned length, unsigned start, unsigned* output);
397
398} } // namespace JSC::Yarr
399