1 | /* |
2 | * Copyright (C) 2009, 2010-2012, 2014, 2016 Apple Inc. All rights reserved. |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions |
6 | * are met: |
7 | * 1. Redistributions of source code must retain the above copyright |
8 | * notice, this list of conditions and the following disclaimer. |
9 | * 2. Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
24 | */ |
25 | |
26 | #pragma once |
27 | |
28 | #include "ConcurrentJSLock.h" |
29 | #include "YarrErrorCode.h" |
30 | #include "YarrFlags.h" |
31 | #include "YarrPattern.h" |
32 | |
33 | namespace WTF { |
34 | class BumpPointerAllocator; |
35 | } |
36 | using WTF::BumpPointerAllocator; |
37 | |
38 | namespace JSC { namespace Yarr { |
39 | |
40 | class ByteDisjunction; |
41 | |
42 | struct ByteTerm { |
43 | union { |
44 | struct { |
45 | union { |
46 | UChar32 patternCharacter; |
47 | struct { |
48 | UChar32 lo; |
49 | UChar32 hi; |
50 | } casedCharacter; |
51 | CharacterClass* characterClass; |
52 | unsigned subpatternId; |
53 | }; |
54 | union { |
55 | ByteDisjunction* parenthesesDisjunction; |
56 | unsigned parenthesesWidth; |
57 | }; |
58 | QuantifierType quantityType; |
59 | unsigned quantityMinCount; |
60 | unsigned quantityMaxCount; |
61 | } atom; |
62 | struct { |
63 | int next; |
64 | int end; |
65 | bool onceThrough; |
66 | } alternative; |
67 | struct { |
68 | bool m_bol : 1; |
69 | bool m_eol : 1; |
70 | } anchors; |
71 | unsigned checkInputCount; |
72 | }; |
73 | unsigned frameLocation; |
74 | enum Type : uint8_t { |
75 | TypeBodyAlternativeBegin, |
76 | TypeBodyAlternativeDisjunction, |
77 | TypeBodyAlternativeEnd, |
78 | TypeAlternativeBegin, |
79 | TypeAlternativeDisjunction, |
80 | TypeAlternativeEnd, |
81 | TypeSubpatternBegin, |
82 | TypeSubpatternEnd, |
83 | TypeAssertionBOL, |
84 | TypeAssertionEOL, |
85 | TypeAssertionWordBoundary, |
86 | TypePatternCharacterOnce, |
87 | TypePatternCharacterFixed, |
88 | TypePatternCharacterGreedy, |
89 | TypePatternCharacterNonGreedy, |
90 | TypePatternCasedCharacterOnce, |
91 | TypePatternCasedCharacterFixed, |
92 | TypePatternCasedCharacterGreedy, |
93 | TypePatternCasedCharacterNonGreedy, |
94 | TypeCharacterClass, |
95 | TypeBackReference, |
96 | TypeParenthesesSubpattern, |
97 | TypeParenthesesSubpatternOnceBegin, |
98 | TypeParenthesesSubpatternOnceEnd, |
99 | TypeParenthesesSubpatternTerminalBegin, |
100 | TypeParenthesesSubpatternTerminalEnd, |
101 | TypeParentheticalAssertionBegin, |
102 | TypeParentheticalAssertionEnd, |
103 | TypeCheckInput, |
104 | TypeUncheckInput, |
105 | TypeDotStarEnclosure, |
106 | } type; |
107 | bool m_capture : 1; |
108 | bool m_invert : 1; |
109 | unsigned inputPosition; |
110 | |
111 | ByteTerm(UChar32 ch, unsigned inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) |
112 | : frameLocation(frameLocation) |
113 | , m_capture(false) |
114 | , m_invert(false) |
115 | { |
116 | atom.patternCharacter = ch; |
117 | atom.quantityType = quantityType; |
118 | atom.quantityMinCount = quantityCount.unsafeGet(); |
119 | atom.quantityMaxCount = quantityCount.unsafeGet(); |
120 | inputPosition = inputPos; |
121 | |
122 | switch (quantityType) { |
123 | case QuantifierFixedCount: |
124 | type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed; |
125 | break; |
126 | case QuantifierGreedy: |
127 | type = ByteTerm::TypePatternCharacterGreedy; |
128 | break; |
129 | case QuantifierNonGreedy: |
130 | type = ByteTerm::TypePatternCharacterNonGreedy; |
131 | break; |
132 | } |
133 | } |
134 | |
135 | ByteTerm(UChar32 lo, UChar32 hi, unsigned inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) |
136 | : frameLocation(frameLocation) |
137 | , m_capture(false) |
138 | , m_invert(false) |
139 | { |
140 | switch (quantityType) { |
141 | case QuantifierFixedCount: |
142 | type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed; |
143 | break; |
144 | case QuantifierGreedy: |
145 | type = ByteTerm::TypePatternCasedCharacterGreedy; |
146 | break; |
147 | case QuantifierNonGreedy: |
148 | type = ByteTerm::TypePatternCasedCharacterNonGreedy; |
149 | break; |
150 | } |
151 | |
152 | atom.casedCharacter.lo = lo; |
153 | atom.casedCharacter.hi = hi; |
154 | atom.quantityType = quantityType; |
155 | atom.quantityMinCount = quantityCount.unsafeGet(); |
156 | atom.quantityMaxCount = quantityCount.unsafeGet(); |
157 | inputPosition = inputPos; |
158 | } |
159 | |
160 | ByteTerm(CharacterClass* characterClass, bool invert, unsigned inputPos) |
161 | : type(ByteTerm::TypeCharacterClass) |
162 | , m_capture(false) |
163 | , m_invert(invert) |
164 | { |
165 | atom.characterClass = characterClass; |
166 | atom.quantityType = QuantifierFixedCount; |
167 | atom.quantityMinCount = 1; |
168 | atom.quantityMaxCount = 1; |
169 | inputPosition = inputPos; |
170 | } |
171 | |
172 | ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, unsigned inputPos) |
173 | : type(type) |
174 | , m_capture(capture) |
175 | , m_invert(false) |
176 | { |
177 | atom.subpatternId = subpatternId; |
178 | atom.parenthesesDisjunction = parenthesesInfo; |
179 | atom.quantityType = QuantifierFixedCount; |
180 | atom.quantityMinCount = 1; |
181 | atom.quantityMaxCount = 1; |
182 | inputPosition = inputPos; |
183 | } |
184 | |
185 | ByteTerm(Type type, bool invert = false) |
186 | : type(type) |
187 | , m_capture(false) |
188 | , m_invert(invert) |
189 | { |
190 | atom.quantityType = QuantifierFixedCount; |
191 | atom.quantityMinCount = 1; |
192 | atom.quantityMaxCount = 1; |
193 | } |
194 | |
195 | ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, unsigned inputPos) |
196 | : type(type) |
197 | , m_capture(capture) |
198 | , m_invert(invert) |
199 | { |
200 | atom.subpatternId = subpatternId; |
201 | atom.quantityType = QuantifierFixedCount; |
202 | atom.quantityMinCount = 1; |
203 | atom.quantityMaxCount = 1; |
204 | inputPosition = inputPos; |
205 | } |
206 | |
207 | static ByteTerm BOL(unsigned inputPos) |
208 | { |
209 | ByteTerm term(TypeAssertionBOL); |
210 | term.inputPosition = inputPos; |
211 | return term; |
212 | } |
213 | |
214 | static ByteTerm CheckInput(Checked<unsigned> count) |
215 | { |
216 | ByteTerm term(TypeCheckInput); |
217 | term.checkInputCount = count.unsafeGet(); |
218 | return term; |
219 | } |
220 | |
221 | static ByteTerm UncheckInput(Checked<unsigned> count) |
222 | { |
223 | ByteTerm term(TypeUncheckInput); |
224 | term.checkInputCount = count.unsafeGet(); |
225 | return term; |
226 | } |
227 | |
228 | static ByteTerm EOL(unsigned inputPos) |
229 | { |
230 | ByteTerm term(TypeAssertionEOL); |
231 | term.inputPosition = inputPos; |
232 | return term; |
233 | } |
234 | |
235 | static ByteTerm WordBoundary(bool invert, unsigned inputPos) |
236 | { |
237 | ByteTerm term(TypeAssertionWordBoundary, invert); |
238 | term.inputPosition = inputPos; |
239 | return term; |
240 | } |
241 | |
242 | static ByteTerm BackReference(unsigned subpatternId, unsigned inputPos) |
243 | { |
244 | return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos); |
245 | } |
246 | |
247 | static ByteTerm BodyAlternativeBegin(bool onceThrough) |
248 | { |
249 | ByteTerm term(TypeBodyAlternativeBegin); |
250 | term.alternative.next = 0; |
251 | term.alternative.end = 0; |
252 | term.alternative.onceThrough = onceThrough; |
253 | return term; |
254 | } |
255 | |
256 | static ByteTerm BodyAlternativeDisjunction(bool onceThrough) |
257 | { |
258 | ByteTerm term(TypeBodyAlternativeDisjunction); |
259 | term.alternative.next = 0; |
260 | term.alternative.end = 0; |
261 | term.alternative.onceThrough = onceThrough; |
262 | return term; |
263 | } |
264 | |
265 | static ByteTerm BodyAlternativeEnd() |
266 | { |
267 | ByteTerm term(TypeBodyAlternativeEnd); |
268 | term.alternative.next = 0; |
269 | term.alternative.end = 0; |
270 | term.alternative.onceThrough = false; |
271 | return term; |
272 | } |
273 | |
274 | static ByteTerm AlternativeBegin() |
275 | { |
276 | ByteTerm term(TypeAlternativeBegin); |
277 | term.alternative.next = 0; |
278 | term.alternative.end = 0; |
279 | term.alternative.onceThrough = false; |
280 | return term; |
281 | } |
282 | |
283 | static ByteTerm AlternativeDisjunction() |
284 | { |
285 | ByteTerm term(TypeAlternativeDisjunction); |
286 | term.alternative.next = 0; |
287 | term.alternative.end = 0; |
288 | term.alternative.onceThrough = false; |
289 | return term; |
290 | } |
291 | |
292 | static ByteTerm AlternativeEnd() |
293 | { |
294 | ByteTerm term(TypeAlternativeEnd); |
295 | term.alternative.next = 0; |
296 | term.alternative.end = 0; |
297 | term.alternative.onceThrough = false; |
298 | return term; |
299 | } |
300 | |
301 | static ByteTerm SubpatternBegin() |
302 | { |
303 | return ByteTerm(TypeSubpatternBegin); |
304 | } |
305 | |
306 | static ByteTerm SubpatternEnd() |
307 | { |
308 | return ByteTerm(TypeSubpatternEnd); |
309 | } |
310 | |
311 | static ByteTerm DotStarEnclosure(bool bolAnchor, bool eolAnchor) |
312 | { |
313 | ByteTerm term(TypeDotStarEnclosure); |
314 | term.anchors.m_bol = bolAnchor; |
315 | term.anchors.m_eol = eolAnchor; |
316 | return term; |
317 | } |
318 | |
319 | bool invert() |
320 | { |
321 | return m_invert; |
322 | } |
323 | |
324 | bool capture() |
325 | { |
326 | return m_capture; |
327 | } |
328 | }; |
329 | |
330 | class ByteDisjunction { |
331 | WTF_MAKE_FAST_ALLOCATED; |
332 | public: |
333 | ByteDisjunction(unsigned numSubpatterns, unsigned frameSize) |
334 | : m_numSubpatterns(numSubpatterns) |
335 | , m_frameSize(frameSize) |
336 | { |
337 | } |
338 | |
339 | size_t estimatedSizeInBytes() const { return terms.capacity() * sizeof(ByteTerm); } |
340 | |
341 | Vector<ByteTerm> terms; |
342 | unsigned m_numSubpatterns; |
343 | unsigned m_frameSize; |
344 | }; |
345 | |
346 | struct BytecodePattern { |
347 | WTF_MAKE_FAST_ALLOCATED; |
348 | public: |
349 | BytecodePattern(std::unique_ptr<ByteDisjunction> body, Vector<std::unique_ptr<ByteDisjunction>>& parenthesesInfoToAdopt, YarrPattern& pattern, BumpPointerAllocator* allocator, ConcurrentJSLock* lock) |
350 | : m_body(WTFMove(body)) |
351 | , m_flags(pattern.m_flags) |
352 | , m_allocator(allocator) |
353 | , m_lock(lock) |
354 | { |
355 | m_body->terms.shrinkToFit(); |
356 | |
357 | newlineCharacterClass = pattern.newlineCharacterClass(); |
358 | if (unicode() && ignoreCase()) |
359 | wordcharCharacterClass = pattern.wordUnicodeIgnoreCaseCharCharacterClass(); |
360 | else |
361 | wordcharCharacterClass = pattern.wordcharCharacterClass(); |
362 | |
363 | m_allParenthesesInfo.swap(parenthesesInfoToAdopt); |
364 | m_allParenthesesInfo.shrinkToFit(); |
365 | |
366 | m_userCharacterClasses.swap(pattern.m_userCharacterClasses); |
367 | m_userCharacterClasses.shrinkToFit(); |
368 | } |
369 | |
370 | size_t estimatedSizeInBytes() const { return m_body->estimatedSizeInBytes(); } |
371 | |
372 | bool ignoreCase() const { return m_flags.contains(Flags::IgnoreCase); } |
373 | bool multiline() const { return m_flags.contains(Flags::Multiline); } |
374 | bool sticky() const { return m_flags.contains(Flags::Sticky); } |
375 | bool unicode() const { return m_flags.contains(Flags::Unicode); } |
376 | bool dotAll() const { return m_flags.contains(Flags::DotAll); } |
377 | |
378 | std::unique_ptr<ByteDisjunction> m_body; |
379 | OptionSet<Flags> m_flags; |
380 | // Each BytecodePattern is associated with a RegExp, each RegExp is associated |
381 | // with a VM. Cache a pointer to our VM's m_regExpAllocator. |
382 | BumpPointerAllocator* m_allocator; |
383 | ConcurrentJSLock* m_lock; |
384 | |
385 | CharacterClass* newlineCharacterClass; |
386 | CharacterClass* wordcharCharacterClass; |
387 | |
388 | private: |
389 | Vector<std::unique_ptr<ByteDisjunction>> m_allParenthesesInfo; |
390 | Vector<std::unique_ptr<CharacterClass>> m_userCharacterClasses; |
391 | }; |
392 | |
393 | JS_EXPORT_PRIVATE std::unique_ptr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*, ErrorCode&, ConcurrentJSLock* = nullptr); |
394 | JS_EXPORT_PRIVATE unsigned interpret(BytecodePattern*, const String& input, unsigned start, unsigned* output); |
395 | unsigned interpret(BytecodePattern*, const LChar* input, unsigned length, unsigned start, unsigned* output); |
396 | unsigned interpret(BytecodePattern*, const UChar* input, unsigned length, unsigned start, unsigned* output); |
397 | |
398 | } } // namespace JSC::Yarr |
399 | |