1 | /* |
2 | * Copyright (C) 1999-2001, 2004 Harri Porten ([email protected]) |
3 | * Copyright (c) 2007, 2008, 2016-2017 Apple Inc. All rights reserved. |
4 | * Copyright (C) 2009 Torch Mobile, Inc. |
5 | * Copyright (C) 2010 Peter Varga ([email protected]), University of Szeged |
6 | * |
7 | * This library is free software; you can redistribute it and/or |
8 | * modify it under the terms of the GNU Lesser General Public |
9 | * License as published by the Free Software Foundation; either |
10 | * version 2 of the License, or (at your option) any later version. |
11 | * |
12 | * This library is distributed in the hope that it will be useful, |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15 | * Lesser General Public License for more details. |
16 | * |
17 | * You should have received a copy of the GNU Lesser General Public |
18 | * License along with this library; if not, write to the Free Software |
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
20 | * |
21 | */ |
22 | |
23 | #include "config.h" |
24 | #include "RegExp.h" |
25 | |
26 | #include "ExceptionHelpers.h" |
27 | #include "Lexer.h" |
28 | #include "JSCInlines.h" |
29 | #include "RegExpCache.h" |
30 | #include "RegExpInlines.h" |
31 | #include "YarrJIT.h" |
32 | #include <wtf/Assertions.h> |
33 | |
34 | namespace JSC { |
35 | |
36 | const ClassInfo RegExp::s_info = { "RegExp" , nullptr, nullptr, nullptr, CREATE_METHOD_TABLE(RegExp) }; |
37 | |
38 | #if REGEXP_FUNC_TEST_DATA_GEN |
39 | const char* const RegExpFunctionalTestCollector::s_fileName = "/tmp/RegExpTestsData" ; |
40 | RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::s_instance = 0; |
41 | |
42 | RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::get() |
43 | { |
44 | if (!s_instance) |
45 | s_instance = new RegExpFunctionalTestCollector(); |
46 | |
47 | return s_instance; |
48 | } |
49 | |
50 | void RegExpFunctionalTestCollector::outputOneTest(RegExp* regExp, const String& s, int startOffset, int* ovector, int result) |
51 | { |
52 | if ((!m_lastRegExp) || (m_lastRegExp != regExp)) { |
53 | m_lastRegExp = regExp; |
54 | fputc('/', m_file); |
55 | outputEscapedString(regExp->pattern(), true); |
56 | fputc('/', m_file); |
57 | if (regExp->global()) |
58 | fputc('g', m_file); |
59 | if (regExp->ignoreCase()) |
60 | fputc('i', m_file); |
61 | if (regExp->multiline()) |
62 | fputc('m', m_file); |
63 | if (regExp->dotAll()) |
64 | fputc('s', m_file); |
65 | if (regExp->unicode()) |
66 | fputc('u', m_file); |
67 | if (regExp->sticky()) |
68 | fputc('y', m_file); |
69 | fprintf(m_file, "\n" ); |
70 | } |
71 | |
72 | fprintf(m_file, " \"" ); |
73 | outputEscapedString(s); |
74 | fprintf(m_file, "\", %d, %d, (" , startOffset, result); |
75 | for (unsigned i = 0; i <= regExp->numSubpatterns(); i++) { |
76 | int subpatternBegin = ovector[i * 2]; |
77 | int subpatternEnd = ovector[i * 2 + 1]; |
78 | if (subpatternBegin == -1) |
79 | subpatternEnd = -1; |
80 | fprintf(m_file, "%d, %d" , subpatternBegin, subpatternEnd); |
81 | if (i < regExp->numSubpatterns()) |
82 | fputs(", " , m_file); |
83 | } |
84 | |
85 | fprintf(m_file, ")\n" ); |
86 | fflush(m_file); |
87 | } |
88 | |
89 | RegExpFunctionalTestCollector::RegExpFunctionalTestCollector() |
90 | { |
91 | m_file = fopen(s_fileName, "r+" ); |
92 | if (!m_file) |
93 | m_file = fopen(s_fileName, "w+" ); |
94 | |
95 | fseek(m_file, 0L, SEEK_END); |
96 | } |
97 | |
98 | RegExpFunctionalTestCollector::~RegExpFunctionalTestCollector() |
99 | { |
100 | fclose(m_file); |
101 | s_instance = 0; |
102 | } |
103 | |
104 | void RegExpFunctionalTestCollector::outputEscapedString(const String& s, bool escapeSlash) |
105 | { |
106 | int len = s.length(); |
107 | |
108 | for (int i = 0; i < len; ++i) { |
109 | UChar c = s[i]; |
110 | |
111 | switch (c) { |
112 | case '\0': |
113 | fputs("\\0" , m_file); |
114 | break; |
115 | case '\a': |
116 | fputs("\\a" , m_file); |
117 | break; |
118 | case '\b': |
119 | fputs("\\b" , m_file); |
120 | break; |
121 | case '\f': |
122 | fputs("\\f" , m_file); |
123 | break; |
124 | case '\n': |
125 | fputs("\\n" , m_file); |
126 | break; |
127 | case '\r': |
128 | fputs("\\r" , m_file); |
129 | break; |
130 | case '\t': |
131 | fputs("\\t" , m_file); |
132 | break; |
133 | case '\v': |
134 | fputs("\\v" , m_file); |
135 | break; |
136 | case '/': |
137 | if (escapeSlash) |
138 | fputs("\\/" , m_file); |
139 | else |
140 | fputs("/" , m_file); |
141 | break; |
142 | case '\"': |
143 | fputs("\\\"" , m_file); |
144 | break; |
145 | case '\\': |
146 | fputs("\\\\" , m_file); |
147 | break; |
148 | case '\?': |
149 | fputs("\?" , m_file); |
150 | break; |
151 | default: |
152 | if (c > 0x7f) |
153 | fprintf(m_file, "\\u%04x" , c); |
154 | else |
155 | fputc(c, m_file); |
156 | break; |
157 | } |
158 | } |
159 | } |
160 | #endif |
161 | |
162 | RegExp::RegExp(VM& vm, const String& patternString, OptionSet<Yarr::Flags> flags) |
163 | : JSCell(vm, vm.regExpStructure.get()) |
164 | , m_patternString(patternString) |
165 | , m_flags(flags) |
166 | { |
167 | ASSERT(m_flags != Yarr::Flags::DeletedValue); |
168 | } |
169 | |
170 | void RegExp::finishCreation(VM& vm) |
171 | { |
172 | Base::finishCreation(vm); |
173 | Yarr::YarrPattern pattern(m_patternString, m_flags, m_constructionErrorCode, vm.stackLimit()); |
174 | if (!isValid()) { |
175 | m_state = ParseError; |
176 | return; |
177 | } |
178 | |
179 | m_numSubpatterns = pattern.m_numSubpatterns; |
180 | if (!pattern.m_captureGroupNames.isEmpty() || !pattern.m_namedGroupToParenIndex.isEmpty()) { |
181 | m_rareData = std::make_unique<RareData>(); |
182 | m_rareData->m_captureGroupNames.swap(pattern.m_captureGroupNames); |
183 | m_rareData->m_namedGroupToParenIndex.swap(pattern.m_namedGroupToParenIndex); |
184 | } |
185 | } |
186 | |
187 | void RegExp::destroy(JSCell* cell) |
188 | { |
189 | RegExp* thisObject = static_cast<RegExp*>(cell); |
190 | #if REGEXP_FUNC_TEST_DATA_GEN |
191 | RegExpFunctionalTestCollector::get()->clearRegExp(this); |
192 | #endif |
193 | thisObject->RegExp::~RegExp(); |
194 | } |
195 | |
196 | size_t RegExp::estimatedSize(JSCell* cell, VM& vm) |
197 | { |
198 | RegExp* thisObject = static_cast<RegExp*>(cell); |
199 | size_t regexDataSize = thisObject->m_regExpBytecode ? thisObject->m_regExpBytecode->estimatedSizeInBytes() : 0; |
200 | #if ENABLE(YARR_JIT) |
201 | if (auto* jitCode = thisObject->m_regExpJITCode.get()) |
202 | regexDataSize += jitCode->size(); |
203 | #endif |
204 | return Base::estimatedSize(cell, vm) + regexDataSize; |
205 | } |
206 | |
207 | RegExp* RegExp::createWithoutCaching(VM& vm, const String& patternString, OptionSet<Yarr::Flags> flags) |
208 | { |
209 | RegExp* regExp = new (NotNull, allocateCell<RegExp>(vm.heap)) RegExp(vm, patternString, flags); |
210 | regExp->finishCreation(vm); |
211 | return regExp; |
212 | } |
213 | |
214 | RegExp* RegExp::create(VM& vm, const String& patternString, OptionSet<Yarr::Flags> flags) |
215 | { |
216 | return vm.regExpCache()->lookupOrCreate(patternString, flags); |
217 | } |
218 | |
219 | |
220 | static std::unique_ptr<Yarr::BytecodePattern> byteCodeCompilePattern(VM* vm, Yarr::YarrPattern& pattern, Yarr::ErrorCode& errorCode) |
221 | { |
222 | return Yarr::byteCompile(pattern, &vm->m_regExpAllocator, errorCode, &vm->m_regExpAllocatorLock); |
223 | } |
224 | |
225 | void RegExp::byteCodeCompileIfNecessary(VM* vm) |
226 | { |
227 | if (m_regExpBytecode) |
228 | return; |
229 | |
230 | Yarr::YarrPattern pattern(m_patternString, m_flags, m_constructionErrorCode, vm->stackLimit()); |
231 | if (hasError(m_constructionErrorCode)) { |
232 | RELEASE_ASSERT_NOT_REACHED(); |
233 | #if COMPILER_QUIRK(CONSIDERS_UNREACHABLE_CODE) |
234 | m_state = ParseError; |
235 | return; |
236 | #endif |
237 | } |
238 | ASSERT(m_numSubpatterns == pattern.m_numSubpatterns); |
239 | |
240 | m_regExpBytecode = byteCodeCompilePattern(vm, pattern, m_constructionErrorCode); |
241 | if (!m_regExpBytecode) { |
242 | m_state = ParseError; |
243 | return; |
244 | } |
245 | } |
246 | |
247 | void RegExp::compile(VM* vm, Yarr::YarrCharSize charSize) |
248 | { |
249 | auto locker = holdLock(cellLock()); |
250 | |
251 | Yarr::YarrPattern pattern(m_patternString, m_flags, m_constructionErrorCode, vm->stackLimit()); |
252 | if (hasError(m_constructionErrorCode)) { |
253 | m_state = ParseError; |
254 | return; |
255 | } |
256 | ASSERT(m_numSubpatterns == pattern.m_numSubpatterns); |
257 | |
258 | if (!hasCode()) { |
259 | ASSERT(m_state == NotCompiled); |
260 | vm->regExpCache()->addToStrongCache(this); |
261 | m_state = ByteCode; |
262 | } |
263 | |
264 | #if ENABLE(YARR_JIT) |
265 | if (!pattern.containsUnsignedLengthPattern() && VM::canUseJIT() && Options::useRegExpJIT() |
266 | #if !ENABLE(YARR_JIT_BACKREFERENCES) |
267 | && !pattern.m_containsBackreferences |
268 | #endif |
269 | ) { |
270 | auto& jitCode = ensureRegExpJITCode(); |
271 | Yarr::jitCompile(pattern, m_patternString, charSize, vm, jitCode); |
272 | if (!jitCode.failureReason()) { |
273 | m_state = JITCode; |
274 | return; |
275 | } |
276 | } |
277 | #else |
278 | UNUSED_PARAM(charSize); |
279 | #endif |
280 | |
281 | if (Options::dumpCompiledRegExpPatterns()) |
282 | dataLog("Can't JIT this regular expression: \"" , m_patternString, "\"\n" ); |
283 | |
284 | m_state = ByteCode; |
285 | m_regExpBytecode = byteCodeCompilePattern(vm, pattern, m_constructionErrorCode); |
286 | if (!m_regExpBytecode) { |
287 | m_state = ParseError; |
288 | return; |
289 | } |
290 | } |
291 | |
292 | int RegExp::match(VM& vm, const String& s, unsigned startOffset, Vector<int>& ovector) |
293 | { |
294 | return matchInline(vm, s, startOffset, ovector); |
295 | } |
296 | |
297 | bool RegExp::matchConcurrently( |
298 | VM& vm, const String& s, unsigned startOffset, int& position, Vector<int>& ovector) |
299 | { |
300 | auto locker = holdLock(cellLock()); |
301 | |
302 | if (!hasCodeFor(s.is8Bit() ? Yarr::Char8 : Yarr::Char16)) |
303 | return false; |
304 | |
305 | position = match(vm, s, startOffset, ovector); |
306 | return true; |
307 | } |
308 | |
309 | void RegExp::compileMatchOnly(VM* vm, Yarr::YarrCharSize charSize) |
310 | { |
311 | auto locker = holdLock(cellLock()); |
312 | |
313 | Yarr::YarrPattern pattern(m_patternString, m_flags, m_constructionErrorCode, vm->stackLimit()); |
314 | if (hasError(m_constructionErrorCode)) { |
315 | m_state = ParseError; |
316 | return; |
317 | } |
318 | ASSERT(m_numSubpatterns == pattern.m_numSubpatterns); |
319 | |
320 | if (!hasCode()) { |
321 | ASSERT(m_state == NotCompiled); |
322 | vm->regExpCache()->addToStrongCache(this); |
323 | m_state = ByteCode; |
324 | } |
325 | |
326 | #if ENABLE(YARR_JIT) |
327 | if (!pattern.containsUnsignedLengthPattern() && VM::canUseJIT() && Options::useRegExpJIT() |
328 | #if !ENABLE(YARR_JIT_BACKREFERENCES) |
329 | && !pattern.m_containsBackreferences |
330 | #endif |
331 | ) { |
332 | auto& jitCode = ensureRegExpJITCode(); |
333 | Yarr::jitCompile(pattern, m_patternString, charSize, vm, jitCode, Yarr::MatchOnly); |
334 | if (!jitCode.failureReason()) { |
335 | m_state = JITCode; |
336 | return; |
337 | } |
338 | } |
339 | #else |
340 | UNUSED_PARAM(charSize); |
341 | #endif |
342 | |
343 | if (Options::dumpCompiledRegExpPatterns()) |
344 | dataLog("Can't JIT this regular expression: \"" , m_patternString, "\"\n" ); |
345 | |
346 | m_state = ByteCode; |
347 | m_regExpBytecode = byteCodeCompilePattern(vm, pattern, m_constructionErrorCode); |
348 | if (!m_regExpBytecode) { |
349 | m_state = ParseError; |
350 | return; |
351 | } |
352 | } |
353 | |
354 | MatchResult RegExp::match(VM& vm, const String& s, unsigned startOffset) |
355 | { |
356 | return matchInline(vm, s, startOffset); |
357 | } |
358 | |
359 | bool RegExp::matchConcurrently(VM& vm, const String& s, unsigned startOffset, MatchResult& result) |
360 | { |
361 | auto locker = holdLock(cellLock()); |
362 | |
363 | if (!hasMatchOnlyCodeFor(s.is8Bit() ? Yarr::Char8 : Yarr::Char16)) |
364 | return false; |
365 | |
366 | result = match(vm, s, startOffset); |
367 | return true; |
368 | } |
369 | |
370 | void RegExp::deleteCode() |
371 | { |
372 | auto locker = holdLock(cellLock()); |
373 | |
374 | if (!hasCode()) |
375 | return; |
376 | m_state = NotCompiled; |
377 | #if ENABLE(YARR_JIT) |
378 | if (m_regExpJITCode) |
379 | m_regExpJITCode->clear(); |
380 | #endif |
381 | m_regExpBytecode = nullptr; |
382 | } |
383 | |
384 | #if ENABLE(YARR_JIT_DEBUG) |
385 | void RegExp::matchCompareWithInterpreter(const String& s, int startOffset, int* offsetVector, int jitResult) |
386 | { |
387 | int offsetVectorSize = (m_numSubpatterns + 1) * 2; |
388 | Vector<int> interpreterOvector; |
389 | interpreterOvector.resize(offsetVectorSize); |
390 | int* interpreterOffsetVector = interpreterOvector.data(); |
391 | int interpreterResult = 0; |
392 | int differences = 0; |
393 | |
394 | // Initialize interpreterOffsetVector with the return value (index 0) and the |
395 | // first subpattern start indicies (even index values) set to -1. |
396 | // No need to init the subpattern end indicies. |
397 | for (unsigned j = 0, i = 0; i < m_numSubpatterns + 1; j += 2, i++) |
398 | interpreterOffsetVector[j] = -1; |
399 | |
400 | interpreterResult = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, reinterpret_cast<unsigned*>(interpreterOffsetVector)); |
401 | |
402 | if (jitResult != interpreterResult) |
403 | differences++; |
404 | |
405 | for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++) |
406 | if ((offsetVector[j] != interpreterOffsetVector[j]) |
407 | || ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1]))) |
408 | differences++; |
409 | |
410 | if (differences) { |
411 | dataLogF("RegExp Discrepency for /%s/\n string input " , pattern().utf8().data()); |
412 | unsigned segmentLen = s.length() - static_cast<unsigned>(startOffset); |
413 | |
414 | dataLogF((segmentLen < 150) ? "\"%s\"\n" : "\"%148s...\"\n" , s.utf8().data() + startOffset); |
415 | |
416 | if (jitResult != interpreterResult) { |
417 | dataLogF(" JIT result = %d, interpreted result = %d\n" , jitResult, interpreterResult); |
418 | differences--; |
419 | } else { |
420 | dataLogF(" Correct result = %d\n" , jitResult); |
421 | } |
422 | |
423 | if (differences) { |
424 | for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++) { |
425 | if (offsetVector[j] != interpreterOffsetVector[j]) |
426 | dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n" , j, offsetVector[j], j, interpreterOffsetVector[j]); |
427 | if ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1])) |
428 | dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n" , j+1, offsetVector[j+1], j+1, interpreterOffsetVector[j+1]); |
429 | } |
430 | } |
431 | } |
432 | } |
433 | #endif |
434 | |
435 | #if ENABLE(REGEXP_TRACING) |
436 | void RegExp::printTraceData() |
437 | { |
438 | char formattedPattern[41]; |
439 | char rawPattern[41]; |
440 | |
441 | strncpy(rawPattern, pattern().utf8().data(), 40); |
442 | rawPattern[40]= '\0'; |
443 | |
444 | int pattLen = strlen(rawPattern); |
445 | |
446 | snprintf(formattedPattern, 41, (pattLen <= 38) ? "/%.38s/" : "/%.36s..." , rawPattern); |
447 | |
448 | #if ENABLE(YARR_JIT) |
449 | const size_t jitAddrSize = 20; |
450 | char jit8BitMatchOnlyAddr[jitAddrSize] { }; |
451 | char jit16BitMatchOnlyAddr[jitAddrSize] { }; |
452 | char jit8BitMatchAddr[jitAddrSize] { }; |
453 | char jit16BitMatchAddr[jitAddrSize] { }; |
454 | switch (m_state) { |
455 | case ParseError: |
456 | case NotCompiled: |
457 | break; |
458 | case ByteCode: |
459 | snprintf(jit8BitMatchOnlyAddr, jitAddrSize, "fallback " ); |
460 | snprintf(jit16BitMatchOnlyAddr, jitAddrSize, "---- " ); |
461 | snprintf(jit8BitMatchAddr, jitAddrSize, "fallback " ); |
462 | snprintf(jit16BitMatchAddr, jitAddrSize, "---- " ); |
463 | break; |
464 | case JITCode: { |
465 | Yarr::YarrCodeBlock& codeBlock = *m_regExpJITCode.get(); |
466 | snprintf(jit8BitMatchOnlyAddr, jitAddrSize, "0x%014lx" , reinterpret_cast<uintptr_t>(codeBlock.get8BitMatchOnlyAddr())); |
467 | snprintf(jit16BitMatchOnlyAddr, jitAddrSize, "0x%014lx" , reinterpret_cast<uintptr_t>(codeBlock.get16BitMatchOnlyAddr())); |
468 | snprintf(jit8BitMatchAddr, jitAddrSize, "0x%014lx" , reinterpret_cast<uintptr_t>(codeBlock.get8BitMatchAddr())); |
469 | snprintf(jit16BitMatchAddr, jitAddrSize, "0x%014lx" , reinterpret_cast<uintptr_t>(codeBlock.get16BitMatchAddr())); |
470 | break; |
471 | } |
472 | } |
473 | #else |
474 | const char* jit8BitMatchOnlyAddr = "JIT Off" ; |
475 | const char* jit16BitMatchOnlyAddr = "" ; |
476 | const char* jit8BitMatchAddr = "JIT Off" ; |
477 | const char* jit16BitMatchAddr = "" ; |
478 | #endif |
479 | unsigned averageMatchOnlyStringLen = (unsigned)(m_rtMatchOnlyTotalSubjectStringLen / m_rtMatchOnlyCallCount); |
480 | unsigned averageMatchStringLen = (unsigned)(m_rtMatchTotalSubjectStringLen / m_rtMatchCallCount); |
481 | |
482 | printf("%-40.40s %16.16s %16.16s %10d %10d %10u\n" , formattedPattern, jit8BitMatchOnlyAddr, jit16BitMatchOnlyAddr, m_rtMatchOnlyCallCount, m_rtMatchOnlyFoundCount, averageMatchOnlyStringLen); |
483 | printf(" %16.16s %16.16s %10d %10d %10u\n" , jit8BitMatchAddr, jit16BitMatchAddr, m_rtMatchCallCount, m_rtMatchFoundCount, averageMatchStringLen); |
484 | } |
485 | #endif |
486 | |
487 | static CString regexpToSourceString(const RegExp* regExp) |
488 | { |
489 | char postfix[7] = { '/', 0, 0, 0, 0, 0, 0 }; |
490 | int index = 1; |
491 | if (regExp->global()) |
492 | postfix[index++] = 'g'; |
493 | if (regExp->ignoreCase()) |
494 | postfix[index++] = 'i'; |
495 | if (regExp->multiline()) |
496 | postfix[index] = 'm'; |
497 | if (regExp->dotAll()) |
498 | postfix[index++] = 's'; |
499 | if (regExp->unicode()) |
500 | postfix[index++] = 'u'; |
501 | if (regExp->sticky()) |
502 | postfix[index++] = 'y'; |
503 | |
504 | return toCString("/" , regExp->pattern().impl(), postfix); |
505 | } |
506 | |
507 | void RegExp::dumpToStream(const JSCell* cell, PrintStream& out) |
508 | { |
509 | out.print(regexpToSourceString(jsCast<const RegExp*>(cell))); |
510 | } |
511 | |
512 | } // namespace JSC |
513 | |