1 | /* |
2 | * Copyright (C) 2004, 2008, 2009 Apple Inc. All rights reserved. |
3 | * Copyright (C) 2008 Collabora Ltd. |
4 | * Copyright (C) 2011 Peter Varga ([email protected]), University of Szeged |
5 | * |
6 | * Redistribution and use in source and binary forms, with or without |
7 | * modification, are permitted provided that the following conditions |
8 | * are met: |
9 | * 1. Redistributions of source code must retain the above copyright |
10 | * notice, this list of conditions and the following disclaimer. |
11 | * 2. Redistributions in binary form must reproduce the above copyright |
12 | * notice, this list of conditions and the following disclaimer in the |
13 | * documentation and/or other materials provided with the distribution. |
14 | * |
15 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
26 | */ |
27 | |
28 | #include "config.h" |
29 | #include "RegularExpression.h" |
30 | |
31 | #include "Yarr.h" |
32 | #include "YarrFlags.h" |
33 | #include "YarrInterpreter.h" |
34 | #include <wtf/Assertions.h> |
35 | #include <wtf/BumpPointerAllocator.h> |
36 | |
37 | namespace JSC { namespace Yarr { |
38 | |
39 | class RegularExpression::Private : public RefCounted<RegularExpression::Private> { |
40 | public: |
41 | static Ref<Private> create(const String& pattern, TextCaseSensitivity caseSensitivity, MultilineMode multilineMode, UnicodeMode unicodeMode) |
42 | { |
43 | return adoptRef(*new Private(pattern, caseSensitivity, multilineMode, unicodeMode)); |
44 | } |
45 | |
46 | private: |
47 | Private(const String& pattern, TextCaseSensitivity caseSensitivity, MultilineMode multilineMode, UnicodeMode unicodeMode) |
48 | : m_regExpByteCode(compile(pattern, caseSensitivity, multilineMode, unicodeMode)) |
49 | { |
50 | } |
51 | |
52 | std::unique_ptr<JSC::Yarr::BytecodePattern> compile(const String& patternString, TextCaseSensitivity caseSensitivity, MultilineMode multilineMode, UnicodeMode unicodeMode) |
53 | { |
54 | OptionSet<JSC::Yarr::Flags> flags; |
55 | |
56 | if (caseSensitivity == TextCaseInsensitive) |
57 | flags.add(Flags::IgnoreCase); |
58 | |
59 | if (multilineMode == MultilineEnabled) |
60 | flags.add(Flags::Multiline); |
61 | |
62 | if (unicodeMode == UnicodeAwareMode) |
63 | flags.add(Flags::Unicode); |
64 | |
65 | JSC::Yarr::YarrPattern pattern(patternString, flags, m_constructionErrorCode); |
66 | if (JSC::Yarr::hasError(m_constructionErrorCode)) { |
67 | LOG_ERROR("RegularExpression: YARR compile failed with '%s'" , JSC::Yarr::errorMessage(m_constructionErrorCode)); |
68 | return nullptr; |
69 | } |
70 | |
71 | m_numSubpatterns = pattern.m_numSubpatterns; |
72 | |
73 | return JSC::Yarr::byteCompile(pattern, &m_regexAllocator, m_constructionErrorCode); |
74 | } |
75 | |
76 | JSC::Yarr::ErrorCode m_constructionErrorCode { Yarr::ErrorCode::NoError }; |
77 | BumpPointerAllocator m_regexAllocator; |
78 | |
79 | public: |
80 | int lastMatchLength { -1 }; |
81 | unsigned m_numSubpatterns; |
82 | std::unique_ptr<JSC::Yarr::BytecodePattern> m_regExpByteCode; |
83 | }; |
84 | |
85 | RegularExpression::RegularExpression(const String& pattern, TextCaseSensitivity caseSensitivity, MultilineMode multilineMode, UnicodeMode unicodeMode) |
86 | : d(Private::create(pattern, caseSensitivity, multilineMode, unicodeMode)) |
87 | { |
88 | } |
89 | |
90 | RegularExpression::RegularExpression(const RegularExpression& re) |
91 | : d(re.d) |
92 | { |
93 | } |
94 | |
95 | RegularExpression::~RegularExpression() |
96 | { |
97 | } |
98 | |
99 | RegularExpression& RegularExpression::operator=(const RegularExpression& re) |
100 | { |
101 | d = re.d; |
102 | return *this; |
103 | } |
104 | |
105 | int RegularExpression::match(const String& str, int startFrom, int* matchLength) const |
106 | { |
107 | if (!d->m_regExpByteCode) |
108 | return -1; |
109 | |
110 | if (str.isNull()) |
111 | return -1; |
112 | |
113 | int offsetVectorSize = (d->m_numSubpatterns + 1) * 2; |
114 | unsigned* offsetVector; |
115 | Vector<unsigned, 32> nonReturnedOvector; |
116 | |
117 | nonReturnedOvector.grow(offsetVectorSize); |
118 | offsetVector = nonReturnedOvector.data(); |
119 | |
120 | ASSERT(offsetVector); |
121 | for (unsigned j = 0, i = 0; i < d->m_numSubpatterns + 1; j += 2, i++) |
122 | offsetVector[j] = JSC::Yarr::offsetNoMatch; |
123 | |
124 | unsigned result; |
125 | if (str.length() <= INT_MAX) |
126 | result = JSC::Yarr::interpret(d->m_regExpByteCode.get(), str, startFrom, offsetVector); |
127 | else { |
128 | // This code can't handle unsigned offsets. Limit our processing to strings with offsets that |
129 | // can be represented as ints. |
130 | result = JSC::Yarr::offsetNoMatch; |
131 | } |
132 | |
133 | if (result == JSC::Yarr::offsetNoMatch) { |
134 | d->lastMatchLength = -1; |
135 | return -1; |
136 | } |
137 | |
138 | // 1 means 1 match; 0 means more than one match. First match is recorded in offsetVector. |
139 | d->lastMatchLength = offsetVector[1] - offsetVector[0]; |
140 | if (matchLength) |
141 | *matchLength = d->lastMatchLength; |
142 | return offsetVector[0]; |
143 | } |
144 | |
145 | int RegularExpression::searchRev(const String& str) const |
146 | { |
147 | // FIXME: This could be faster if it actually searched backwards. |
148 | // Instead, it just searches forwards, multiple times until it finds the last match. |
149 | |
150 | int start = 0; |
151 | int pos; |
152 | int lastPos = -1; |
153 | int lastMatchLength = -1; |
154 | do { |
155 | int matchLength; |
156 | pos = match(str, start, &matchLength); |
157 | if (pos >= 0) { |
158 | if (pos + matchLength > lastPos + lastMatchLength) { |
159 | // replace last match if this one is later and not a subset of the last match |
160 | lastPos = pos; |
161 | lastMatchLength = matchLength; |
162 | } |
163 | start = pos + 1; |
164 | } |
165 | } while (pos != -1); |
166 | d->lastMatchLength = lastMatchLength; |
167 | return lastPos; |
168 | } |
169 | |
170 | int RegularExpression::matchedLength() const |
171 | { |
172 | return d->lastMatchLength; |
173 | } |
174 | |
175 | void replace(String& string, const RegularExpression& target, const String& replacement) |
176 | { |
177 | int index = 0; |
178 | while (index < static_cast<int>(string.length())) { |
179 | int matchLength; |
180 | index = target.match(string, index, &matchLength); |
181 | if (index < 0) |
182 | break; |
183 | string.replace(index, matchLength, replacement); |
184 | index += replacement.length(); |
185 | if (!matchLength) |
186 | break; // Avoid infinite loop on 0-length matches, e.g. [a-z]* |
187 | } |
188 | } |
189 | |
190 | bool RegularExpression::isValid() const |
191 | { |
192 | return d->m_regExpByteCode.get(); |
193 | } |
194 | |
195 | } } // namespace JSC::Yarr |
196 | |