/*
 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
 * Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2011, 2012, 2013 Apple Inc. All rights reserved.
 * Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public License
 * along with this library; see the file COPYING.LIB. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 *
 */
22
23#pragma once
24
25#include "Lookup.h"
26#include "ParserArena.h"
27#include "ParserModes.h"
28#include "ParserTokens.h"
29#include "SourceCode.h"
30#include <wtf/ASCIICType.h>
31#include <wtf/Vector.h>
32
33namespace JSC {
34
// Bit flags combined into the `unsigned` flags argument taken by Lexer::lex(),
// Lexer::lexWithoutClearingLineTerminator() and Lexer::lexExpectIdentifier().
enum LexerFlags {
    // Lexer::lexExpectIdentifier() asserts that this flag is set.
    LexerFlagsIgnoreReservedWords = 1,
    LexerFlagsDontBuildStrings = 2,
    // When set, Lexer::lexExpectIdentifier() may skip creating the Identifier
    // object for the token it scans.
    LexerFlagsDontBuildKeywords = 4,
    // Historical misspelling ("Lexex") of LexerFlagsDontBuildKeywords, kept as
    // an alias so existing code keeps compiling. Prefer the name above.
    LexexFlagsDontBuildKeywords = LexerFlagsDontBuildKeywords
};
40
41enum class LexerEscapeParseMode { Template, String };
42
43struct ParsedUnicodeEscapeValue;
44
45bool isLexerKeyword(const Identifier&);
46
47template <typename T>
48class Lexer {
49 WTF_MAKE_NONCOPYABLE(Lexer);
50 WTF_MAKE_FAST_ALLOCATED;
51
52public:
53 Lexer(VM*, JSParserBuiltinMode, JSParserScriptMode);
54 ~Lexer();
55
56 // Character manipulation functions.
57 static bool isWhiteSpace(T character);
58 static bool isLineTerminator(T character);
59 static unsigned char convertHex(int c1, int c2);
60 static UChar convertUnicode(int c1, int c2, int c3, int c4);
61
62 // Functions to set up parsing.
63 void setCode(const SourceCode&, ParserArena*);
64 void setIsReparsingFunction() { m_isReparsingFunction = true; }
65 bool isReparsingFunction() const { return m_isReparsingFunction; }
66
67 JSTokenType lex(JSToken*, unsigned, bool strictMode);
68 JSTokenType lexWithoutClearingLineTerminator(JSToken*, unsigned, bool strictMode);
69 bool nextTokenIsColon();
70 int lineNumber() const { return m_lineNumber; }
71 ALWAYS_INLINE int currentOffset() const { return offsetFromSourcePtr(m_code); }
72 ALWAYS_INLINE int currentLineStartOffset() const { return offsetFromSourcePtr(m_lineStart); }
73 ALWAYS_INLINE JSTextPosition currentPosition() const
74 {
75 return JSTextPosition(m_lineNumber, currentOffset(), currentLineStartOffset());
76 }
77 JSTextPosition positionBeforeLastNewline() const { return m_positionBeforeLastNewline; }
78 JSTokenLocation lastTokenLocation() const { return m_lastTokenLocation; }
79 void setLastLineNumber(int lastLineNumber) { m_lastLineNumber = lastLineNumber; }
80 int lastLineNumber() const { return m_lastLineNumber; }
81 bool hasLineTerminatorBeforeToken() const { return m_hasLineTerminatorBeforeToken; }
82 JSTokenType scanRegExp(JSToken*, UChar patternPrefix = 0);
83 enum class RawStringsBuildMode { BuildRawStrings, DontBuildRawStrings };
84 JSTokenType scanTemplateString(JSToken*, RawStringsBuildMode);
85
86 // Functions for use after parsing.
87 bool sawError() const { return m_error; }
88 void setSawError(bool sawError) { m_error = sawError; }
89 String getErrorMessage() const { return m_lexErrorMessage; }
90 void setErrorMessage(const String& errorMessage) { m_lexErrorMessage = errorMessage; }
91 String sourceURLDirective() const { return m_sourceURLDirective; }
92 String sourceMappingURLDirective() const { return m_sourceMappingURLDirective; }
93 void clear();
94 void setOffset(int offset, int lineStartOffset)
95 {
96 m_error = 0;
97 m_lexErrorMessage = String();
98
99 m_code = sourcePtrFromOffset(offset);
100 m_lineStart = sourcePtrFromOffset(lineStartOffset);
101 ASSERT(currentOffset() >= currentLineStartOffset());
102
103 m_buffer8.shrink(0);
104 m_buffer16.shrink(0);
105 if (LIKELY(m_code < m_codeEnd))
106 m_current = *m_code;
107 else
108 m_current = 0;
109 }
110 void setLineNumber(int line)
111 {
112 m_lineNumber = line;
113 }
114 void setHasLineTerminatorBeforeToken(bool terminator)
115 {
116 m_hasLineTerminatorBeforeToken = terminator;
117 }
118
119 JSTokenType lexExpectIdentifier(JSToken*, unsigned, bool strictMode);
120
121 ALWAYS_INLINE StringView getToken(const JSToken& token)
122 {
123 SourceProvider* sourceProvider = m_source->provider();
124 ASSERT_WITH_MESSAGE(token.m_location.startOffset <= token.m_location.endOffset, "Calling this function with the baked token.");
125 return sourceProvider->getRange(token.m_location.startOffset, token.m_location.endOffset);
126 }
127
128private:
129 void record8(int);
130 void append8(const T*, size_t);
131 void record16(int);
132 void record16(T);
133 void recordUnicodeCodePoint(UChar32);
134 void append16(const LChar*, size_t);
135 void append16(const UChar* characters, size_t length) { m_buffer16.append(characters, length); }
136
137 ALWAYS_INLINE void shift();
138 ALWAYS_INLINE bool atEnd() const;
139 ALWAYS_INLINE T peek(int offset) const;
140
141 ParsedUnicodeEscapeValue parseUnicodeEscape();
142 void shiftLineTerminator();
143
144 ALWAYS_INLINE int offsetFromSourcePtr(const T* ptr) const { return ptr - m_codeStart; }
145 ALWAYS_INLINE const T* sourcePtrFromOffset(int offset) const { return m_codeStart + offset; }
146
147 String invalidCharacterMessage() const;
148 ALWAYS_INLINE const T* currentSourcePtr() const;
149 ALWAYS_INLINE void setOffsetFromSourcePtr(const T* sourcePtr, unsigned lineStartOffset) { setOffset(offsetFromSourcePtr(sourcePtr), lineStartOffset); }
150
151 ALWAYS_INLINE void setCodeStart(const StringView&);
152
153 ALWAYS_INLINE const Identifier* makeIdentifier(const LChar* characters, size_t length);
154 ALWAYS_INLINE const Identifier* makeIdentifier(const UChar* characters, size_t length);
155 ALWAYS_INLINE const Identifier* makeLCharIdentifier(const LChar* characters, size_t length);
156 ALWAYS_INLINE const Identifier* makeLCharIdentifier(const UChar* characters, size_t length);
157 ALWAYS_INLINE const Identifier* makeRightSizedIdentifier(const UChar* characters, size_t length, UChar orAllChars);
158 ALWAYS_INLINE const Identifier* makeIdentifierLCharFromUChar(const UChar* characters, size_t length);
159 ALWAYS_INLINE const Identifier* makeEmptyIdentifier();
160
161 ALWAYS_INLINE bool lastTokenWasRestrKeyword() const;
162
163 ALWAYS_INLINE void skipWhitespace();
164
165 template <int shiftAmount> void internalShift();
166 template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType parseKeyword(JSTokenData*);
167 template <bool shouldBuildIdentifiers> ALWAYS_INLINE JSTokenType parseIdentifier(JSTokenData*, unsigned lexerFlags, bool strictMode);
168 template <bool shouldBuildIdentifiers> NEVER_INLINE JSTokenType parseIdentifierSlowCase(JSTokenData*, unsigned lexerFlags, bool strictMode);
169 enum StringParseResult {
170 StringParsedSuccessfully,
171 StringUnterminated,
172 StringCannotBeParsed
173 };
174 template <bool shouldBuildStrings> ALWAYS_INLINE StringParseResult parseString(JSTokenData*, bool strictMode);
175 template <bool shouldBuildStrings> NEVER_INLINE StringParseResult parseStringSlowCase(JSTokenData*, bool strictMode);
176
177
178 template <bool shouldBuildStrings, LexerEscapeParseMode escapeParseMode> ALWAYS_INLINE StringParseResult parseComplexEscape(bool strictMode, T stringQuoteCharacter);
179 ALWAYS_INLINE StringParseResult parseTemplateLiteral(JSTokenData*, RawStringsBuildMode);
180
181 using NumberParseResult = Variant<double, const Identifier*>;
182 ALWAYS_INLINE Optional<NumberParseResult> parseHex();
183 ALWAYS_INLINE Optional<NumberParseResult> parseBinary();
184 ALWAYS_INLINE Optional<NumberParseResult> parseOctal();
185 ALWAYS_INLINE Optional<NumberParseResult> parseDecimal();
186 ALWAYS_INLINE bool parseNumberAfterDecimalPoint();
187 ALWAYS_INLINE bool parseNumberAfterExponentIndicator();
188 ALWAYS_INLINE bool parseMultilineComment();
189
190 ALWAYS_INLINE void parseCommentDirective();
191 ALWAYS_INLINE String parseCommentDirectiveValue();
192
193 template <unsigned length>
194 ALWAYS_INLINE bool consume(const char (&input)[length]);
195
196 void fillTokenInfo(JSToken*, JSTokenType, int lineNumber, int endOffset, int lineStartOffset, JSTextPosition endPosition);
197
198 static const size_t initialReadBufferCapacity = 32;
199
200 int m_lineNumber;
201 int m_lastLineNumber;
202
203 Vector<LChar> m_buffer8;
204 Vector<UChar> m_buffer16;
205 Vector<UChar> m_bufferForRawTemplateString16;
206 bool m_hasLineTerminatorBeforeToken;
207 int m_lastToken;
208
209 const SourceCode* m_source;
210 unsigned m_sourceOffset;
211 const T* m_code;
212 const T* m_codeStart;
213 const T* m_codeEnd;
214 const T* m_codeStartPlusOffset;
215 const T* m_lineStart;
216 JSTextPosition m_positionBeforeLastNewline;
217 JSTokenLocation m_lastTokenLocation;
218 bool m_isReparsingFunction;
219 bool m_atLineStart;
220 bool m_error;
221 String m_lexErrorMessage;
222
223 String m_sourceURLDirective;
224 String m_sourceMappingURLDirective;
225
226 T m_current;
227
228 IdentifierArena* m_arena;
229
230 VM* m_vm;
231 bool m_parsingBuiltinFunction;
232 JSParserScriptMode m_scriptMode;
233};
234
235template <>
236ALWAYS_INLINE bool Lexer<LChar>::isWhiteSpace(LChar ch)
237{
238 return ch == ' ' || ch == '\t' || ch == 0xB || ch == 0xC || ch == 0xA0;
239}
240
241template <>
242ALWAYS_INLINE bool Lexer<UChar>::isWhiteSpace(UChar ch)
243{
244 return (ch < 256) ? Lexer<LChar>::isWhiteSpace(static_cast<LChar>(ch)) : (u_charType(ch) == U_SPACE_SEPARATOR || ch == 0xFEFF);
245}
246
247template <>
248ALWAYS_INLINE bool Lexer<LChar>::isLineTerminator(LChar ch)
249{
250 return ch == '\r' || ch == '\n';
251}
252
253template <>
254ALWAYS_INLINE bool Lexer<UChar>::isLineTerminator(UChar ch)
255{
256 return ch == '\r' || ch == '\n' || (ch & ~1) == 0x2028;
257}
258
259template <typename T>
260inline unsigned char Lexer<T>::convertHex(int c1, int c2)
261{
262 return (toASCIIHexValue(c1) << 4) | toASCIIHexValue(c2);
263}
264
265template <typename T>
266inline UChar Lexer<T>::convertUnicode(int c1, int c2, int c3, int c4)
267{
268 return (convertHex(c1, c2) << 8) | convertHex(c3, c4);
269}
270
271template <typename T>
272ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const LChar* characters, size_t length)
273{
274 return &m_arena->makeIdentifier(m_vm, characters, length);
275}
276
277template <typename T>
278ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const UChar* characters, size_t length)
279{
280 return &m_arena->makeIdentifier(m_vm, characters, length);
281}
282
283template <>
284ALWAYS_INLINE const Identifier* Lexer<LChar>::makeRightSizedIdentifier(const UChar* characters, size_t length, UChar)
285{
286 return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
287}
288
289template <>
290ALWAYS_INLINE const Identifier* Lexer<UChar>::makeRightSizedIdentifier(const UChar* characters, size_t length, UChar orAllChars)
291{
292 if (!(orAllChars & ~0xff))
293 return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
294
295 return &m_arena->makeIdentifier(m_vm, characters, length);
296}
297
298template <typename T>
299ALWAYS_INLINE const Identifier* Lexer<T>::makeEmptyIdentifier()
300{
301 return &m_arena->makeEmptyIdentifier(m_vm);
302}
303
304template <>
305ALWAYS_INLINE void Lexer<LChar>::setCodeStart(const StringView& sourceString)
306{
307 ASSERT(sourceString.is8Bit());
308 m_codeStart = sourceString.characters8();
309}
310
311template <>
312ALWAYS_INLINE void Lexer<UChar>::setCodeStart(const StringView& sourceString)
313{
314 ASSERT(!sourceString.is8Bit());
315 m_codeStart = sourceString.characters16();
316}
317
318template <typename T>
319ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifierLCharFromUChar(const UChar* characters, size_t length)
320{
321 return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
322}
323
324template <typename T>
325ALWAYS_INLINE const Identifier* Lexer<T>::makeLCharIdentifier(const LChar* characters, size_t length)
326{
327 return &m_arena->makeIdentifier(m_vm, characters, length);
328}
329
330template <typename T>
331ALWAYS_INLINE const Identifier* Lexer<T>::makeLCharIdentifier(const UChar* characters, size_t length)
332{
333 return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
334}
335
336#if ASSERT_DISABLED
337ALWAYS_INLINE bool isSafeBuiltinIdentifier(VM&, const Identifier*) { return true; }
338#else
339bool isSafeBuiltinIdentifier(VM&, const Identifier*);
340#endif
341
342template <typename T>
343ALWAYS_INLINE JSTokenType Lexer<T>::lexExpectIdentifier(JSToken* tokenRecord, unsigned lexerFlags, bool strictMode)
344{
345 JSTokenData* tokenData = &tokenRecord->m_data;
346 JSTokenLocation* tokenLocation = &tokenRecord->m_location;
347 ASSERT((lexerFlags & LexerFlagsIgnoreReservedWords));
348 const T* start = m_code;
349 const T* ptr = start;
350 const T* end = m_codeEnd;
351 JSTextPosition startPosition = currentPosition();
352 if (ptr >= end) {
353 ASSERT(ptr == end);
354 goto slowCase;
355 }
356 if (!WTF::isASCIIAlpha(*ptr))
357 goto slowCase;
358 ++ptr;
359 while (ptr < end) {
360 if (!WTF::isASCIIAlphanumeric(*ptr))
361 break;
362 ++ptr;
363 }
364
365 // Here's the shift
366 if (ptr < end) {
367 if ((!WTF::isASCII(*ptr)) || (*ptr == '\\') || (*ptr == '_') || (*ptr == '$'))
368 goto slowCase;
369 m_current = *ptr;
370 } else
371 m_current = 0;
372
373 m_code = ptr;
374 ASSERT(currentOffset() >= currentLineStartOffset());
375
376 // Create the identifier if needed
377 if (lexerFlags & LexexFlagsDontBuildKeywords
378#if !ASSERT_DISABLED
379 && !m_parsingBuiltinFunction
380#endif
381 )
382 tokenData->ident = 0;
383 else
384 tokenData->ident = makeLCharIdentifier(start, ptr - start);
385
386 tokenLocation->line = m_lineNumber;
387 tokenLocation->lineStartOffset = currentLineStartOffset();
388 tokenLocation->startOffset = offsetFromSourcePtr(start);
389 tokenLocation->endOffset = currentOffset();
390 ASSERT(tokenLocation->startOffset >= tokenLocation->lineStartOffset);
391 tokenRecord->m_startPosition = startPosition;
392 tokenRecord->m_endPosition = currentPosition();
393#if !ASSERT_DISABLED
394 if (m_parsingBuiltinFunction) {
395 if (!isSafeBuiltinIdentifier(*m_vm, tokenData->ident))
396 return ERRORTOK;
397 }
398#endif
399
400 m_lastToken = IDENT;
401 return IDENT;
402
403slowCase:
404 return lex(tokenRecord, lexerFlags, strictMode);
405}
406
407template <typename T>
408ALWAYS_INLINE JSTokenType Lexer<T>::lex(JSToken* tokenRecord, unsigned lexerFlags, bool strictMode)
409{
410 m_hasLineTerminatorBeforeToken = false;
411 return lexWithoutClearingLineTerminator(tokenRecord, lexerFlags, strictMode);
412}
413
414} // namespace JSC
415