1/*
2 * Copyright (C) 2016-2019 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23 * THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#include "config.h"
27#include <wtf/URLParser.h>
28
29#include <array>
30#include <mutex>
31#include <unicode/uidna.h>
32#include <unicode/utf8.h>
33#include <unicode/utypes.h>
34
35namespace WTF {
36
// Compile-time switch for parser logging: when set to a non-zero value,
// URL_PARSER_LOG forwards its arguments to WTFLogAlways.
#define URL_PARSER_DEBUGGING 0

#if URL_PARSER_DEBUGGING
#define URL_PARSER_LOG(...) WTFLogAlways(__VA_ARGS__)
#else
// Logging disabled: URL_PARSER_LOG expands to nothing.
#define URL_PARSER_LOG(...)
#endif
44
45template<typename CharacterType>
46class CodePointIterator {
47 WTF_MAKE_FAST_ALLOCATED;
48public:
49 ALWAYS_INLINE CodePointIterator() { }
50 ALWAYS_INLINE CodePointIterator(const CharacterType* begin, const CharacterType* end)
51 : m_begin(begin)
52 , m_end(end)
53 {
54 }
55
56 ALWAYS_INLINE CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
57 : CodePointIterator(begin.m_begin, end.m_begin)
58 {
59 ASSERT(end.m_begin >= begin.m_begin);
60 }
61
62 ALWAYS_INLINE UChar32 operator*() const;
63 ALWAYS_INLINE CodePointIterator& operator++();
64
65 ALWAYS_INLINE bool operator==(const CodePointIterator& other) const
66 {
67 return m_begin == other.m_begin
68 && m_end == other.m_end;
69 }
70 ALWAYS_INLINE bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
71
72 ALWAYS_INLINE bool atEnd() const
73 {
74 ASSERT(m_begin <= m_end);
75 return m_begin >= m_end;
76 }
77
78 ALWAYS_INLINE size_t codeUnitsSince(const CharacterType* reference) const
79 {
80 ASSERT(m_begin >= reference);
81 return m_begin - reference;
82 }
83
84 ALWAYS_INLINE size_t codeUnitsSince(const CodePointIterator& other) const
85 {
86 return codeUnitsSince(other.m_begin);
87 }
88
89private:
90 const CharacterType* m_begin { nullptr };
91 const CharacterType* m_end { nullptr };
92};
93
94template<>
95ALWAYS_INLINE UChar32 CodePointIterator<LChar>::operator*() const
96{
97 ASSERT(!atEnd());
98 return *m_begin;
99}
100
101template<>
102ALWAYS_INLINE auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
103{
104 m_begin++;
105 return *this;
106}
107
108template<>
109ALWAYS_INLINE UChar32 CodePointIterator<UChar>::operator*() const
110{
111 ASSERT(!atEnd());
112 UChar32 c;
113 U16_GET(m_begin, 0, 0, m_end - m_begin, c);
114 return c;
115}
116
117template<>
118ALWAYS_INLINE auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
119{
120 unsigned i = 0;
121 size_t length = m_end - m_begin;
122 U16_FWD_1(m_begin, i, length);
123 m_begin += i;
124 return *this;
125}
126
127ALWAYS_INLINE static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
128{
129 if (U_IS_BMP(codePoint)) {
130 destination.append(static_cast<UChar>(codePoint));
131 return;
132 }
133 destination.reserveCapacity(destination.size() + 2);
134 destination.uncheckedAppend(U16_LEAD(codePoint));
135 destination.uncheckedAppend(U16_TRAIL(codePoint));
136}
137
// Bit flags stored in characterClassTable; each table entry is the bitwise OR
// of the classes a code unit belongs to.
enum URLCharacterClass {
    UserInfo = 1 << 0,
    Default = 1 << 1,
    ForbiddenHost = 1 << 2,
    QueryPercent = 1 << 3,
    SlashQuestionOrHash = 1 << 4,
    ValidScheme = 1 << 5,
};
146
// Lookup table with one entry per 8-bit code unit value (0x00-0xFF). Each
// entry is a bitwise OR of URLCharacterClass flags; the classification helpers
// in this file index it by character and mask the flag they are interested in.
// The trailing comment on each line names the code unit the entry describes.
static const uint8_t characterClassTable[256] = {
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x0
    UserInfo | Default | QueryPercent, // 0x1
    UserInfo | Default | QueryPercent, // 0x2
    UserInfo | Default | QueryPercent, // 0x3
    UserInfo | Default | QueryPercent, // 0x4
    UserInfo | Default | QueryPercent, // 0x5
    UserInfo | Default | QueryPercent, // 0x6
    UserInfo | Default | QueryPercent, // 0x7
    UserInfo | Default | QueryPercent, // 0x8
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x9
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0xA
    UserInfo | Default | QueryPercent, // 0xB
    UserInfo | Default | QueryPercent, // 0xC
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0xD
    UserInfo | Default | QueryPercent, // 0xE
    UserInfo | Default | QueryPercent, // 0xF
    UserInfo | Default | QueryPercent, // 0x10
    UserInfo | Default | QueryPercent, // 0x11
    UserInfo | Default | QueryPercent, // 0x12
    UserInfo | Default | QueryPercent, // 0x13
    UserInfo | Default | QueryPercent, // 0x14
    UserInfo | Default | QueryPercent, // 0x15
    UserInfo | Default | QueryPercent, // 0x16
    UserInfo | Default | QueryPercent, // 0x17
    UserInfo | Default | QueryPercent, // 0x18
    UserInfo | Default | QueryPercent, // 0x19
    UserInfo | Default | QueryPercent, // 0x1A
    UserInfo | Default | QueryPercent, // 0x1B
    UserInfo | Default | QueryPercent, // 0x1C
    UserInfo | Default | QueryPercent, // 0x1D
    UserInfo | Default | QueryPercent, // 0x1E
    UserInfo | Default | QueryPercent, // 0x1F
    UserInfo | Default | QueryPercent | ForbiddenHost, // ' '
    0, // '!'
    UserInfo | Default | QueryPercent, // '"'
    UserInfo | Default | QueryPercent | SlashQuestionOrHash | ForbiddenHost, // '#'
    0, // '$'
    ForbiddenHost, // '%'
    0, // '&'
    0, // '\''
    0, // '('
    0, // ')'
    0, // '*'
    ValidScheme, // '+'
    0, // ','
    ValidScheme, // '-'
    ValidScheme, // '.'
    UserInfo | SlashQuestionOrHash | ForbiddenHost, // '/'
    ValidScheme, // '0'
    ValidScheme, // '1'
    ValidScheme, // '2'
    ValidScheme, // '3'
    ValidScheme, // '4'
    ValidScheme, // '5'
    ValidScheme, // '6'
    ValidScheme, // '7'
    ValidScheme, // '8'
    ValidScheme, // '9'
    UserInfo | ForbiddenHost, // ':'
    UserInfo, // ';'
    UserInfo | Default | QueryPercent | ForbiddenHost, // '<'
    UserInfo, // '='
    UserInfo | Default | QueryPercent | ForbiddenHost, // '>'
    UserInfo | Default | SlashQuestionOrHash | ForbiddenHost, // '?'
    UserInfo | ForbiddenHost, // '@'
    ValidScheme, // 'A'
    ValidScheme, // 'B'
    ValidScheme, // 'C'
    ValidScheme, // 'D'
    ValidScheme, // 'E'
    ValidScheme, // 'F'
    ValidScheme, // 'G'
    ValidScheme, // 'H'
    ValidScheme, // 'I'
    ValidScheme, // 'J'
    ValidScheme, // 'K'
    ValidScheme, // 'L'
    ValidScheme, // 'M'
    ValidScheme, // 'N'
    ValidScheme, // 'O'
    ValidScheme, // 'P'
    ValidScheme, // 'Q'
    ValidScheme, // 'R'
    ValidScheme, // 'S'
    ValidScheme, // 'T'
    ValidScheme, // 'U'
    ValidScheme, // 'V'
    ValidScheme, // 'W'
    ValidScheme, // 'X'
    ValidScheme, // 'Y'
    ValidScheme, // 'Z'
    UserInfo | ForbiddenHost, // '['
    UserInfo | SlashQuestionOrHash | ForbiddenHost, // '\\'
    UserInfo | ForbiddenHost, // ']'
    UserInfo, // '^'
    0, // '_'
    UserInfo | Default, // '`'
    ValidScheme, // 'a'
    ValidScheme, // 'b'
    ValidScheme, // 'c'
    ValidScheme, // 'd'
    ValidScheme, // 'e'
    ValidScheme, // 'f'
    ValidScheme, // 'g'
    ValidScheme, // 'h'
    ValidScheme, // 'i'
    ValidScheme, // 'j'
    ValidScheme, // 'k'
    ValidScheme, // 'l'
    ValidScheme, // 'm'
    ValidScheme, // 'n'
    ValidScheme, // 'o'
    ValidScheme, // 'p'
    ValidScheme, // 'q'
    ValidScheme, // 'r'
    ValidScheme, // 's'
    ValidScheme, // 't'
    ValidScheme, // 'u'
    ValidScheme, // 'v'
    ValidScheme, // 'w'
    ValidScheme, // 'x'
    ValidScheme, // 'y'
    ValidScheme, // 'z'
    UserInfo | Default, // '{'
    UserInfo, // '|'
    UserInfo | Default, // '}'
    0, // '~'
    QueryPercent, // 0x7F
    QueryPercent, // 0x80
    QueryPercent, // 0x81
    QueryPercent, // 0x82
    QueryPercent, // 0x83
    QueryPercent, // 0x84
    QueryPercent, // 0x85
    QueryPercent, // 0x86
    QueryPercent, // 0x87
    QueryPercent, // 0x88
    QueryPercent, // 0x89
    QueryPercent, // 0x8A
    QueryPercent, // 0x8B
    QueryPercent, // 0x8C
    QueryPercent, // 0x8D
    QueryPercent, // 0x8E
    QueryPercent, // 0x8F
    QueryPercent, // 0x90
    QueryPercent, // 0x91
    QueryPercent, // 0x92
    QueryPercent, // 0x93
    QueryPercent, // 0x94
    QueryPercent, // 0x95
    QueryPercent, // 0x96
    QueryPercent, // 0x97
    QueryPercent, // 0x98
    QueryPercent, // 0x99
    QueryPercent, // 0x9A
    QueryPercent, // 0x9B
    QueryPercent, // 0x9C
    QueryPercent, // 0x9D
    QueryPercent, // 0x9E
    QueryPercent, // 0x9F
    QueryPercent, // 0xA0
    QueryPercent, // 0xA1
    QueryPercent, // 0xA2
    QueryPercent, // 0xA3
    QueryPercent, // 0xA4
    QueryPercent, // 0xA5
    QueryPercent, // 0xA6
    QueryPercent, // 0xA7
    QueryPercent, // 0xA8
    QueryPercent, // 0xA9
    QueryPercent, // 0xAA
    QueryPercent, // 0xAB
    QueryPercent, // 0xAC
    QueryPercent, // 0xAD
    QueryPercent, // 0xAE
    QueryPercent, // 0xAF
    QueryPercent, // 0xB0
    QueryPercent, // 0xB1
    QueryPercent, // 0xB2
    QueryPercent, // 0xB3
    QueryPercent, // 0xB4
    QueryPercent, // 0xB5
    QueryPercent, // 0xB6
    QueryPercent, // 0xB7
    QueryPercent, // 0xB8
    QueryPercent, // 0xB9
    QueryPercent, // 0xBA
    QueryPercent, // 0xBB
    QueryPercent, // 0xBC
    QueryPercent, // 0xBD
    QueryPercent, // 0xBE
    QueryPercent, // 0xBF
    QueryPercent, // 0xC0
    QueryPercent, // 0xC1
    QueryPercent, // 0xC2
    QueryPercent, // 0xC3
    QueryPercent, // 0xC4
    QueryPercent, // 0xC5
    QueryPercent, // 0xC6
    QueryPercent, // 0xC7
    QueryPercent, // 0xC8
    QueryPercent, // 0xC9
    QueryPercent, // 0xCA
    QueryPercent, // 0xCB
    QueryPercent, // 0xCC
    QueryPercent, // 0xCD
    QueryPercent, // 0xCE
    QueryPercent, // 0xCF
    QueryPercent, // 0xD0
    QueryPercent, // 0xD1
    QueryPercent, // 0xD2
    QueryPercent, // 0xD3
    QueryPercent, // 0xD4
    QueryPercent, // 0xD5
    QueryPercent, // 0xD6
    QueryPercent, // 0xD7
    QueryPercent, // 0xD8
    QueryPercent, // 0xD9
    QueryPercent, // 0xDA
    QueryPercent, // 0xDB
    QueryPercent, // 0xDC
    QueryPercent, // 0xDD
    QueryPercent, // 0xDE
    QueryPercent, // 0xDF
    QueryPercent, // 0xE0
    QueryPercent, // 0xE1
    QueryPercent, // 0xE2
    QueryPercent, // 0xE3
    QueryPercent, // 0xE4
    QueryPercent, // 0xE5
    QueryPercent, // 0xE6
    QueryPercent, // 0xE7
    QueryPercent, // 0xE8
    QueryPercent, // 0xE9
    QueryPercent, // 0xEA
    QueryPercent, // 0xEB
    QueryPercent, // 0xEC
    QueryPercent, // 0xED
    QueryPercent, // 0xEE
    QueryPercent, // 0xEF
    QueryPercent, // 0xF0
    QueryPercent, // 0xF1
    QueryPercent, // 0xF2
    QueryPercent, // 0xF3
    QueryPercent, // 0xF4
    QueryPercent, // 0xF5
    QueryPercent, // 0xF6
    QueryPercent, // 0xF7
    QueryPercent, // 0xF8
    QueryPercent, // 0xF9
    QueryPercent, // 0xFA
    QueryPercent, // 0xFB
    QueryPercent, // 0xFC
    QueryPercent, // 0xFD
    QueryPercent, // 0xFE
    QueryPercent, // 0xFF
};
405
406template<typename CharacterType> ALWAYS_INLINE static bool isC0Control(CharacterType character) { return character <= 0x1F; }
407template<typename CharacterType> ALWAYS_INLINE static bool isC0ControlOrSpace(CharacterType character) { return character <= 0x20; }
408template<typename CharacterType> ALWAYS_INLINE static bool isTabOrNewline(CharacterType character) { return character <= 0xD && character >= 0x9 && character != 0xB && character != 0xC; }
409template<typename CharacterType> ALWAYS_INLINE static bool isInSimpleEncodeSet(CharacterType character) { return character > 0x7E || isC0Control(character); }
410template<typename CharacterType> ALWAYS_INLINE static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; }
411template<typename CharacterType> ALWAYS_INLINE static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; }
412template<typename CharacterType> ALWAYS_INLINE static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; }
413template<typename CharacterType> ALWAYS_INLINE static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; }
414template<typename CharacterType> ALWAYS_INLINE static bool isValidSchemeCharacter(CharacterType character) { return character <= 'z' && characterClassTable[character] & ValidScheme; }
415template<typename CharacterType> ALWAYS_INLINE static bool isForbiddenHostCodePoint(CharacterType character) { return character <= ']' && characterClassTable[character] & ForbiddenHost; }
416ALWAYS_INLINE static bool shouldPercentEncodeQueryByte(uint8_t byte, const bool& urlIsSpecial)
417{
418 if (characterClassTable[byte] & QueryPercent)
419 return true;
420 if (byte == '\'' && urlIsSpecial)
421 return true;
422 return false;
423}
424
425bool URLParser::isInUserInfoEncodeSet(UChar c)
426{
427 return WTF::isInUserInfoEncodeSet(c);
428}
429
430template<typename CharacterType, URLParser::ReportSyntaxViolation reportSyntaxViolation>
431ALWAYS_INLINE void URLParser::advance(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
432{
433 ++iterator;
434 while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
435 if (reportSyntaxViolation == ReportSyntaxViolation::Yes)
436 syntaxViolation(iteratorForSyntaxViolationPosition);
437 ++iterator;
438 }
439}
440
441template<typename CharacterType>
442bool URLParser::takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType> iterator)
443{
444 if (iterator.atEnd())
445 return false;
446 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
447 if (iterator.atEnd())
448 return false;
449 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
450 return iterator.atEnd();
451}
452
453template<typename CharacterType>
454ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
455{
456 if (iterator.atEnd() || !isASCIIAlpha(*iterator))
457 return false;
458 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
459 if (iterator.atEnd())
460 return false;
461 if (*iterator == ':')
462 return true;
463 if (UNLIKELY(*iterator == '|'))
464 return true;
465 return false;
466}
467
468ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
469{
470 ASSERT(isASCII(codePoint));
471 if (UNLIKELY(m_didSeeSyntaxViolation))
472 m_asciiBuffer.append(codePoint);
473}
474
475ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
476{
477 if (UNLIKELY(m_didSeeSyntaxViolation))
478 m_asciiBuffer.append(characters, length);
479}
480
481template<typename CharacterType>
482void URLParser::appendWindowsDriveLetter(CodePointIterator<CharacterType>& iterator)
483{
484 ASSERT(isWindowsDriveLetter(iterator));
485 appendToASCIIBuffer(*iterator);
486 advance(iterator);
487 ASSERT(!iterator.atEnd());
488 ASSERT(*iterator == ':' || *iterator == '|');
489 if (*iterator == '|')
490 syntaxViolation(iterator);
491 appendToASCIIBuffer(':');
492 advance(iterator);
493}
494
495bool URLParser::copyBaseWindowsDriveLetter(const URL& base)
496{
497 if (base.protocolIs("file")) {
498 RELEASE_ASSERT(base.m_hostEnd + base.m_portLength < base.m_string.length());
499 if (base.m_string.is8Bit()) {
500 const LChar* begin = base.m_string.characters8();
501 CodePointIterator<LChar> c(begin + base.m_hostEnd + base.m_portLength + 1, begin + base.m_string.length());
502 if (isWindowsDriveLetter(c)) {
503 appendWindowsDriveLetter(c);
504 return true;
505 }
506 } else {
507 const UChar* begin = base.m_string.characters16();
508 CodePointIterator<UChar> c(begin + base.m_hostEnd + base.m_portLength + 1, begin + base.m_string.length());
509 if (isWindowsDriveLetter(c)) {
510 appendWindowsDriveLetter(c);
511 return true;
512 }
513 }
514 }
515 return false;
516}
517
518template<typename CharacterType>
519bool URLParser::shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
520{
521 if (!isWindowsDriveLetter(iterator))
522 return true;
523 if (iterator.atEnd())
524 return false;
525 advance(iterator);
526 if (iterator.atEnd())
527 return true;
528 advance(iterator);
529 if (iterator.atEnd())
530 return true;
531 return !isSlashQuestionOrHash(*iterator);
532}
533
534static void percentEncodeByte(uint8_t byte, Vector<LChar>& buffer)
535{
536 buffer.append('%');
537 buffer.append(upperNibbleToASCIIHexDigit(byte));
538 buffer.append(lowerNibbleToASCIIHexDigit(byte));
539}
540
541void URLParser::percentEncodeByte(uint8_t byte)
542{
543 ASSERT(m_didSeeSyntaxViolation);
544 appendToASCIIBuffer('%');
545 appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte));
546 appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte));
547}
548
// Percent-encoded bytes EF BF BD, emitted in place of code points that are
// not valid Unicode characters. The length excludes the terminating NUL.
const char replacementCharacterUTF8PercentEncoded[10] = "%EF%BF%BD";
const size_t replacementCharacterUTF8PercentEncodedLength =
    sizeof(replacementCharacterUTF8PercentEncoded) / sizeof(replacementCharacterUTF8PercentEncoded[0]) - 1;
551
552template<bool(*isInCodeSet)(UChar32), typename CharacterType>
553ALWAYS_INLINE void URLParser::utf8PercentEncode(const CodePointIterator<CharacterType>& iterator)
554{
555 ASSERT(!iterator.atEnd());
556 UChar32 codePoint = *iterator;
557 if (LIKELY(isASCII(codePoint))) {
558 if (UNLIKELY(isInCodeSet(codePoint))) {
559 syntaxViolation(iterator);
560 percentEncodeByte(codePoint);
561 } else
562 appendToASCIIBuffer(codePoint);
563 return;
564 }
565 ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters");
566 syntaxViolation(iterator);
567
568 if (!U_IS_UNICODE_CHAR(codePoint)) {
569 appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
570 return;
571 }
572
573 uint8_t buffer[U8_MAX_LENGTH];
574 int32_t offset = 0;
575 U8_APPEND_UNSAFE(buffer, offset, codePoint);
576 for (int32_t i = 0; i < offset; ++i)
577 percentEncodeByte(buffer[i]);
578}
579
580template<typename CharacterType>
581ALWAYS_INLINE void URLParser::utf8QueryEncode(const CodePointIterator<CharacterType>& iterator)
582{
583 ASSERT(!iterator.atEnd());
584 UChar32 codePoint = *iterator;
585 if (LIKELY(isASCII(codePoint))) {
586 if (UNLIKELY(shouldPercentEncodeQueryByte(codePoint, m_urlIsSpecial))) {
587 syntaxViolation(iterator);
588 percentEncodeByte(codePoint);
589 } else
590 appendToASCIIBuffer(codePoint);
591 return;
592 }
593
594 syntaxViolation(iterator);
595
596 if (!U_IS_UNICODE_CHAR(codePoint)) {
597 appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
598 return;
599 }
600
601 uint8_t buffer[U8_MAX_LENGTH];
602 int32_t offset = 0;
603 U8_APPEND_UNSAFE(buffer, offset, codePoint);
604 for (int32_t i = 0; i < offset; ++i) {
605 auto byte = buffer[i];
606 if (shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))
607 percentEncodeByte(byte);
608 else
609 appendToASCIIBuffer(byte);
610 }
611}
612
// Encodes |source| with |encoding| and appends the resulting bytes to the
// ASCII buffer, percent-encoding those for which shouldPercentEncodeQueryByte()
// is true. |iterator| walks the original input in parallel so that, as long as
// the encoded bytes equal the input and need no escaping, no syntax violation
// is recorded.
template<typename CharacterType>
void URLParser::encodeNonUTF8Query(const Vector<UChar>& source, const URLTextEncoding& encoding, CodePointIterator<CharacterType> iterator)
{
    auto encoded = encoding.encodeForURLParsing(StringView(source.data(), source.size()));
    auto* data = encoded.data();
    size_t length = encoded.size();

    // True when exactly one side is empty: no encoded bytes while input
    // remains, or encoded bytes while the input is already exhausted.
    if (!length == !iterator.atEnd()) {
        syntaxViolation(iterator);
        return;
    }

    // Phase 1: consume encoded bytes while they are identical to the input and
    // need no percent-encoding. The first mismatch records a syntax violation
    // and leaves |i| at the offending byte for phase 2.
    size_t i = 0;
    for (; i < length; ++i) {
        ASSERT(!iterator.atEnd());
        uint8_t byte = data[i];
        if (UNLIKELY(byte != *iterator)) {
            syntaxViolation(iterator);
            break;
        }
        if (UNLIKELY(shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))) {
            syntaxViolation(iterator);
            break;
        }
        appendToASCIIBuffer(byte);
        ++iterator;
    }
    // Skip any tab or newline characters remaining at the current input position.
    while (!iterator.atEnd() && isTabOrNewline(*iterator))
        ++iterator;
    ASSERT((i == length) == iterator.atEnd());
    // Phase 2: runs only after a violation; emit the remaining encoded bytes,
    // percent-encoding where required.
    for (; i < length; ++i) {
        ASSERT(m_didSeeSyntaxViolation);
        uint8_t byte = data[i];
        if (shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))
            percentEncodeByte(byte);
        else
            appendToASCIIBuffer(byte);
    }
}
652
653Optional<uint16_t> URLParser::defaultPortForProtocol(StringView scheme)
654{
655 static constexpr uint16_t ftpPort = 21;
656 static constexpr uint16_t httpPort = 80;
657 static constexpr uint16_t httpsPort = 443;
658 static constexpr uint16_t wsPort = 80;
659 static constexpr uint16_t wssPort = 443;
660
661 auto length = scheme.length();
662 if (!length)
663 return WTF::nullopt;
664 switch (scheme[0]) {
665 case 'w':
666 switch (length) {
667 case 2:
668 if (scheme[1] == 's')
669 return wsPort;
670 return WTF::nullopt;
671 case 3:
672 if (scheme[1] == 's'
673 && scheme[2] == 's')
674 return wssPort;
675 return WTF::nullopt;
676 default:
677 return false;
678 }
679 case 'h':
680 switch (length) {
681 case 4:
682 if (scheme[1] == 't'
683 && scheme[2] == 't'
684 && scheme[3] == 'p')
685 return httpPort;
686 return WTF::nullopt;
687 case 5:
688 if (scheme[1] == 't'
689 && scheme[2] == 't'
690 && scheme[3] == 'p'
691 && scheme[4] == 's')
692 return httpsPort;
693 return WTF::nullopt;
694 default:
695 return WTF::nullopt;
696 }
697 case 'f':
698 if (length == 3
699 && scheme[1] == 't'
700 && scheme[2] == 'p')
701 return ftpPort;
702 return WTF::nullopt;
703 default:
704 return WTF::nullopt;
705 }
706}
707
// Result of classifying a scheme string with scheme(): one enumerator per
// recognized scheme, and NonSpecial for everything else.
enum class Scheme {
    WS,
    WSS,
    File,
    FTP,
    HTTP,
    HTTPS,
    NonSpecial
};
717
718ALWAYS_INLINE static Scheme scheme(StringView scheme)
719{
720 auto length = scheme.length();
721 if (!length)
722 return Scheme::NonSpecial;
723 switch (scheme[0]) {
724 case 'f':
725 switch (length) {
726 case 3:
727 if (scheme[1] == 't'
728 && scheme[2] == 'p')
729 return Scheme::FTP;
730 return Scheme::NonSpecial;
731 case 4:
732 if (scheme[1] == 'i'
733 && scheme[2] == 'l'
734 && scheme[3] == 'e')
735 return Scheme::File;
736 return Scheme::NonSpecial;
737 default:
738 return Scheme::NonSpecial;
739 }
740 case 'h':
741 switch (length) {
742 case 4:
743 if (scheme[1] == 't'
744 && scheme[2] == 't'
745 && scheme[3] == 'p')
746 return Scheme::HTTP;
747 return Scheme::NonSpecial;
748 case 5:
749 if (scheme[1] == 't'
750 && scheme[2] == 't'
751 && scheme[3] == 'p'
752 && scheme[4] == 's')
753 return Scheme::HTTPS;
754 return Scheme::NonSpecial;
755 default:
756 return Scheme::NonSpecial;
757 }
758 case 'w':
759 switch (length) {
760 case 2:
761 if (scheme[1] == 's')
762 return Scheme::WS;
763 return Scheme::NonSpecial;
764 case 3:
765 if (scheme[1] == 's'
766 && scheme[2] == 's')
767 return Scheme::WSS;
768 return Scheme::NonSpecial;
769 default:
770 return Scheme::NonSpecial;
771 }
772 default:
773 return Scheme::NonSpecial;
774 }
775}
776
777Optional<String> URLParser::maybeCanonicalizeScheme(const String& scheme)
778{
779 if (scheme.isEmpty())
780 return WTF::nullopt;
781
782 if (!isASCIIAlpha(scheme[0]))
783 return WTF::nullopt;
784
785 for (size_t i = 1; i < scheme.length(); ++i) {
786 if (isASCIIAlphanumeric(scheme[i]) || scheme[i] == '+' || scheme[i] == '-' || scheme[i] == '.')
787 continue;
788 return WTF::nullopt;
789 }
790
791 return scheme.convertToASCIILowercase();
792}
793
794bool URLParser::isSpecialScheme(const String& schemeArg)
795{
796 return scheme(schemeArg) != Scheme::NonSpecial;
797}
798
// Identifies a boundary inside a parsed URL. urlLengthUntilPart() maps each
// value to the matching offset stored in a URL, and copyURLPartsUntil() copies
// a base URL up to the chosen boundary.
enum class URLParser::URLPart {
    SchemeEnd,
    UserStart,
    UserEnd,
    PasswordEnd,
    HostEnd,
    PortEnd,
    PathAfterLastSlash,
    PathEnd,
    QueryEnd,
};
810
811size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
812{
813 switch (part) {
814 case URLPart::QueryEnd:
815 return url.m_queryEnd;
816 case URLPart::PathEnd:
817 return url.m_pathEnd;
818 case URLPart::PathAfterLastSlash:
819 return url.m_pathAfterLastSlash;
820 case URLPart::PortEnd:
821 return url.m_hostEnd + url.m_portLength;
822 case URLPart::HostEnd:
823 return url.m_hostEnd;
824 case URLPart::PasswordEnd:
825 return url.m_passwordEnd;
826 case URLPart::UserEnd:
827 return url.m_userEnd;
828 case URLPart::UserStart:
829 return url.m_userStart;
830 case URLPart::SchemeEnd:
831 return url.m_schemeEnd;
832 }
833 ASSERT_NOT_REACHED();
834 return 0;
835}
836
837void URLParser::copyASCIIStringUntil(const String& string, size_t length)
838{
839 RELEASE_ASSERT(length <= string.length());
840 if (string.isNull())
841 return;
842 ASSERT(m_asciiBuffer.isEmpty());
843 if (string.is8Bit())
844 appendToASCIIBuffer(string.characters8(), length);
845 else {
846 const UChar* characters = string.characters16();
847 for (size_t i = 0; i < length; ++i) {
848 UChar c = characters[i];
849 ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
850 appendToASCIIBuffer(c);
851 }
852 }
853}
854
// Starts the URL being built as a copy of |base| up to |part|: records a
// syntax violation at |iterator|, replaces the ASCII buffer with the matching
// prefix of the base string, copies the offsets at or before |part|, and
// re-derives the scheme flags from the copied scheme. |nonUTF8QueryEncoding|
// is reset to nullptr for ws, wss and non-special schemes.
template<typename CharacterType>
void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePointIterator<CharacterType>& iterator, const URLTextEncoding*& nonUTF8QueryEncoding)
{
    syntaxViolation(iterator);

    m_asciiBuffer.clear();
    copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part));
    // Deliberate fallthrough cascade: entering at |part| copies that offset
    // and every offset listed below it.
    switch (part) {
    case URLPart::QueryEnd:
        m_url.m_queryEnd = base.m_queryEnd;
        FALLTHROUGH;
    case URLPart::PathEnd:
        m_url.m_pathEnd = base.m_pathEnd;
        FALLTHROUGH;
    case URLPart::PathAfterLastSlash:
        m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
        FALLTHROUGH;
    case URLPart::PortEnd:
        m_url.m_portLength = base.m_portLength;
        FALLTHROUGH;
    case URLPart::HostEnd:
        m_url.m_hostEnd = base.m_hostEnd;
        FALLTHROUGH;
    case URLPart::PasswordEnd:
        m_url.m_passwordEnd = base.m_passwordEnd;
        FALLTHROUGH;
    case URLPart::UserEnd:
        m_url.m_userEnd = base.m_userEnd;
        FALLTHROUGH;
    case URLPart::UserStart:
        m_url.m_userStart = base.m_userStart;
        FALLTHROUGH;
    case URLPart::SchemeEnd:
        m_url.m_isValid = base.m_isValid;
        m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
        m_url.m_schemeEnd = base.m_schemeEnd;
    }
    // Classify the scheme that was just copied into the buffer.
    switch (scheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd))) {
    case Scheme::WS:
    case Scheme::WSS:
        nonUTF8QueryEncoding = nullptr;
        m_urlIsSpecial = true;
        return;
    case Scheme::File:
        m_urlIsFile = true;
        FALLTHROUGH;
    case Scheme::FTP:
    case Scheme::HTTP:
    case Scheme::HTTPS:
        m_urlIsSpecial = true;
        return;
    case Scheme::NonSpecial:
        m_urlIsSpecial = false;
        nonUTF8QueryEncoding = nullptr;
        return;
    }
    ASSERT_NOT_REACHED();
}
913
// The two hex digits that follow '%' in a percent-encoded '.' ("%2e"). The
// dot-segment helpers compare the second digit after toASCIILower().
static const char dotASCIICode[2] = {'2', 'e'};
915
916template<typename CharacterType>
917ALWAYS_INLINE bool URLParser::isSingleDotPathSegment(CodePointIterator<CharacterType> c)
918{
919 if (c.atEnd())
920 return false;
921 if (*c == '.') {
922 advance<CharacterType, ReportSyntaxViolation::No>(c);
923 return c.atEnd() || isSlashQuestionOrHash(*c);
924 }
925 if (*c != '%')
926 return false;
927 advance<CharacterType, ReportSyntaxViolation::No>(c);
928 if (c.atEnd() || *c != dotASCIICode[0])
929 return false;
930 advance<CharacterType, ReportSyntaxViolation::No>(c);
931 if (c.atEnd())
932 return false;
933 if (toASCIILower(*c) == dotASCIICode[1]) {
934 advance<CharacterType, ReportSyntaxViolation::No>(c);
935 return c.atEnd() || isSlashQuestionOrHash(*c);
936 }
937 return false;
938}
939
940template<typename CharacterType>
941ALWAYS_INLINE bool URLParser::isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
942{
943 if (c.atEnd())
944 return false;
945 if (*c == '.') {
946 advance<CharacterType, ReportSyntaxViolation::No>(c);
947 return isSingleDotPathSegment(c);
948 }
949 if (*c != '%')
950 return false;
951 advance<CharacterType, ReportSyntaxViolation::No>(c);
952 if (c.atEnd() || *c != dotASCIICode[0])
953 return false;
954 advance<CharacterType, ReportSyntaxViolation::No>(c);
955 if (c.atEnd())
956 return false;
957 if (toASCIILower(*c) == dotASCIICode[1]) {
958 advance<CharacterType, ReportSyntaxViolation::No>(c);
959 return isSingleDotPathSegment(c);
960 }
961 return false;
962}
963
964template<typename CharacterType>
965void URLParser::consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
966{
967 ASSERT(isSingleDotPathSegment(c));
968 if (*c == '.') {
969 advance(c);
970 if (!c.atEnd()) {
971 if (*c == '/' || *c == '\\')
972 advance(c);
973 else
974 ASSERT(*c == '?' || *c == '#');
975 }
976 } else {
977 ASSERT(*c == '%');
978 advance(c);
979 ASSERT(*c == dotASCIICode[0]);
980 advance(c);
981 ASSERT(toASCIILower(*c) == dotASCIICode[1]);
982 advance(c);
983 if (!c.atEnd()) {
984 if (*c == '/' || *c == '\\')
985 advance(c);
986 else
987 ASSERT(*c == '?' || *c == '#');
988 }
989 }
990}
991
992template<typename CharacterType>
993void URLParser::consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
994{
995 ASSERT(isDoubleDotPathSegment(c));
996 if (*c == '.')
997 advance(c);
998 else {
999 ASSERT(*c == '%');
1000 advance(c);
1001 ASSERT(*c == dotASCIICode[0]);
1002 advance(c);
1003 ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1004 advance(c);
1005 }
1006 consumeSingleDotPathSegment(c);
1007}
1008
1009bool URLParser::shouldPopPath(unsigned newPathAfterLastSlash)
1010{
1011 ASSERT(m_didSeeSyntaxViolation);
1012 if (!m_urlIsFile)
1013 return true;
1014
1015 ASSERT(m_url.m_pathAfterLastSlash <= m_asciiBuffer.size());
1016 CodePointIterator<LChar> componentToPop(&m_asciiBuffer[newPathAfterLastSlash], &m_asciiBuffer[0] + m_url.m_pathAfterLastSlash);
1017 if (newPathAfterLastSlash == m_url.m_hostEnd + m_url.m_portLength + 1 && isWindowsDriveLetter(componentToPop))
1018 return false;
1019 return true;
1020}
1021
1022void URLParser::popPath()
1023{
1024 ASSERT(m_didSeeSyntaxViolation);
1025 if (m_url.m_pathAfterLastSlash > m_url.m_hostEnd + m_url.m_portLength + 1) {
1026 auto newPathAfterLastSlash = m_url.m_pathAfterLastSlash - 1;
1027 if (m_asciiBuffer[newPathAfterLastSlash] == '/')
1028 newPathAfterLastSlash--;
1029 while (newPathAfterLastSlash > m_url.m_hostEnd + m_url.m_portLength && m_asciiBuffer[newPathAfterLastSlash] != '/')
1030 newPathAfterLastSlash--;
1031 newPathAfterLastSlash++;
1032 if (shouldPopPath(newPathAfterLastSlash))
1033 m_url.m_pathAfterLastSlash = newPathAfterLastSlash;
1034 }
1035 m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
1036}
1037
1038template<typename CharacterType>
1039void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator)
1040{
1041 if (m_didSeeSyntaxViolation)
1042 return;
1043 m_didSeeSyntaxViolation = true;
1044
1045 ASSERT(m_asciiBuffer.isEmpty());
1046 size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1047 RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
1048 m_asciiBuffer.reserveCapacity(m_inputString.length());
1049 for (size_t i = 0; i < codeUnitsToCopy; ++i) {
1050 ASSERT(isASCII(m_inputString[i]));
1051 m_asciiBuffer.uncheckedAppend(m_inputString[i]);
1052 }
1053}
1054
// Gives up on parsing: calls invalidate() on the URL being built, then stores
// the original input string as its string.
void URLParser::failure()
{
    m_url.invalidate();
    m_url.m_string = m_inputString;
}
1060
1061template<typename CharacterType>
1062bool URLParser::checkLocalhostCodePoint(CodePointIterator<CharacterType>& iterator, UChar32 codePoint)
1063{
1064 if (iterator.atEnd() || toASCIILower(*iterator) != codePoint)
1065 return false;
1066 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
1067 return true;
1068}
1069
1070template<typename CharacterType>
1071bool URLParser::isAtLocalhost(CodePointIterator<CharacterType> iterator)
1072{
1073 if (!checkLocalhostCodePoint(iterator, 'l'))
1074 return false;
1075 if (!checkLocalhostCodePoint(iterator, 'o'))
1076 return false;
1077 if (!checkLocalhostCodePoint(iterator, 'c'))
1078 return false;
1079 if (!checkLocalhostCodePoint(iterator, 'a'))
1080 return false;
1081 if (!checkLocalhostCodePoint(iterator, 'l'))
1082 return false;
1083 if (!checkLocalhostCodePoint(iterator, 'h'))
1084 return false;
1085 if (!checkLocalhostCodePoint(iterator, 'o'))
1086 return false;
1087 if (!checkLocalhostCodePoint(iterator, 's'))
1088 return false;
1089 if (!checkLocalhostCodePoint(iterator, 't'))
1090 return false;
1091 return iterator.atEnd();
1092}
1093
1094bool URLParser::isLocalhost(StringView view)
1095{
1096 if (view.is8Bit())
1097 return isAtLocalhost(CodePointIterator<LChar>(view.characters8(), view.characters8() + view.length()));
1098 return isAtLocalhost(CodePointIterator<UChar>(view.characters16(), view.characters16() + view.length()));
1099}
1100
1101ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length)
1102{
1103 if (UNLIKELY(m_didSeeSyntaxViolation)) {
1104 ASSERT(start + length <= m_asciiBuffer.size());
1105 return StringView(m_asciiBuffer.data() + start, length);
1106 }
1107 ASSERT(start + length <= m_inputString.length());
1108 return StringView(m_inputString).substring(start, length);
1109}
1110
1111ALWAYS_INLINE UChar URLParser::parsedDataView(size_t position)
1112{
1113 if (UNLIKELY(m_didSeeSyntaxViolation))
1114 return m_asciiBuffer[position];
1115 return m_inputString[position];
1116}
1117
1118template<typename CharacterType>
1119ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
1120{
1121 if (UNLIKELY(m_didSeeSyntaxViolation))
1122 return m_asciiBuffer.size();
1123
1124 return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1125}
1126
1127URLParser::URLParser(const String& input, const URL& base, const URLTextEncoding* nonUTF8QueryEncoding)
1128 : m_inputString(input)
1129{
1130 if (input.isNull()) {
1131 if (base.isValid() && !base.m_cannotBeABaseURL) {
1132 m_url = base;
1133 m_url.removeFragmentIdentifier();
1134 }
1135 return;
1136 }
1137
1138 if (input.is8Bit()) {
1139 m_inputBegin = input.characters8();
1140 parse(input.characters8(), input.length(), base, nonUTF8QueryEncoding);
1141 } else {
1142 m_inputBegin = input.characters16();
1143 parse(input.characters16(), input.length(), base, nonUTF8QueryEncoding);
1144 }
1145
1146 ASSERT(!m_url.m_isValid
1147 || m_didSeeSyntaxViolation == (m_url.string() != input)
1148 || (input.isAllSpecialCharacters<isC0ControlOrSpace>()
1149 && m_url.m_string == base.m_string.left(base.m_queryEnd)));
1150 ASSERT(internalValuesConsistent(m_url));
1151#if !ASSERT_DISABLED
1152 if (!m_didSeeSyntaxViolation) {
1153 // Force a syntax violation at the beginning to make sure we get the same result.
1154 URLParser parser(makeString(" ", input), base, nonUTF8QueryEncoding);
1155 URL parsed = parser.result();
1156 if (parsed.isValid())
1157 ASSERT(allValuesEqual(parser.result(), m_url));
1158 }
1159#endif
1160}
1161
// Core state machine of the parser.
//
// Walks the code points of `input` (`length` code units), resolving against `base`
// when the input has no scheme of its own, and records the component boundaries
// (m_schemeEnd, m_userStart, ..., m_queryEnd) in m_url.
//
// If no syntax violation was recorded, the input string itself becomes the URL's
// string and m_asciiBuffer is asserted to be empty; otherwise m_asciiBuffer is
// adopted as the URL's string (see the end of this function).
//
// Parameters:
//   input, length         - the characters to parse (8-bit or 16-bit).
//   base                  - URL that relative input is resolved against; may be invalid.
//   nonUTF8QueryEncoding  - when non-null, the query is collected in State::NonUTF8Query
//                           and passed to encodeNonUTF8Query(); it is reset to nullptr
//                           for ws/wss and for non-special schemes.
//
// On error the function calls failure() and returns early. On success it sets
// m_url.m_string and m_url.m_isValid.
//
// NOTE(review): copyURLPartsUntil(), advance(), appendToASCIIBuffer() and
// parseHostAndPort() are defined outside this block; comments below that mention
// "inheriting from base" assume copyURLPartsUntil() copies base up to the named part.
1162template<typename CharacterType>
1163void URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const URLTextEncoding* nonUTF8QueryEncoding)
1164{
1165 URL_PARSER_LOG("Parsing URL <%s> base <%s>", String(input, length).utf8().data(), base.string().utf8().data());
1166 m_url = { };
1167 ASSERT(m_asciiBuffer.isEmpty());
1168
    // Accumulates the raw query code points while in State::NonUTF8Query.
1169 Vector<UChar> queryBuffer;
1170
    // Trim trailing C0 control characters and spaces; each one is a syntax violation.
1171 unsigned endIndex = length;
1172 while (UNLIKELY(endIndex && isC0ControlOrSpace(input[endIndex - 1]))) {
1173 syntaxViolation(CodePointIterator<CharacterType>(input, input));
1174 endIndex--;
1175 }
1176 CodePointIterator<CharacterType> c(input, input + endIndex);
1177 CodePointIterator<CharacterType> authorityOrHostBegin;
1178 CodePointIterator<CharacterType> queryBegin;
    // Skip leading C0 control characters and spaces, also as syntax violations.
1179 while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) {
1180 syntaxViolation(c);
1181 ++c;
1182 }
    // Restart point used when a tentative scheme turns out not to be a scheme.
1183 auto beginAfterControlAndSpace = c;
1184
1185 enum class State : uint8_t {
1186 SchemeStart,
1187 Scheme,
1188 NoScheme,
1189 SpecialRelativeOrAuthority,
1190 PathOrAuthority,
1191 Relative,
1192 RelativeSlash,
1193 SpecialAuthoritySlashes,
1194 SpecialAuthorityIgnoreSlashes,
1195 AuthorityOrHost,
1196 Host,
1197 File,
1198 FileSlash,
1199 FileHost,
1200 PathStart,
1201 Path,
1202 CannotBeABaseURLPath,
1203 UTF8Query,
1204 NonUTF8Query,
1205 Fragment,
1206 };
1207
1208#define LOG_STATE(x) URL_PARSER_LOG("State %s, code point %c, parsed data <%s> size %zu", x, *c, parsedDataView(0, currentPosition(c)).utf8().data(), currentPosition(c))
1209#define LOG_FINAL_STATE(x) URL_PARSER_LOG("Final State: %s", x)
1210
    // Main loop: one iteration per state step. A state may consume zero code points
    // (a pure transition) or many (the do/while states).
1211 State state = State::SchemeStart;
1212 while (!c.atEnd()) {
        // Tabs and newlines are dropped wherever they appear, as syntax violations.
1213 if (UNLIKELY(isTabOrNewline(*c))) {
1214 syntaxViolation(c);
1215 ++c;
1216 continue;
1217 }
1218
1219 switch (state) {
        // First code point: an ASCII letter starts a tentative scheme (lowercased);
        // anything else means there is no scheme.
1220 case State::SchemeStart:
1221 LOG_STATE("SchemeStart");
1222 if (isASCIIAlpha(*c)) {
1223 if (UNLIKELY(isASCIIUpper(*c)))
1224 syntaxViolation(c);
1225 appendToASCIIBuffer(toASCIILower(*c));
1226 advance(c);
1227 if (c.atEnd()) {
                    // Input ended before a ':' was seen: discard output and reparse as scheme-less.
1228 m_asciiBuffer.clear();
1229 state = State::NoScheme;
1230 c = beginAfterControlAndSpace;
1231 break;
1232 }
1233 state = State::Scheme;
1234 } else
1235 state = State::NoScheme;
1236 break;
        // Accumulate scheme characters until ':'; then classify the scheme and choose
        // the next state. Any other character means this was not a scheme after all.
1237 case State::Scheme:
1238 LOG_STATE("Scheme");
1239 if (isValidSchemeCharacter(*c)) {
1240 if (UNLIKELY(isASCIIUpper(*c)))
1241 syntaxViolation(c);
1242 appendToASCIIBuffer(toASCIILower(*c));
1243 } else if (*c == ':') {
1244 unsigned schemeEnd = currentPosition(c);
1245 if (schemeEnd > URL::maxSchemeLength) {
1246 failure();
1247 return;
1248 }
1249 m_url.m_schemeEnd = schemeEnd;
1250 StringView urlScheme = parsedDataView(0, m_url.m_schemeEnd);
1251 appendToASCIIBuffer(':');
1252 switch (scheme(urlScheme)) {
1253 case Scheme::File:
1254 m_urlIsSpecial = true;
1255 m_urlIsFile = true;
1256 state = State::File;
1257 ++c;
1258 break;
1259 case Scheme::WS:
1260 case Scheme::WSS:
                    // ws/wss queries are always encoded as UTF-8.
1261 nonUTF8QueryEncoding = nullptr;
1262 m_urlIsSpecial = true;
1263 if (base.protocolIs(urlScheme))
1264 state = State::SpecialRelativeOrAuthority;
1265 else
1266 state = State::SpecialAuthoritySlashes;
1267 ++c;
1268 break;
1269 case Scheme::HTTP:
1270 case Scheme::HTTPS:
1271 m_url.m_protocolIsInHTTPFamily = true;
1272 FALLTHROUGH;
1273 case Scheme::FTP:
1274 m_urlIsSpecial = true;
1275 if (base.protocolIs(urlScheme))
1276 state = State::SpecialRelativeOrAuthority;
1277 else
1278 state = State::SpecialAuthoritySlashes;
1279 ++c;
1280 break;
1281 case Scheme::NonSpecial:
                    // Non-special schemes also always use UTF-8 for the query.
1282 nonUTF8QueryEncoding = nullptr;
1283 auto maybeSlash = c;
1284 advance(maybeSlash);
1285 if (!maybeSlash.atEnd() && *maybeSlash == '/') {
1286 appendToASCIIBuffer('/');
1287 c = maybeSlash;
1288 state = State::PathOrAuthority;
1289 ASSERT(*c == '/');
1290 ++c;
1291 m_url.m_userStart = currentPosition(c);
1292 } else {
                        // "scheme:" not followed by '/': a cannot-be-a-base URL with an
                        // empty authority; everything after the colon is an opaque path.
1293 ++c;
1294 m_url.m_userStart = currentPosition(c);
1295 m_url.m_userEnd = m_url.m_userStart;
1296 m_url.m_passwordEnd = m_url.m_userStart;
1297 m_url.m_hostEnd = m_url.m_userStart;
1298 m_url.m_portLength = 0;
1299 m_url.m_pathAfterLastSlash = m_url.m_userStart;
1300 m_url.m_cannotBeABaseURL = true;
1301 state = State::CannotBeABaseURLPath;
1302 }
1303 break;
1304 }
1305 break;
1306 } else {
                // Not a scheme: discard output and reparse from the start as scheme-less.
1307 m_asciiBuffer.clear();
1308 state = State::NoScheme;
1309 c = beginAfterControlAndSpace;
1310 break;
1311 }
1312 advance(c);
1313 if (c.atEnd()) {
1314 m_asciiBuffer.clear();
1315 state = State::NoScheme;
1316 c = beginAfterControlAndSpace;
1317 }
1318 break;
        // Scheme-less input needs a valid base. A cannot-be-a-base-URL base only
        // accepts a fragment-only reference.
1319 case State::NoScheme:
1320 LOG_STATE("NoScheme");
1321 if (!base.isValid() || (base.m_cannotBeABaseURL && *c != '#')) {
1322 failure();
1323 return;
1324 }
1325 if (base.m_cannotBeABaseURL && *c == '#') {
1326 copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1327 state = State::Fragment;
1328 appendToASCIIBuffer('#');
1329 ++c;
1330 break;
1331 }
1332 if (!base.protocolIs("file")) {
1333 state = State::Relative;
1334 break;
1335 }
1336 copyURLPartsUntil(base, URLPart::SchemeEnd, c, nonUTF8QueryEncoding);
1337 appendToASCIIBuffer(':');
1338 state = State::File;
1339 break;
        // Special scheme equal to the base's scheme: "//" starts an authority,
        // a single '/' goes to RelativeSlash, anything else is relative to base.
1340 case State::SpecialRelativeOrAuthority:
1341 LOG_STATE("SpecialRelativeOrAuthority");
1342 if (*c == '/') {
1343 appendToASCIIBuffer('/');
1344 advance(c);
1345 if (c.atEnd()) {
1346 failure();
1347 return;
1348 }
1349 if (*c == '/') {
1350 appendToASCIIBuffer('/');
1351 state = State::SpecialAuthorityIgnoreSlashes;
1352 ++c;
1353 } else
1354 state = State::RelativeSlash;
1355 } else
1356 state = State::Relative;
1357 break;
        // Non-special "scheme:/" seen: a second '/' starts an authority; otherwise the
        // slash already emitted is the start of the path and the authority is empty.
1358 case State::PathOrAuthority:
1359 LOG_STATE("PathOrAuthority");
1360 if (*c == '/') {
1361 appendToASCIIBuffer('/');
1362 state = State::AuthorityOrHost;
1363 advance(c);
1364 m_url.m_userStart = currentPosition(c);
1365 authorityOrHostBegin = c;
1366 } else {
1367 ASSERT(parsedDataView(currentPosition(c) - 1) == '/');
1368 m_url.m_userStart = currentPosition(c) - 1;
1369 m_url.m_userEnd = m_url.m_userStart;
1370 m_url.m_passwordEnd = m_url.m_userStart;
1371 m_url.m_hostEnd = m_url.m_userStart;
1372 m_url.m_portLength = 0;
1373 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1374 state = State::Path;
1375 }
1376 break;
        // Relative reference: how much of base is inherited depends on the first character.
1377 case State::Relative:
1378 LOG_STATE("Relative");
1379 switch (*c) {
1380 case '/':
1381 case '\\':
1382 state = State::RelativeSlash;
1383 ++c;
1384 break;
1385 case '?':
1386 copyURLPartsUntil(base, URLPart::PathEnd, c, nonUTF8QueryEncoding);
1387 appendToASCIIBuffer('?');
1388 ++c;
1389 if (nonUTF8QueryEncoding) {
1390 queryBegin = c;
1391 state = State::NonUTF8Query;
1392 } else
1393 state = State::UTF8Query;
1394 break;
1395 case '#':
1396 copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1397 appendToASCIIBuffer('#');
1398 state = State::Fragment;
1399 ++c;
1400 break;
1401 default:
1402 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, nonUTF8QueryEncoding);
                // Make sure the inherited part ends with '/' before the relative path is appended.
1403 if (currentPosition(c) && parsedDataView(currentPosition(c) - 1) != '/') {
1404 appendToASCIIBuffer('/');
1405 m_url.m_pathAfterLastSlash = currentPosition(c);
1406 }
1407 state = State::Path;
1408 break;
1409 }
1410 break;
        // One leading slash seen on a relative reference: a second slash means a new
        // authority with the base's scheme; otherwise an absolute path on the base's host.
1411 case State::RelativeSlash:
1412 LOG_STATE("RelativeSlash");
1413 if (*c == '/' || *c == '\\') {
1414 ++c;
1415 copyURLPartsUntil(base, URLPart::SchemeEnd, c, nonUTF8QueryEncoding);
1416 appendToASCIIBuffer("://", 3);
1417 if (m_urlIsSpecial)
1418 state = State::SpecialAuthorityIgnoreSlashes;
1419 else {
1420 m_url.m_userStart = currentPosition(c);
1421 state = State::AuthorityOrHost;
1422 authorityOrHostBegin = c;
1423 }
1424 } else {
1425 copyURLPartsUntil(base, URLPart::PortEnd, c, nonUTF8QueryEncoding);
1426 appendToASCIIBuffer('/');
1427 m_url.m_pathAfterLastSlash = base.m_hostEnd + base.m_portLength + 1;
1428 state = State::Path;
1429 }
1430 break;
        // After "special-scheme:": always emits "//". Backslashes and missing slashes
        // are recorded as syntax violations.
1431 case State::SpecialAuthoritySlashes:
1432 LOG_STATE("SpecialAuthoritySlashes");
1433 if (LIKELY(*c == '/' || *c == '\\')) {
1434 if (UNLIKELY(*c == '\\'))
1435 syntaxViolation(c);
1436 appendToASCIIBuffer('/');
1437 advance(c);
1438 if (LIKELY(!c.atEnd() && (*c == '/' || *c == '\\'))) {
1439 if (UNLIKELY(*c == '\\'))
1440 syntaxViolation(c);
1441 ++c;
1442 appendToASCIIBuffer('/');
1443 } else {
1444 syntaxViolation(c);
1445 appendToASCIIBuffer('/');
1446 }
1447 } else {
1448 syntaxViolation(c);
1449 appendToASCIIBuffer("//", 2);
1450 }
1451 state = State::SpecialAuthorityIgnoreSlashes;
1452 break;
        // Any further slashes or backslashes before the authority are dropped as syntax violations.
1453 case State::SpecialAuthorityIgnoreSlashes:
1454 LOG_STATE("SpecialAuthorityIgnoreSlashes");
1455 if (*c == '/' || *c == '\\') {
1456 syntaxViolation(c);
1457 ++c;
1458 } else {
1459 m_url.m_userStart = currentPosition(c);
1460 state = State::AuthorityOrHost;
1461 authorityOrHostBegin = c;
1462 }
1463 break;
        // Scans forward without emitting output until either '@' (credentials precede
        // the host) or a delimiter ending the authority is found.
1464 case State::AuthorityOrHost:
1465 do {
1466 LOG_STATE("AuthorityOrHost");
1467 if (*c == '@') {
                    // Credentials end at the LAST '@' before the next delimiter.
1468 auto lastAt = c;
1469 auto findLastAt = c;
1470 while (!findLastAt.atEnd()) {
1471 URL_PARSER_LOG("Finding last @: %c", *findLastAt);
1472 if (*findLastAt == '@')
1473 lastAt = findLastAt;
1474 bool isSlash = *findLastAt == '/' || (m_urlIsSpecial && *findLastAt == '\\');
1475 if (isSlash || *findLastAt == '?' || *findLastAt == '#')
1476 break;
1477 ++findLastAt;
1478 }
1479 parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1480 c = lastAt;
1481 advance(c);
1482 authorityOrHostBegin = c;
1483 state = State::Host;
1484 m_hostHasPercentOrNonASCII = false;
1485 break;
1486 }
1487 bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\');
1488 if (isSlash || *c == '?' || *c == '#') {
1489 auto iterator = CodePointIterator<CharacterType>(authorityOrHostBegin, c);
1490 if (iterator.atEnd()) {
                        // Empty host: an error for special schemes, an empty authority otherwise.
1491 if (m_urlIsSpecial)
1492 return failure();
1493 m_url.m_userEnd = currentPosition(c);
1494 m_url.m_passwordEnd = m_url.m_userEnd;
1495 m_url.m_hostEnd = m_url.m_userEnd;
1496 m_url.m_portLength = 0;
1497 m_url.m_pathAfterLastSlash = m_url.m_userEnd;
1498 } else {
1499 m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1500 m_url.m_passwordEnd = m_url.m_userEnd;
1501 if (!parseHostAndPort(iterator)) {
1502 failure();
1503 return;
1504 }
1505 if (UNLIKELY(!isSlash)) {
                            // Host followed directly by '?' or '#': special URLs get an explicit "/" path.
1506 if (m_urlIsSpecial) {
1507 syntaxViolation(c);
1508 appendToASCIIBuffer('/');
1509 }
1510 m_url.m_pathAfterLastSlash = currentPosition(c);
1511 }
1512 }
1513 state = State::Path;
1514 break;
1515 }
1516 if (isPercentOrNonASCII(*c))
1517 m_hostHasPercentOrNonASCII = true;
1518 ++c;
1519 } while (!c.atEnd());
1520 break;
        // Host following credentials: scan to the delimiter, then parse host and port.
        // NOTE(review): unlike AuthorityOrHost above, '\\' is not treated as a delimiter
        // here for special URLs; confirm whether that difference is intended.
1521 case State::Host:
1522 do {
1523 LOG_STATE("Host");
1524 if (*c == '/' || *c == '?' || *c == '#') {
1525 if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1526 failure();
1527 return;
1528 }
1529 if (*c == '?' || *c == '#') {
1530 syntaxViolation(c);
1531 appendToASCIIBuffer('/');
1532 m_url.m_pathAfterLastSlash = currentPosition(c);
1533 }
1534 state = State::Path;
1535 break;
1536 }
1537 if (isPercentOrNonASCII(*c))
1538 m_hostHasPercentOrNonASCII = true;
1539 ++c;
1540 } while (!c.atEnd());
1541 break;
        // After "file:" (explicit, or inherited from a file base in NoScheme).
1542 case State::File:
1543 LOG_STATE("File");
1544 switch (*c) {
1545 case '\\':
1546 syntaxViolation(c);
1547 FALLTHROUGH;
1548 case '/':
1549 appendToASCIIBuffer('/');
1550 state = State::FileSlash;
1551 ++c;
1552 break;
1553 case '?':
1554 syntaxViolation(c);
1555 if (base.isValid() && base.protocolIs("file")) {
1556 copyURLPartsUntil(base, URLPart::PathEnd, c, nonUTF8QueryEncoding);
1557 appendToASCIIBuffer('?');
1558 ++c;
1559 } else {
                    // No file base: synthesize "file:///" with an empty host and a "/" path.
1560 appendToASCIIBuffer("///?", 4);
1561 ++c;
1562 m_url.m_userStart = currentPosition(c) - 2;
1563 m_url.m_userEnd = m_url.m_userStart;
1564 m_url.m_passwordEnd = m_url.m_userStart;
1565 m_url.m_hostEnd = m_url.m_userStart;
1566 m_url.m_portLength = 0;
1567 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1568 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1569 }
1570 if (nonUTF8QueryEncoding) {
1571 queryBegin = c;
1572 state = State::NonUTF8Query;
1573 } else
1574 state = State::UTF8Query;
1575 break;
1576 case '#':
1577 syntaxViolation(c);
1578 if (base.isValid() && base.protocolIs("file")) {
1579 copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1580 appendToASCIIBuffer('#');
1581 } else {
1582 appendToASCIIBuffer("///#", 4);
1583 m_url.m_userStart = currentPosition(c) - 2;
1584 m_url.m_userEnd = m_url.m_userStart;
1585 m_url.m_passwordEnd = m_url.m_userStart;
1586 m_url.m_hostEnd = m_url.m_userStart;
1587 m_url.m_portLength = 0;
1588 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1589 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1590 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1591 }
1592 state = State::Fragment;
1593 ++c;
1594 break;
1595 default:
1596 syntaxViolation(c);
1597 if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL(c))
1598 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, nonUTF8QueryEncoding);
1599 else {
1600 appendToASCIIBuffer("///", 3);
1601 m_url.m_userStart = currentPosition(c) - 1;
1602 m_url.m_userEnd = m_url.m_userStart;
1603 m_url.m_passwordEnd = m_url.m_userStart;
1604 m_url.m_hostEnd = m_url.m_userStart;
1605 m_url.m_portLength = 0;
1606 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1607 if (isWindowsDriveLetter(c))
1608 appendWindowsDriveLetter(c);
1609 }
1610 state = State::Path;
1611 break;
1612 }
1613 break;
        // "file:/" seen: a second slash introduces a (possibly empty) host; otherwise
        // the path starts here with an empty host.
1614 case State::FileSlash:
1615 LOG_STATE("FileSlash");
1616 if (LIKELY(*c == '/' || *c == '\\')) {
1617 if (UNLIKELY(*c == '\\'))
1618 syntaxViolation(c);
1619 appendToASCIIBuffer('/');
1620 advance(c);
1621 m_url.m_userStart = currentPosition(c);
1622 m_url.m_userEnd = m_url.m_userStart;
1623 m_url.m_passwordEnd = m_url.m_userStart;
1624 m_url.m_hostEnd = m_url.m_userStart;
1625 m_url.m_portLength = 0;
1626 authorityOrHostBegin = c;
1627 state = State::FileHost;
1628 break;
1629 }
1630 syntaxViolation(c);
1631 appendToASCIIBuffer("//", 2);
1632 m_url.m_userStart = currentPosition(c) - 1;
1633 m_url.m_userEnd = m_url.m_userStart;
1634 m_url.m_passwordEnd = m_url.m_userStart;
1635 m_url.m_hostEnd = m_url.m_userStart;
1636 m_url.m_portLength = 0;
1637 if (isWindowsDriveLetter(c)) {
1638 appendWindowsDriveLetter(c);
1639 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1640 } else if (copyBaseWindowsDriveLetter(base)) {
1641 appendToASCIIBuffer('/');
1642 m_url.m_pathAfterLastSlash = m_url.m_userStart + 4;
1643 } else
1644 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1645 state = State::Path;
1646 break;
        // Host of a file URL. A two-code-point "host" that is a Windows drive letter is
        // treated as the start of the path instead, and a "localhost" host is removed.
1647 case State::FileHost:
1648 do {
1649 LOG_STATE("FileHost");
1650 if (isSlashQuestionOrHash(*c)) {
1651 bool windowsQuirk = takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1652 && isWindowsDriveLetter(authorityOrHostBegin);
1653 if (windowsQuirk) {
1654 syntaxViolation(authorityOrHostBegin);
1655 appendToASCIIBuffer('/');
1656 appendWindowsDriveLetter(authorityOrHostBegin);
1657 }
1658 if (windowsQuirk || authorityOrHostBegin == c) {
                        // No real host (drive-letter quirk or empty host).
1659 ASSERT(windowsQuirk || parsedDataView(currentPosition(c) - 1) == '/');
1660 if (UNLIKELY(*c == '?')) {
1661 syntaxViolation(c);
1662 appendToASCIIBuffer("/?", 2);
1663 ++c;
1664 if (nonUTF8QueryEncoding) {
1665 queryBegin = c;
1666 state = State::NonUTF8Query;
1667 } else
1668 state = State::UTF8Query;
1669 m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1670 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1671 break;
1672 }
1673 if (UNLIKELY(*c == '#')) {
1674 syntaxViolation(c);
1675 appendToASCIIBuffer("/#", 2);
1676 ++c;
1677 m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1678 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1679 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1680 state = State::Fragment;
1681 break;
1682 }
1683 state = State::Path;
1684 break;
1685 }
1686 if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1687 failure();
1688 return;
1689 }
                    // A "localhost" host is removed again: the output is shrunk back to
                    // where the host began and the host is recorded as empty.
1690 if (UNLIKELY(isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd)))) {
1691 syntaxViolation(c);
1692 m_asciiBuffer.shrink(m_url.m_passwordEnd);
1693 m_url.m_hostEnd = currentPosition(c);
1694 m_url.m_portLength = 0;
1695 }
1696
1697 state = State::PathStart;
1698 break;
1699 }
1700 if (isPercentOrNonASCII(*c))
1701 m_hostHasPercentOrNonASCII = true;
1702 ++c;
1703 } while (!c.atEnd());
1704 break;
        // Ensures the path begins with '/', inserting one when the input does not supply it.
1705 case State::PathStart:
1706 LOG_STATE("PathStart");
1707 if (*c != '/' && *c != '\\') {
1708 syntaxViolation(c);
1709 appendToASCIIBuffer('/');
1710 }
1711 m_url.m_pathAfterLastSlash = currentPosition(c);
1712 state = State::Path;
1713 break;
        // Path: normalizes separators, resolves "." and ".." segments at the start of a
        // segment, and percent-encodes everything else with the default encode set.
1714 case State::Path:
1715 LOG_STATE("Path");
1716 if (*c == '/' || (m_urlIsSpecial && *c == '\\')) {
1717 if (UNLIKELY(m_urlIsSpecial && *c == '\\'))
1718 syntaxViolation(c);
1719 appendToASCIIBuffer('/');
1720 ++c;
1721 m_url.m_pathAfterLastSlash = currentPosition(c);
1722 break;
1723 }
            // Dot segments are only recognized immediately after a '/'.
1724 if (UNLIKELY(currentPosition(c) && parsedDataView(currentPosition(c) - 1) == '/')) {
1725 if (UNLIKELY(isDoubleDotPathSegment(c))) {
1726 syntaxViolation(c);
1727 consumeDoubleDotPathSegment(c);
1728 popPath();
1729 break;
1730 }
1731 if (UNLIKELY(isSingleDotPathSegment(c))) {
1732 syntaxViolation(c);
1733 consumeSingleDotPathSegment(c);
1734 break;
1735 }
1736 }
1737 if (*c == '?') {
1738 m_url.m_pathEnd = currentPosition(c);
1739 appendToASCIIBuffer('?');
1740 ++c;
1741 if (nonUTF8QueryEncoding) {
1742 queryBegin = c;
1743 state = State::NonUTF8Query;
1744 } else
1745 state = State::UTF8Query;
1746 break;
1747 }
1748 if (*c == '#') {
                // '#' is not consumed here; the Fragment state handles it on the next iteration.
1749 m_url.m_pathEnd = currentPosition(c);
1750 m_url.m_queryEnd = m_url.m_pathEnd;
1751 state = State::Fragment;
1752 break;
1753 }
1754 utf8PercentEncode<isInDefaultEncodeSet>(c);
1755 ++c;
1756 break;
        // Opaque path of a cannot-be-a-base URL: no dot-segment handling, simple encode set.
1757 case State::CannotBeABaseURLPath:
1758 LOG_STATE("CannotBeABaseURLPath");
1759 if (*c == '?') {
1760 m_url.m_pathEnd = currentPosition(c);
1761 appendToASCIIBuffer('?');
1762 ++c;
1763 if (nonUTF8QueryEncoding) {
1764 queryBegin = c;
1765 state = State::NonUTF8Query;
1766 } else
1767 state = State::UTF8Query;
1768 } else if (*c == '#') {
1769 m_url.m_pathEnd = currentPosition(c);
1770 m_url.m_queryEnd = m_url.m_pathEnd;
1771 state = State::Fragment;
1772 } else if (*c == '/') {
1773 appendToASCIIBuffer('/');
1774 ++c;
1775 m_url.m_pathAfterLastSlash = currentPosition(c);
1776 } else {
1777 utf8PercentEncode<isInSimpleEncodeSet>(c);
1778 ++c;
1779 }
1780 break;
        // Query encoded as UTF-8, one code point per iteration, until '#'.
1781 case State::UTF8Query:
1782 LOG_STATE("UTF8Query");
1783 ASSERT(queryBegin == CodePointIterator<CharacterType>());
1784 if (*c == '#') {
1785 m_url.m_queryEnd = currentPosition(c);
1786 state = State::Fragment;
1787 break;
1788 }
1789 ASSERT(!nonUTF8QueryEncoding);
1790 utf8QueryEncode(c);
1791 ++c;
1792 break;
        // Query in a non-UTF-8 encoding: code points are buffered and encoded in one go
        // when '#' or the end of input is reached.
1793 case State::NonUTF8Query:
1794 do {
1795 LOG_STATE("NonUTF8Query");
1796 ASSERT(queryBegin != CodePointIterator<CharacterType>());
1797 if (*c == '#') {
1798 encodeNonUTF8Query(queryBuffer, *nonUTF8QueryEncoding, CodePointIterator<CharacterType>(queryBegin, c));
1799 m_url.m_queryEnd = currentPosition(c);
1800 state = State::Fragment;
1801 break;
1802 }
1803 appendCodePoint(queryBuffer, *c);
1804 advance(c, queryBegin);
1805 } while (!c.atEnd());
1806 break;
        // Fragment: everything to the end of input, percent-encoded with the simple encode set.
1807 case State::Fragment:
1808 URL_PARSER_LOG("State Fragment");
1809 utf8PercentEncode<isInSimpleEncodeSet>(c);
1810 ++c;
1811 break;
1812 }
1813 }
1814
    // Input exhausted: finish whichever component was in progress and fill in the
    // remaining offsets, or fail if the input ended somewhere a URL cannot end.
1815 switch (state) {
1816 case State::SchemeStart:
1817 LOG_FINAL_STATE("SchemeStart");
        // Empty input (after trimming) resolves to the base without its fragment.
1818 if (!currentPosition(c) && base.isValid() && !base.m_cannotBeABaseURL) {
1819 m_url = base;
1820 m_url.removeFragmentIdentifier();
1821 return;
1822 }
1823 failure();
1824 return;
1825 case State::Scheme:
1826 LOG_FINAL_STATE("Scheme");
1827 failure();
1828 return;
1829 case State::NoScheme:
1830 LOG_FINAL_STATE("NoScheme");
1831 RELEASE_ASSERT_NOT_REACHED();
1832 case State::SpecialRelativeOrAuthority:
1833 LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1834 copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1835 break;
1836 case State::PathOrAuthority:
1837 LOG_FINAL_STATE("PathOrAuthority");
        // Input was exactly "scheme:/": empty authority, path is the single slash.
1838 ASSERT(m_url.m_userStart);
1839 ASSERT(m_url.m_userStart == currentPosition(c));
1840 ASSERT(parsedDataView(currentPosition(c) - 1) == '/');
1841 m_url.m_userStart--;
1842 m_url.m_userEnd = m_url.m_userStart;
1843 m_url.m_passwordEnd = m_url.m_userStart;
1844 m_url.m_hostEnd = m_url.m_userStart;
1845 m_url.m_portLength = 0;
1846 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1847 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1848 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1849 break;
1850 case State::Relative:
1851 LOG_FINAL_STATE("Relative");
1852 RELEASE_ASSERT_NOT_REACHED();
1853 case State::RelativeSlash:
1854 LOG_FINAL_STATE("RelativeSlash");
1855 copyURLPartsUntil(base, URLPart::PortEnd, c, nonUTF8QueryEncoding);
1856 appendToASCIIBuffer('/');
1857 m_url.m_pathAfterLastSlash = m_url.m_hostEnd + m_url.m_portLength + 1;
1858 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1859 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1860 break;
1861 case State::SpecialAuthoritySlashes:
1862 LOG_FINAL_STATE("SpecialAuthoritySlashes");
        // Input was just "special-scheme:": every remaining component is empty.
1863 m_url.m_userStart = currentPosition(c);
1864 m_url.m_userEnd = m_url.m_userStart;
1865 m_url.m_passwordEnd = m_url.m_userStart;
1866 m_url.m_hostEnd = m_url.m_userStart;
1867 m_url.m_portLength = 0;
1868 m_url.m_pathAfterLastSlash = m_url.m_userStart;
1869 m_url.m_pathEnd = m_url.m_userStart;
1870 m_url.m_queryEnd = m_url.m_userStart;
1871 break;
1872 case State::SpecialAuthorityIgnoreSlashes:
1873 LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1874 failure();
1875 return;
1876 case State::AuthorityOrHost:
1877 LOG_FINAL_STATE("AuthorityOrHost");
1878 m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1879 m_url.m_passwordEnd = m_url.m_userEnd;
1880 if (authorityOrHostBegin.atEnd()) {
1881 m_url.m_userEnd = m_url.m_userStart;
1882 m_url.m_passwordEnd = m_url.m_userStart;
1883 m_url.m_hostEnd = m_url.m_userStart;
1884 m_url.m_portLength = 0;
1885 m_url.m_pathEnd = m_url.m_userStart;
1886 } else if (!parseHostAndPort(authorityOrHostBegin)) {
1887 failure();
1888 return;
1889 } else {
            // Special URLs always get a "/" path after the host; adding it is a syntax violation.
1890 if (m_urlIsSpecial) {
1891 syntaxViolation(c);
1892 appendToASCIIBuffer('/');
1893 m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength + 1;
1894 } else
1895 m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength;
1896 }
1897 m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1898 m_url.m_queryEnd = m_url.m_pathEnd;
1899 break;
1900 case State::Host:
1901 LOG_FINAL_STATE("Host");
1902 if (!parseHostAndPort(authorityOrHostBegin)) {
1903 failure();
1904 return;
1905 }
1906 if (m_urlIsSpecial) {
1907 syntaxViolation(c);
1908 appendToASCIIBuffer('/');
1909 m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength + 1;
1910 } else
1911 m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength;
1912 m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1913 m_url.m_queryEnd = m_url.m_pathEnd;
1914 break;
1915 case State::File:
1916 LOG_FINAL_STATE("File");
        // Bare "file:": inherit a file base through its query, or synthesize "file:///".
1917 if (base.isValid() && base.protocolIs("file")) {
1918 copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1919 break;
1920 }
1921 syntaxViolation(c);
1922 appendToASCIIBuffer("///", 3);
1923 m_url.m_userStart = currentPosition(c) - 1;
1924 m_url.m_userEnd = m_url.m_userStart;
1925 m_url.m_passwordEnd = m_url.m_userStart;
1926 m_url.m_hostEnd = m_url.m_userStart;
1927 m_url.m_portLength = 0;
1928 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1929 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1930 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1931 break;
1932 case State::FileSlash:
1933 LOG_FINAL_STATE("FileSlash");
1934 syntaxViolation(c);
        // m_userStart is computed before "//" is appended, hence "+ 1" rather than "- 1".
1935 m_url.m_userStart = currentPosition(c) + 1;
1936 appendToASCIIBuffer("//", 2);
1937 m_url.m_userEnd = m_url.m_userStart;
1938 m_url.m_passwordEnd = m_url.m_userStart;
1939 m_url.m_hostEnd = m_url.m_userStart;
1940 m_url.m_portLength = 0;
1941 if (copyBaseWindowsDriveLetter(base)) {
1942 appendToASCIIBuffer('/');
1943 m_url.m_pathAfterLastSlash = m_url.m_userStart + 4;
1944 } else
1945 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1946 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1947 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1948 break;
1949 case State::FileHost:
1950 LOG_FINAL_STATE("FileHost");
        // Same three cases as the in-loop FileHost state: drive-letter quirk, empty host, real host.
1951 if (takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1952 && isWindowsDriveLetter(authorityOrHostBegin)) {
1953 syntaxViolation(authorityOrHostBegin);
1954 appendToASCIIBuffer('/');
1955 appendWindowsDriveLetter(authorityOrHostBegin);
1956 m_url.m_pathAfterLastSlash = currentPosition(c);
1957 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1958 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1959 break;
1960 }
1961
1962 if (authorityOrHostBegin == c) {
1963 syntaxViolation(c);
1964 appendToASCIIBuffer('/');
1965 m_url.m_userStart = currentPosition(c) - 1;
1966 m_url.m_userEnd = m_url.m_userStart;
1967 m_url.m_passwordEnd = m_url.m_userStart;
1968 m_url.m_hostEnd = m_url.m_userStart;
1969 m_url.m_portLength = 0;
1970 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1971 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1972 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1973 break;
1974 }
1975
1976 if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1977 failure();
1978 return;
1979 }
1980
1981 syntaxViolation(c);
1982 if (isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd))) {
1983 m_asciiBuffer.shrink(m_url.m_passwordEnd);
1984 m_url.m_hostEnd = currentPosition(c);
1985 m_url.m_portLength = 0;
1986 }
1987 appendToASCIIBuffer('/');
1988 m_url.m_pathAfterLastSlash = m_url.m_hostEnd + m_url.m_portLength + 1;
1989 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1990 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1991 break;
1992 case State::PathStart:
1993 LOG_FINAL_STATE("PathStart");
1994 RELEASE_ASSERT_NOT_REACHED();
1995 case State::Path:
1996 LOG_FINAL_STATE("Path");
1997 m_url.m_pathEnd = currentPosition(c);
1998 m_url.m_queryEnd = m_url.m_pathEnd;
1999 break;
2000 case State::CannotBeABaseURLPath:
2001 LOG_FINAL_STATE("CannotBeABaseURLPath");
2002 m_url.m_pathEnd = currentPosition(c);
2003 m_url.m_queryEnd = m_url.m_pathEnd;
2004 break;
2005 case State::UTF8Query:
2006 LOG_FINAL_STATE("UTF8Query");
2007 ASSERT(queryBegin == CodePointIterator<CharacterType>());
2008 m_url.m_queryEnd = currentPosition(c);
2009 break;
2010 case State::NonUTF8Query:
2011 LOG_FINAL_STATE("NonUTF8Query");
2012 ASSERT(queryBegin != CodePointIterator<CharacterType>());
2013 encodeNonUTF8Query(queryBuffer, *nonUTF8QueryEncoding, CodePointIterator<CharacterType>(queryBegin, c));
2014 m_url.m_queryEnd = currentPosition(c);
2015 break;
2016 case State::Fragment:
2017 LOG_FINAL_STATE("Fragment");
2018 break;
2019 }
2020
    // Publish the result: reuse the input string when it was already canonical,
    // otherwise adopt the buffer built up since the first syntax violation.
2021 if (LIKELY(!m_didSeeSyntaxViolation)) {
2022 m_url.m_string = m_inputString;
2023 ASSERT(m_asciiBuffer.isEmpty());
2024 } else
2025 m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
2026 m_url.m_isValid = true;
2027 URL_PARSER_LOG("Parsed URL <%s>", m_url.m_string.utf8().data());
2028}
2029
2030template<typename CharacterType>
2031void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
2032{
2033 if (UNLIKELY(iterator.atEnd())) {
2034 syntaxViolation(iterator);
2035 m_url.m_userEnd = currentPosition(iterator);
2036 m_url.m_passwordEnd = m_url.m_userEnd;
2037 return;
2038 }
2039 for (; !iterator.atEnd(); advance(iterator)) {
2040 if (*iterator == ':') {
2041 m_url.m_userEnd = currentPosition(iterator);
2042 auto iteratorAtColon = iterator;
2043 ++iterator;
2044 bool tabOrNewlineAfterColon = false;
2045 while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
2046 tabOrNewlineAfterColon = true;
2047 ++iterator;
2048 }
2049 if (UNLIKELY(iterator.atEnd())) {
2050 syntaxViolation(iteratorAtColon);
2051 m_url.m_passwordEnd = m_url.m_userEnd;
2052 if (m_url.m_userEnd > m_url.m_userStart)
2053 appendToASCIIBuffer('@');
2054 return;
2055 }
2056 if (tabOrNewlineAfterColon)
2057 syntaxViolation(iteratorAtColon);
2058 appendToASCIIBuffer(':');
2059 break;
2060 }
2061 utf8PercentEncode<WTF::isInUserInfoEncodeSet>(iterator);
2062 }
2063 for (; !iterator.atEnd(); advance(iterator))
2064 utf8PercentEncode<WTF::isInUserInfoEncodeSet>(iterator);
2065 m_url.m_passwordEnd = currentPosition(iterator);
2066 if (!m_url.m_userEnd)
2067 m_url.m_userEnd = m_url.m_passwordEnd;
2068 appendToASCIIBuffer('@');
2069}
2070
2071template<typename UnsignedIntegerType>
2072void URLParser::appendNumberToASCIIBuffer(UnsignedIntegerType number)
2073{
2074 LChar buf[sizeof(UnsignedIntegerType) * 3 + 1];
2075 LChar* end = std::end(buf);
2076 LChar* p = end;
2077 do {
2078 *--p = (number % 10) + '0';
2079 number /= 10;
2080 } while (number);
2081 appendToASCIIBuffer(p, end - p);
2082}
2083
2084void URLParser::serializeIPv4(IPv4Address address)
2085{
2086 appendNumberToASCIIBuffer<uint8_t>(address >> 24);
2087 appendToASCIIBuffer('.');
2088 appendNumberToASCIIBuffer<uint8_t>(address >> 16);
2089 appendToASCIIBuffer('.');
2090 appendNumberToASCIIBuffer<uint8_t>(address >> 8);
2091 appendToASCIIBuffer('.');
2092 appendNumberToASCIIBuffer<uint8_t>(address);
2093}
2094
// Returns how many consecutive zero pieces address contains starting at index begin.
// Returns 0 when address[begin] is non-zero or begin is at or past the end.
static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
{
    size_t cursor = begin;
    while (cursor < 8 && !address[cursor])
        ++cursor;
    return cursor - begin;
}
2104
2105static Optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address)
2106{
2107 Optional<size_t> longest;
2108 size_t longestLength = 0;
2109 for (size_t i = 0; i < 8; i++) {
2110 size_t length = zeroSequenceLength(address, i);
2111 if (length) {
2112 if (length > 1 && (!longest || longestLength < length)) {
2113 longest = i;
2114 longestLength = length;
2115 }
2116 i += length;
2117 }
2118 }
2119 return longest;
2120}
2121
2122void URLParser::serializeIPv6Piece(uint16_t piece)
2123{
2124 bool printed = false;
2125 if (auto nibble0 = piece >> 12) {
2126 appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
2127 printed = true;
2128 }
2129 auto nibble1 = piece >> 8 & 0xF;
2130 if (printed || nibble1) {
2131 appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
2132 printed = true;
2133 }
2134 auto nibble2 = piece >> 4 & 0xF;
2135 if (printed || nibble2)
2136 appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
2137 appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF));
2138}
2139
2140void URLParser::serializeIPv6(URLParser::IPv6Address address)
2141{
2142 appendToASCIIBuffer('[');
2143 auto compressPointer = findLongestZeroSequence(address);
2144 for (size_t piece = 0; piece < 8; piece++) {
2145 if (compressPointer && compressPointer.value() == piece) {
2146 ASSERT(!address[piece]);
2147 if (piece)
2148 appendToASCIIBuffer(':');
2149 else
2150 appendToASCIIBuffer("::", 2);
2151 while (piece < 8 && !address[piece])
2152 piece++;
2153 if (piece == 8)
2154 break;
2155 }
2156 serializeIPv6Piece(address[piece]);
2157 if (piece < 7)
2158 appendToASCIIBuffer(':');
2159 }
2160 appendToASCIIBuffer(']');
2161}
2162
// Reasons parseIPv4Piece() can fail to produce a value for one dot-separated piece.
enum class URLParser::IPv4PieceParsingError {
    Failure, // The piece starts with '.' or contains a character that is not a digit of its base.
    Overflow, // The digits were acceptable but the accumulated value overflowed uint32_t.
};
2167
// Parses one dot-separated piece of a candidate IPv4 host, accepting decimal, octal
// (leading "0") and hexadecimal (leading "0x" / "0X") notation.
//
// iterator is advanced over the characters consumed. On success it is left at the '.'
// that ended the piece (the '.' is not consumed) or at the end of input; on failure it
// is left at the offending character.
// didSeeSyntaxViolation is set to true (never cleared) when a tab or newline is skipped,
// or when a leading '0' is followed by more characters in the same piece.
//
// Returns the piece's value, IPv4PieceParsingError::Failure when the piece begins with
// '.' or contains a character that is not a digit of the detected base, or
// IPv4PieceParsingError::Overflow when the value no longer fits in 32 bits.
template<typename CharacterType>
Expected<uint32_t, URLParser::IPv4PieceParsingError> URLParser::parseIPv4Piece(CodePointIterator<CharacterType>& iterator, bool& didSeeSyntaxViolation)
{
    enum class State : uint8_t {
        UnknownBase, // Nothing consumed yet.
        Decimal,
        OctalOrHex, // A leading '0' was consumed; the next character decides the base.
        Octal,
        Hex,
    };
    State state = State::UnknownBase;
    Checked<uint32_t, RecordOverflow> value = 0;
    // An empty piece (input starting with '.') is never valid.
    if (!iterator.atEnd() && *iterator == '.')
        return makeUnexpected(IPv4PieceParsingError::Failure);
    while (!iterator.atEnd()) {
        // Tabs and newlines are skipped wherever they appear inside the piece.
        if (isTabOrNewline(*iterator)) {
            didSeeSyntaxViolation = true;
            ++iterator;
            continue;
        }
        // End of this piece; the caller consumes the '.'.
        if (*iterator == '.') {
            ASSERT(!value.hasOverflowed());
            return value.unsafeGet();
        }
        switch (state) {
        case State::UnknownBase:
            if (UNLIKELY(*iterator == '0')) {
                ++iterator;
                state = State::OctalOrHex;
                break;
            }
            // Not a '0': switch to decimal without consuming, so the same character is
            // re-examined by the Decimal case on the next loop iteration.
            state = State::Decimal;
            break;
        case State::OctalOrHex:
            // Something follows the leading '0', so the piece is not in canonical form.
            didSeeSyntaxViolation = true;
            if (*iterator == 'x' || *iterator == 'X') {
                ++iterator;
                state = State::Hex;
                break;
            }
            // As above, the current character is left for the Octal case to validate.
            state = State::Octal;
            break;
        case State::Decimal:
            if (!isASCIIDigit(*iterator))
                return makeUnexpected(IPv4PieceParsingError::Failure);
            value *= 10;
            value += *iterator - '0';
            if (UNLIKELY(value.hasOverflowed()))
                return makeUnexpected(IPv4PieceParsingError::Overflow);
            ++iterator;
            break;
        case State::Octal:
            ASSERT(didSeeSyntaxViolation);
            if (*iterator < '0' || *iterator > '7')
                return makeUnexpected(IPv4PieceParsingError::Failure);
            value *= 8;
            value += *iterator - '0';
            if (UNLIKELY(value.hasOverflowed()))
                return makeUnexpected(IPv4PieceParsingError::Overflow);
            ++iterator;
            break;
        case State::Hex:
            ASSERT(didSeeSyntaxViolation);
            if (!isASCIIHexDigit(*iterator))
                return makeUnexpected(IPv4PieceParsingError::Failure);
            value *= 16;
            value += toASCIIHexValue(*iterator);
            if (UNLIKELY(value.hasOverflowed()))
                return makeUnexpected(IPv4PieceParsingError::Overflow);
            ++iterator;
            break;
        }
    }
    // Input ended inside the piece. A lone "0" or a bare "0x" arrives here with value 0.
    ASSERT(!value.hasOverflowed());
    return value.unsafeGet();
}
2244
2245ALWAYS_INLINE static uint64_t pow256(size_t exponent)
2246{
2247 RELEASE_ASSERT(exponent <= 4);
2248 uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 };
2249 return values[exponent];
2250}
2251
// Reasons parseIPv4Host() can decline to produce an address.
enum class URLParser::IPv4ParsingError {
    Failure, // Every piece is numeric but one overflowed or is out of range; parseHostAndPort() rejects the host.
    NotIPv4, // The input is not shaped like an IPv4 address; parseHostAndPort() goes on to treat it as a domain.
};
2256
2257template<typename CharacterTypeForSyntaxViolation, typename CharacterType>
2258Expected<URLParser::IPv4Address, URLParser::IPv4ParsingError> URLParser::parseIPv4Host(const CodePointIterator<CharacterTypeForSyntaxViolation>& iteratorForSyntaxViolationPosition, CodePointIterator<CharacterType> iterator)
2259{
2260 Vector<Expected<uint32_t, URLParser::IPv4PieceParsingError>, 4> items;
2261 bool didSeeSyntaxViolation = false;
2262 if (!iterator.atEnd() && *iterator == '.')
2263 return makeUnexpected(IPv4ParsingError::NotIPv4);
2264 while (!iterator.atEnd()) {
2265 if (isTabOrNewline(*iterator)) {
2266 didSeeSyntaxViolation = true;
2267 ++iterator;
2268 continue;
2269 }
2270 if (items.size() >= 4)
2271 return makeUnexpected(IPv4ParsingError::NotIPv4);
2272 items.append(parseIPv4Piece(iterator, didSeeSyntaxViolation));
2273 if (!iterator.atEnd() && *iterator == '.') {
2274 ++iterator;
2275 if (iterator.atEnd())
2276 didSeeSyntaxViolation = true;
2277 else if (*iterator == '.')
2278 return makeUnexpected(IPv4ParsingError::NotIPv4);
2279 }
2280 }
2281 if (!iterator.atEnd() || !items.size() || items.size() > 4)
2282 return makeUnexpected(IPv4ParsingError::NotIPv4);
2283 for (const auto& item : items) {
2284 if (!item.has_value() && item.error() == IPv4PieceParsingError::Failure)
2285 return makeUnexpected(IPv4ParsingError::NotIPv4);
2286 }
2287 for (const auto& item : items) {
2288 if (!item.has_value() && item.error() == IPv4PieceParsingError::Overflow)
2289 return makeUnexpected(IPv4ParsingError::Failure);
2290 }
2291 if (items.size() > 1) {
2292 for (size_t i = 0; i < items.size() - 1; i++) {
2293 if (items[i].value() > 255)
2294 return makeUnexpected(IPv4ParsingError::Failure);
2295 }
2296 }
2297 if (items[items.size() - 1].value() >= pow256(5 - items.size()))
2298 return makeUnexpected(IPv4ParsingError::Failure);
2299
2300 if (didSeeSyntaxViolation)
2301 syntaxViolation(iteratorForSyntaxViolationPosition);
2302 for (const auto& item : items) {
2303 if (item.value() > 255)
2304 syntaxViolation(iteratorForSyntaxViolationPosition);
2305 }
2306
2307 if (UNLIKELY(items.size() != 4))
2308 syntaxViolation(iteratorForSyntaxViolationPosition);
2309
2310 IPv4Address ipv4 = items.takeLast().value();
2311 for (size_t counter = 0; counter < items.size(); ++counter)
2312 ipv4 += items[counter].value() * pow256(3 - counter);
2313 return ipv4;
2314}
2315
2316template<typename CharacterType>
2317Optional<uint32_t> URLParser::parseIPv4PieceInsideIPv6(CodePointIterator<CharacterType>& iterator)
2318{
2319 if (iterator.atEnd())
2320 return WTF::nullopt;
2321 uint32_t piece = 0;
2322 bool leadingZeros = false;
2323 size_t digitCount = 0;
2324 while (!iterator.atEnd()) {
2325 if (!isASCIIDigit(*iterator))
2326 return WTF::nullopt;
2327 ++digitCount;
2328 if (!piece && *iterator == '0') {
2329 if (leadingZeros)
2330 return WTF::nullopt;
2331 leadingZeros = true;
2332 }
2333 if (!piece && *iterator == '0')
2334 leadingZeros = true;
2335 piece = piece * 10 + *iterator - '0';
2336 if (piece > 255)
2337 return WTF::nullopt;
2338 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2339 if (iterator.atEnd())
2340 break;
2341 if (*iterator == '.')
2342 break;
2343 }
2344 if (piece && leadingZeros)
2345 return WTF::nullopt;
2346 return piece;
2347}
2348
2349template<typename CharacterType>
2350Optional<URLParser::IPv4Address> URLParser::parseIPv4AddressInsideIPv6(CodePointIterator<CharacterType> iterator)
2351{
2352 IPv4Address address = 0;
2353 for (size_t i = 0; i < 4; ++i) {
2354 if (Optional<uint32_t> piece = parseIPv4PieceInsideIPv6(iterator))
2355 address = (address << 8) + piece.value();
2356 else
2357 return WTF::nullopt;
2358 if (i < 3) {
2359 if (iterator.atEnd())
2360 return WTF::nullopt;
2361 if (*iterator != '.')
2362 return WTF::nullopt;
2363 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2364 } else if (!iterator.atEnd())
2365 return WTF::nullopt;
2366 }
2367 ASSERT(iterator.atEnd());
2368 return address;
2369}
2370
// Parses an IPv6 literal into its eight 16-bit pieces.
//
// c must point at the opening '['. parseHostAndPort() passes a range that ends just
// before the closing ']', so reaching the end of c means reaching the end of the literal.
//
// Returns the address, or nullopt if the literal is malformed. syntaxViolation() is
// reported (at the '[') for accepted but non-canonical spellings: uppercase hex digits,
// leading zeros, an embedded IPv4 suffix, or a "::" that is not placed where the
// serializer would place it.
template<typename CharacterType>
Optional<URLParser::IPv6Address> URLParser::parseIPv6Host(CodePointIterator<CharacterType> c)
{
    ASSERT(*c == '[');
    const auto hostBegin = c;
    advance(c, hostBegin);
    if (c.atEnd())
        return WTF::nullopt;

    IPv6Address address = {{0, 0, 0, 0, 0, 0, 0, 0}};
    // Index of the next piece of address to be written.
    size_t piecePointer = 0;
    // Set once "::" has been seen. It holds piecePointer after the increment done for the
    // "::", which is why the canonical-position check at the end adds one to the index
    // returned by findLongestZeroSequence() before comparing.
    Optional<size_t> compressPointer;
    bool previousValueWasZero = false;
    bool immediatelyAfterCompress = false;

    // A literal may start with "::" but not with a single ':'.
    if (*c == ':') {
        advance(c, hostBegin);
        if (c.atEnd())
            return WTF::nullopt;
        if (*c != ':')
            return WTF::nullopt;
        advance(c, hostBegin);
        ++piecePointer;
        compressPointer = piecePointer;
        immediatelyAfterCompress = true;
    }

    while (!c.atEnd()) {
        if (piecePointer == 8)
            return WTF::nullopt;
        // The ':' separating pieces is consumed at the bottom of the loop, so a ':' seen
        // here is the second colon of a "::".
        if (*c == ':') {
            // Only one "::" is allowed.
            if (compressPointer)
                return WTF::nullopt;
            advance(c, hostBegin);
            ++piecePointer;
            compressPointer = piecePointer;
            immediatelyAfterCompress = true;
            if (previousValueWasZero)
                syntaxViolation(hostBegin);
            continue;
        }
        // Try to read the rest of the input as an embedded dotted-decimal IPv4 address.
        // It fills two pieces, so it is attempted at piece 6, or earlier after a "::".
        if (piecePointer == 6 || (compressPointer && piecePointer < 6)) {
            if (Optional<IPv4Address> ipv4Address = parseIPv4AddressInsideIPv6(c)) {
                if (compressPointer && piecePointer == 5)
                    return WTF::nullopt;
                syntaxViolation(hostBegin);
                address[piecePointer++] = ipv4Address.value() >> 16;
                address[piecePointer++] = ipv4Address.value() & 0xFFFF;
                // The IPv4 parser received a copy of c and consumed all of it; reset c so
                // the atEnd() check after the loop passes.
                // NOTE(review): assumes a default-constructed CodePointIterator reports atEnd() — confirm.
                c = { };
                break;
            }
        }
        // Otherwise read up to four hex digits as one piece.
        uint16_t value = 0;
        size_t length = 0;
        bool leadingZeros = false;
        for (; length < 4; length++) {
            if (c.atEnd())
                break;
            if (!isASCIIHexDigit(*c))
                break;
            if (isASCIIUpper(*c))
                syntaxViolation(hostBegin);
            if (*c == '0' && !length)
                leadingZeros = true;
            value = value * 0x10 + toASCIIHexValue(*c);
            advance(c, hostBegin);
        }

        previousValueWasZero = !value;
        // Non-canonical spellings: leading zeros on a non-zero piece, a zero piece written
        // with more than one digit, or a zero piece right after "::".
        if (UNLIKELY((value && leadingZeros) || (previousValueWasZero && (length > 1 || immediatelyAfterCompress))))
            syntaxViolation(hostBegin);

        address[piecePointer++] = value;
        if (c.atEnd())
            break;
        // After a piece only ':' may follow, and only if there is room for another piece.
        if (piecePointer == 8 || *c != ':')
            return WTF::nullopt;
        advance(c, hostBegin);
        // Input ending right after a single ':' is flagged here; the address is still
        // returned if the remaining checks pass.
        if (c.atEnd())
            syntaxViolation(hostBegin);

        immediatelyAfterCompress = false;
    }

    if (!c.atEnd())
        return WTF::nullopt;

    if (compressPointer) {
        // Move the pieces written after the "::" to the end of the address. The slots
        // they vacate receive the untouched zeros from the tail.
        size_t swaps = piecePointer - compressPointer.value();
        piecePointer = 7;
        while (swaps)
            std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]);
    } else if (piecePointer != 8)
        return WTF::nullopt;

    // If "::" was not used exactly where serializeIPv6() would place it (or was used or
    // omitted when the serializer would do the opposite), the input was not canonical.
    Optional<size_t> possibleCompressPointer = findLongestZeroSequence(address);
    if (possibleCompressPointer)
        possibleCompressPointer.value()++;
    if (UNLIKELY(compressPointer != possibleCompressPointer))
        syntaxViolation(hostBegin);

    return address;
}
2474
2475template<typename CharacterType>
2476URLParser::LCharBuffer URLParser::percentDecode(const LChar* input, size_t length, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2477{
2478 LCharBuffer output;
2479 output.reserveInitialCapacity(length);
2480
2481 for (size_t i = 0; i < length; ++i) {
2482 uint8_t byte = input[i];
2483 if (byte != '%')
2484 output.uncheckedAppend(byte);
2485 else if (length > 2 && i < length - 2) {
2486 if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2487 syntaxViolation(iteratorForSyntaxViolationPosition);
2488 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2489 i += 2;
2490 } else
2491 output.uncheckedAppend(byte);
2492 } else
2493 output.uncheckedAppend(byte);
2494 }
2495 return output;
2496}
2497
2498URLParser::LCharBuffer URLParser::percentDecode(const LChar* input, size_t length)
2499{
2500 LCharBuffer output;
2501 output.reserveInitialCapacity(length);
2502
2503 for (size_t i = 0; i < length; ++i) {
2504 uint8_t byte = input[i];
2505 if (byte != '%')
2506 output.uncheckedAppend(byte);
2507 else if (length > 2 && i < length - 2) {
2508 if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2509 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2510 i += 2;
2511 } else
2512 output.uncheckedAppend(byte);
2513 } else
2514 output.uncheckedAppend(byte);
2515 }
2516 return output;
2517}
2518
2519template<typename CharacterType> Optional<URLParser::LCharBuffer> URLParser::domainToASCII(StringImpl& domain, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2520{
2521 LCharBuffer ascii;
2522 if (domain.isAllASCII()) {
2523 size_t length = domain.length();
2524 if (domain.is8Bit()) {
2525 const LChar* characters = domain.characters8();
2526 ascii.reserveInitialCapacity(length);
2527 for (size_t i = 0; i < length; ++i) {
2528 if (UNLIKELY(isASCIIUpper(characters[i])))
2529 syntaxViolation(iteratorForSyntaxViolationPosition);
2530 ascii.uncheckedAppend(toASCIILower(characters[i]));
2531 }
2532 } else {
2533 const UChar* characters = domain.characters16();
2534 ascii.reserveInitialCapacity(length);
2535 for (size_t i = 0; i < length; ++i) {
2536 if (UNLIKELY(isASCIIUpper(characters[i])))
2537 syntaxViolation(iteratorForSyntaxViolationPosition);
2538 ascii.uncheckedAppend(toASCIILower(characters[i]));
2539 }
2540 }
2541 return ascii;
2542 }
2543
2544 const size_t maxDomainLength = 64;
2545 UChar hostnameBuffer[maxDomainLength];
2546 UErrorCode error = U_ZERO_ERROR;
2547 UIDNAInfo processingDetails = UIDNA_INFO_INITIALIZER;
2548 int32_t numCharactersConverted = uidna_nameToASCII(&internationalDomainNameTranscoder(), StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, maxDomainLength, &processingDetails, &error);
2549
2550 if (U_SUCCESS(error) && !processingDetails.errors) {
2551#if ASSERT_DISABLED
2552 UNUSED_PARAM(numCharactersConverted);
2553#else
2554 for (int32_t i = 0; i < numCharactersConverted; ++i) {
2555 ASSERT(isASCII(hostnameBuffer[i]));
2556 ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2557 }
2558#endif
2559 ascii.append(hostnameBuffer, numCharactersConverted);
2560 if (domain != StringView(ascii.data(), ascii.size()))
2561 syntaxViolation(iteratorForSyntaxViolationPosition);
2562 return ascii;
2563 }
2564 return WTF::nullopt;
2565}
2566
2567bool URLParser::hasForbiddenHostCodePoint(const URLParser::LCharBuffer& asciiDomain)
2568{
2569 for (size_t i = 0; i < asciiDomain.size(); ++i) {
2570 if (isForbiddenHostCodePoint(asciiDomain[i]))
2571 return true;
2572 }
2573 return false;
2574}
2575
2576template<typename CharacterType>
2577bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2578{
2579 ASSERT(*iterator == ':');
2580 auto colonIterator = iterator;
2581 advance(iterator, colonIterator);
2582 uint32_t port = 0;
2583 if (UNLIKELY(iterator.atEnd())) {
2584 unsigned portLength = currentPosition(colonIterator) - m_url.m_hostEnd;
2585 RELEASE_ASSERT(portLength <= URL::maxPortLength);
2586 m_url.m_portLength = portLength;
2587 syntaxViolation(colonIterator);
2588 return true;
2589 }
2590 size_t digitCount = 0;
2591 bool leadingZeros = false;
2592 for (; !iterator.atEnd(); ++iterator) {
2593 if (UNLIKELY(isTabOrNewline(*iterator))) {
2594 syntaxViolation(colonIterator);
2595 continue;
2596 }
2597 if (isASCIIDigit(*iterator)) {
2598 if (*iterator == '0' && !digitCount)
2599 leadingZeros = true;
2600 ++digitCount;
2601 port = port * 10 + *iterator - '0';
2602 if (port > std::numeric_limits<uint16_t>::max())
2603 return false;
2604 } else
2605 return false;
2606 }
2607
2608 if (port && leadingZeros)
2609 syntaxViolation(colonIterator);
2610
2611 if (!port && digitCount > 1)
2612 syntaxViolation(colonIterator);
2613
2614 ASSERT(port == static_cast<uint16_t>(port));
2615 if (UNLIKELY(defaultPortForProtocol(parsedDataView(0, m_url.m_schemeEnd)) == static_cast<uint16_t>(port)))
2616 syntaxViolation(colonIterator);
2617 else {
2618 appendToASCIIBuffer(':');
2619 ASSERT(port <= std::numeric_limits<uint16_t>::max());
2620 appendNumberToASCIIBuffer<uint16_t>(static_cast<uint16_t>(port));
2621 }
2622
2623 unsigned portLength = currentPosition(iterator) - m_url.m_hostEnd;
2624 RELEASE_ASSERT(portLength <= URL::maxPortLength);
2625 m_url.m_portLength = portLength;
2626 return true;
2627}
2628
// Parses "host[:port]" from iterator, emits the canonical host, records m_url.m_hostEnd and
// m_url.m_portLength, and delegates any ":port" suffix to parsePort().
//
// Four paths are taken depending on the input and parser state:
//   1. "[...]"                                 -> IPv6 literal.
//   2. !m_urlIsSpecial                         -> host is kept as-is apart from percent-encoding.
//   3. !m_hostHasPercentOrNonASCII             -> IPv4 address or lowercased ASCII domain.
//   4. otherwise                               -> UTF-8 encode, percent-decode, IDNA-convert, then
//                                                 IPv4 address or domain.
//
// Returns false if the host is empty, starts with ':', contains a forbidden code point, is a
// malformed IP literal, fails domain conversion, or if the port is invalid.
template<typename CharacterType>
bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
{
    if (iterator.atEnd())
        return false;
    if (*iterator == ':')
        return false;
    // Path 1: IPv6 literal.
    if (*iterator == '[') {
        // Locate the closing ']'; a literal without one is rejected.
        auto ipv6End = iterator;
        while (!ipv6End.atEnd() && *ipv6End != ']')
            ++ipv6End;
        if (ipv6End.atEnd())
            return false;
        // parseIPv6Host() receives the range from '[' up to, but not including, ']'.
        if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
            serializeIPv6(address.value());
            if (!ipv6End.atEnd()) {
                // Step over the ']'; only a port, or nothing, may follow it.
                advance(ipv6End);
                m_url.m_hostEnd = currentPosition(ipv6End);
                if (!ipv6End.atEnd() && *ipv6End == ':')
                    return parsePort(ipv6End);
                m_url.m_portLength = 0;
                return ipv6End.atEnd();
            }
            m_url.m_hostEnd = currentPosition(ipv6End);
            return true;
        }
        return false;
    }

    // Path 2: no IPv4 or domain processing; the host is percent-encoded with the simple
    // encode set. '%' is allowed through even though it is a forbidden host code point.
    if (!m_urlIsSpecial) {
        for (; !iterator.atEnd(); ++iterator) {
            if (UNLIKELY(isTabOrNewline(*iterator))) {
                syntaxViolation(iterator);
                continue;
            }
            if (*iterator == ':')
                break;
            if (UNLIKELY(isForbiddenHostCodePoint(*iterator) && *iterator != '%'))
                return false;
            utf8PercentEncode<isInSimpleEncodeSet>(iterator);
        }
        m_url.m_hostEnd = currentPosition(iterator);
        if (iterator.atEnd()) {
            m_url.m_portLength = 0;
            return true;
        }
        return parsePort(iterator);
    }

    // Path 3: the host needs neither percent-decoding nor IDNA conversion.
    if (LIKELY(!m_hostHasPercentOrNonASCII)) {
        // First pass: find the end of the host and reject forbidden code points, without
        // emitting anything yet.
        auto hostIterator = iterator;
        for (; !iterator.atEnd(); ++iterator) {
            if (isTabOrNewline(*iterator))
                continue;
            if (*iterator == ':')
                break;
            if (isForbiddenHostCodePoint(*iterator))
                return false;
        }
        auto address = parseIPv4Host(hostIterator, CodePointIterator<CharacterType>(hostIterator, iterator));
        if (address) {
            serializeIPv4(address.value());
            m_url.m_hostEnd = currentPosition(iterator);
            if (iterator.atEnd()) {
                m_url.m_portLength = 0;
                return true;
            }
            return parsePort(iterator);
        }
        // Numeric but out of range: the whole host is invalid.
        if (address.error() == IPv4ParsingError::Failure)
            return false;
        // Not an IPv4 address: second pass emits the host as a lowercased domain.
        for (; hostIterator != iterator; ++hostIterator) {
            if (UNLIKELY(isTabOrNewline(*hostIterator))) {
                syntaxViolation(hostIterator);
                continue;
            }
            if (UNLIKELY(isASCIIUpper(*hostIterator)))
                syntaxViolation(hostIterator);
            appendToASCIIBuffer(toASCIILower(*hostIterator));
        }
        m_url.m_hostEnd = currentPosition(iterator);
        // hostIterator has caught up with iterator, so it is at the ':' if there is one.
        if (!hostIterator.atEnd())
            return parsePort(hostIterator);
        // No port: m_hostEnd was just set from this same position.
        unsigned portLength = currentPosition(iterator) - m_url.m_hostEnd;
        RELEASE_ASSERT(portLength <= URL::maxPortLength);
        m_url.m_portLength = portLength;
        return true;
    }

    // Path 4: every syntax violation from here on is reported at the start of the host.
    const auto hostBegin = iterator;

    // Collect the host, up to ':' or the end of input, as UTF-8 bytes.
    LCharBuffer utf8Encoded;
    for (; !iterator.atEnd(); ++iterator) {
        if (UNLIKELY(isTabOrNewline(*iterator))) {
            syntaxViolation(hostBegin);
            continue;
        }
        if (*iterator == ':')
            break;
        if (UNLIKELY(!isASCII(*iterator)))
            syntaxViolation(hostBegin);

        // U8_APPEND_UNSAFE does no validation, so invalid code points are rejected first.
        if (!U_IS_UNICODE_CHAR(*iterator))
            return false;
        uint8_t buffer[U8_MAX_LENGTH];
        int32_t offset = 0;
        U8_APPEND_UNSAFE(buffer, offset, *iterator);
        utf8Encoded.append(buffer, offset);
    }
    // Percent-decode the bytes, then reinterpret the result as UTF-8. A null string here is
    // treated as a decoding failure.
    LCharBuffer percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size(), hostBegin);
    String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
    if (domain.isNull())
        return false;
    if (domain != StringView(percentDecoded.data(), percentDecoded.size()))
        syntaxViolation(hostBegin);
    // Forbidden code points are checked after conversion, on the final ASCII form.
    auto asciiDomain = domainToASCII(*domain.impl(), hostBegin);
    if (!asciiDomain || hasForbiddenHostCodePoint(asciiDomain.value()))
        return false;
    LCharBuffer& asciiDomainValue = asciiDomain.value();
    const LChar* asciiDomainCharacters = asciiDomainValue.data();

    // The converted name may turn out to be an IPv4 address.
    auto address = parseIPv4Host(hostBegin, CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()));
    if (address) {
        serializeIPv4(address.value());
        m_url.m_hostEnd = currentPosition(iterator);
        if (iterator.atEnd()) {
            m_url.m_portLength = 0;
            return true;
        }
        return parsePort(iterator);
    }
    if (address.error() == IPv4ParsingError::Failure)
        return false;

    // Ordinary domain name.
    appendToASCIIBuffer(asciiDomainCharacters, asciiDomainValue.size());
    m_url.m_hostEnd = currentPosition(iterator);
    if (!iterator.atEnd())
        return parsePort(iterator);
    m_url.m_portLength = 0;
    return true;
}
2770
2771Optional<String> URLParser::formURLDecode(StringView input)
2772{
2773 auto utf8 = input.utf8(StrictConversion);
2774 if (utf8.isNull())
2775 return WTF::nullopt;
2776 auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2777 return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2778}
2779
2780// https://url.spec.whatwg.org/#concept-urlencoded-parser
2781auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2782{
2783 URLEncodedForm output;
2784 for (StringView bytes : input.split('&')) {
2785 auto equalIndex = bytes.find('=');
2786 if (equalIndex == notFound) {
2787 auto name = formURLDecode(bytes.toString().replace('+', 0x20));
2788 if (name)
2789 output.append({ name.value(), emptyString() });
2790 } else {
2791 auto name = formURLDecode(bytes.substring(0, equalIndex).toString().replace('+', 0x20));
2792 auto value = formURLDecode(bytes.substring(equalIndex + 1).toString().replace('+', 0x20));
2793 if (name && value)
2794 output.append({ name.value(), value.value() });
2795 }
2796 }
2797 return output;
2798}
2799
2800static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2801{
2802 auto utf8 = input.utf8(StrictConversion);
2803 const char* data = utf8.data();
2804 for (size_t i = 0; i < utf8.length(); ++i) {
2805 const char byte = data[i];
2806 if (byte == 0x20)
2807 output.append(0x2B);
2808 else if (byte == 0x2A
2809 || byte == 0x2D
2810 || byte == 0x2E
2811 || (byte >= 0x30 && byte <= 0x39)
2812 || (byte >= 0x41 && byte <= 0x5A)
2813 || byte == 0x5F
2814 || (byte >= 0x61 && byte <= 0x7A)) // FIXME: Put these in the characterClassTable to avoid branches.
2815 output.append(byte);
2816 else
2817 percentEncodeByte(byte, output);
2818 }
2819}
2820
2821String URLParser::serialize(const URLEncodedForm& tuples)
2822{
2823 if (tuples.isEmpty())
2824 return { };
2825
2826 Vector<LChar> output;
2827 for (auto& tuple : tuples) {
2828 if (!output.isEmpty())
2829 output.append('&');
2830 serializeURLEncodedForm(tuple.key, output);
2831 output.append('=');
2832 serializeURLEncodedForm(tuple.value, output);
2833 }
2834 return String::adopt(WTFMove(output));
2835}
2836
2837const UIDNA& URLParser::internationalDomainNameTranscoder()
2838{
2839 static UIDNA* encoder;
2840 static std::once_flag onceFlag;
2841 std::call_once(onceFlag, [] {
2842 UErrorCode error = U_ZERO_ERROR;
2843 encoder = uidna_openUTS46(UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ | UIDNA_NONTRANSITIONAL_TO_UNICODE | UIDNA_NONTRANSITIONAL_TO_ASCII, &error);
2844 RELEASE_ASSERT(U_SUCCESS(error));
2845 RELEASE_ASSERT(encoder);
2846 });
2847 return *encoder;
2848}
2849
2850bool URLParser::allValuesEqual(const URL& a, const URL& b)
2851{
2852 URL_PARSER_LOG("%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
2853 a.m_isValid,
2854 a.m_cannotBeABaseURL,
2855 a.m_protocolIsInHTTPFamily,
2856 a.m_schemeEnd,
2857 a.m_userStart,
2858 a.m_userEnd,
2859 a.m_passwordEnd,
2860 a.m_hostEnd,
2861 a.m_hostEnd + a.m_portLength,
2862 a.m_pathAfterLastSlash,
2863 a.m_pathEnd,
2864 a.m_queryEnd,
2865 a.m_string.utf8().data(),
2866 b.m_isValid,
2867 b.m_cannotBeABaseURL,
2868 b.m_protocolIsInHTTPFamily,
2869 b.m_schemeEnd,
2870 b.m_userStart,
2871 b.m_userEnd,
2872 b.m_passwordEnd,
2873 b.m_hostEnd,
2874 b.m_hostEnd + b.m_portLength,
2875 b.m_pathAfterLastSlash,
2876 b.m_pathEnd,
2877 b.m_queryEnd,
2878 b.m_string.utf8().data());
2879
2880 return a.m_string == b.m_string
2881 && a.m_isValid == b.m_isValid
2882 && a.m_cannotBeABaseURL == b.m_cannotBeABaseURL
2883 && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2884 && a.m_schemeEnd == b.m_schemeEnd
2885 && a.m_userStart == b.m_userStart
2886 && a.m_userEnd == b.m_userEnd
2887 && a.m_passwordEnd == b.m_passwordEnd
2888 && a.m_hostEnd == b.m_hostEnd
2889 && a.m_portLength == b.m_portLength
2890 && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2891 && a.m_pathEnd == b.m_pathEnd
2892 && a.m_queryEnd == b.m_queryEnd;
2893}
2894
2895bool URLParser::internalValuesConsistent(const URL& url)
2896{
2897 return url.m_schemeEnd <= url.m_userStart
2898 && url.m_userStart <= url.m_userEnd
2899 && url.m_userEnd <= url.m_passwordEnd
2900 && url.m_passwordEnd <= url.m_hostEnd
2901 && url.m_hostEnd + url.m_portLength <= url.m_pathAfterLastSlash
2902 && url.m_pathAfterLastSlash <= url.m_pathEnd
2903 && url.m_pathEnd <= url.m_queryEnd
2904 && url.m_queryEnd <= url.m_string.length();
2905}
2906
2907} // namespace WTF
2908