module dparse.lexer;

import std.typecons;
import std.typetuple;
import std.array;
import std.algorithm;
import std.range;
import std.experimental.lexer;
import core.cpuid : sse42;
// `iasm64NotWindows` is defined only when 64-bit x86 inline assembly is
// available and the target is not Windows. The lexing functions in this
// module use it to guard their `haveSSE42` fast paths.
version (D_InlineAsm_X86_64)
{
    version (Windows) {}
    else version = iasm64NotWindows;
}

/// Operators
private enum operators = [
    ",", ".", "..", "...", "/", "/=", "!", "!<", "!<=", "!<>", "!<>=", "!=",
    "!>", "!>=", "$", "%", "%=", "&", "&&", "&=", "(", ")", "*", "*=", "+", "++",
    "+=", "-", "--", "-=", ":", ";", "<", "<<", "<<=", "<=", "<>", "<>=", "=",
    "==", "=>", ">", ">=", ">>", ">>=", ">>>", ">>>=", "?", "@", "[", "]", "^",
    "^=", "^^", "^^=", "{", "|", "|=", "||", "}", "~", "~="
];

/// Keywords
private enum keywords = [
    "abstract", "alias", "align", "asm", "assert", "auto", "bool",
    "break", "byte", "case", "cast", "catch", "cdouble", "cent", "cfloat",
    "char", "class", "const", "continue", "creal", "dchar", "debug", "default",
    "delegate", "delete", "deprecated", "do", "double", "else", "enum",
    "export", "extern", "false", "final", "finally", "float", "for", "foreach",
    "foreach_reverse", "function", "goto", "idouble", "if", "ifloat",
    "immutable", "import", "in", "inout", "int", "interface", "invariant",
    "ireal", "is", "lazy", "long", "macro", "mixin", "module", "new", "nothrow",
    "null", "out", "override", "package", "pragma", "private", "protected",
    "public", "pure", "real", "ref", "return", "scope", "shared", "short",
    "static", "struct", "super", "switch", "synchronized", "template", "this",
    "throw", "true", "try", "typedef", "typeid", "typeof", "ubyte", "ucent",
    "uint", "ulong", "union", "unittest", "ushort", "version", "void",
    "volatile", "wchar", "while", "with", "__DATE__", "__EOF__", "__FILE__",
    "__FUNCTION__", "__gshared", "__LINE__", "__MODULE__", "__parameters",
    "__PRETTY_FUNCTION__", "__TIME__", "__TIMESTAMP__", "__traits", "__vector",
    "__VENDOR__", "__VERSION__"
];

/// Other tokens
private enum dynamicTokens = [
    "specialTokenSequence", "comment", "identifier", "scriptLine",
    "whitespace", "doubleLiteral", "floatLiteral", "idoubleLiteral",
    "ifloatLiteral", "intLiteral", "longLiteral", "realLiteral",
    "irealLiteral", "uintLiteral", "ulongLiteral", "characterLiteral",
    "dstringLiteral", "stringLiteral", "wstringLiteral"
];

// Flat list of pairs: a source prefix followed by the name of the DLexer
// member function that lexes a token starting with that prefix. The list is
// handed to the `Lexer` mixin inside DLexer.
// NOTE(review): presumably the mixin dispatches on these prefixes - confirm
// against std.experimental.lexer.
private enum pseudoTokenHandlers = [
    "\"", "lexStringLiteral",
    "`", "lexWysiwygString",
    "//", "lexSlashSlashComment",
    "/*", "lexSlashStarComment",
    "/+", "lexSlashPlusComment",
    ".", "lexDot",
    "'", "lexCharacterLiteral",
    "0", "lexNumber",
    "1", "lexDecimal",
    "2", "lexDecimal",
    "3", "lexDecimal",
    "4", "lexDecimal",
    "5", "lexDecimal",
    "6", "lexDecimal",
    "7", "lexDecimal",
    "8", "lexDecimal",
    "9", "lexDecimal",
    "q\"", "lexDelimitedString",
    "q{", "lexTokenString",
    "r\"", "lexWysiwygString",
    "x\"", "lexHexString",
    " ", "lexWhitespace",
    "\t", "lexWhitespace",
    "\r", "lexWhitespace",
    "\n", "lexWhitespace",
    "\v", "lexWhitespace",
    "\f", "lexWhitespace",
    "\u2028", "lexLongNewline",
    "\u2029", "lexLongNewline",
    "#!", "lexScriptLine",
    "#line", "lexSpecialTokenSequence"
];

/// Token ID type for the D lexer.
public alias IdType = TokenIdType!(operators, dynamicTokens, keywords);

/**
 * Function used for converting an IdType to a string.
 *
 * Examples:
 * ---
 * IdType c = tok!"case";
 * assert (str(c) == "case");
 * ---
 */
public alias str = tokenStringRepresentation!(IdType, operators, dynamicTokens, keywords);

/**
 * Template used to refer to D token types.
 *
 * See the $(B operators), $(B keywords), and $(B dynamicTokens) enums for
 * values that can be passed to this template.
* Example:
 * ---
 * import dparse.lexer;
 * IdType t = tok!"floatLiteral";
 * ---
 */
public template tok(string token)
{
    alias tok = TokenId!(IdType, operators, dynamicTokens, keywords, token);
}

// Source text passed to TokenStructure when building `Token`: two comment
// fields plus ordering operators that compare by `index`.
// NOTE(review): `index` is not declared here; presumably it is a field that
// TokenStructure itself provides - confirm.
private enum extraFields = q{
    string comment;
    string trailingComment;

    int opCmp(size_t i) const pure nothrow @safe {
        if (index < i) return -1;
        if (index > i) return 1;
        return 0;
    }

    int opCmp(ref const typeof(this) other) const pure nothrow @safe {
        return opCmp(other.index);
    }
};

/// The token type in the D lexer
public alias Token = std.experimental.lexer.TokenStructure!(IdType, extraFields);

/**
 * Configure whitespace handling
 */
public enum WhitespaceBehavior : ubyte
{
    /// Whitespace tokens are returned and carry their text
    include = 0b0000_0000,
    /// Whitespace tokens are dropped by DLexer.popFront
    skip = 0b0000_0001,
}

/**
 * Configure string lexing behavior
 */
public enum StringBehavior : ubyte
{
    /// Do not include quote characters, process escape sequences
    compiler = 0b0000_0000,
    /// Opening quotes, closing quotes, and string suffixes are included in the
    /// string token
    includeQuoteChars = 0b0000_0001,
    /// String escape sequences are not replaced
    notEscaped = 0b0000_0010,
    /// Not modified at all. Useful for formatters or highlighters
    source = includeQuoteChars | notEscaped
}

/**
 * Lexer configuration struct
 */
public struct LexerConfig
{
    /// Name of the file being lexed
    string fileName;
    /// How string literal text is reported
    StringBehavior stringBehavior;
    /// Whether whitespace tokens are produced or skipped
    WhitespaceBehavior whitespaceBehavior;
}

/**
 * Returns: true if the given ID is for a basic type.
*/
public bool isBasicType(IdType type) nothrow pure @safe @nogc
{
    // One case list holding every built-in type keyword.
    switch (type)
    {
    case tok!"int", tok!"uint", tok!"double", tok!"idouble", tok!"float",
        tok!"ifloat", tok!"short", tok!"ushort", tok!"long", tok!"ulong",
        tok!"char", tok!"wchar", tok!"dchar", tok!"bool", tok!"void",
        tok!"cent", tok!"ucent", tok!"real", tok!"ireal", tok!"byte",
        tok!"ubyte", tok!"cdouble", tok!"cfloat", tok!"creal":
        return true;
    default:
        return false;
    }
}

/**
 * Tests whether a token ID denotes a numeric literal (any of the integer,
 * floating point or imaginary literal kinds).
 *
 * Returns: true if the given ID type is for a number literal.
 */
public bool isNumberLiteral(IdType type) nothrow pure @safe @nogc
{
    switch (type)
    {
    case tok!"doubleLiteral", tok!"floatLiteral", tok!"idoubleLiteral",
        tok!"ifloatLiteral", tok!"intLiteral", tok!"longLiteral",
        tok!"realLiteral", tok!"irealLiteral", tok!"uintLiteral",
        tok!"ulongLiteral":
        return true;
    default:
        return false;
    }
}

/**
 * Tests whether a token ID denotes one of the operator / punctuation tokens.
 *
 * Returns: true if the given ID type is for an operator.
 */
public bool isOperator(IdType type) nothrow pure @safe @nogc
{
    switch (type)
    {
    case tok!",", tok!".", tok!"..", tok!"...", tok!"/", tok!"/=", tok!"!",
        tok!"!<", tok!"!<=", tok!"!<>", tok!"!<>=", tok!"!=", tok!"!>",
        tok!"!>=", tok!"$", tok!"%", tok!"%=", tok!"&", tok!"&&", tok!"&=",
        tok!"(", tok!")", tok!"*", tok!"*=", tok!"+", tok!"++", tok!"+=",
        tok!"-", tok!"--", tok!"-=", tok!":", tok!";", tok!"<", tok!"<<",
        tok!"<<=", tok!"<=", tok!"<>", tok!"<>=", tok!"=", tok!"==", tok!"=>",
        tok!">", tok!">=", tok!">>", tok!">>=", tok!">>>", tok!">>>=",
        tok!"?", tok!"@", tok!"[", tok!"]", tok!"^", tok!"^=", tok!"^^",
        tok!"^^=", tok!"{", tok!"|", tok!"|=", tok!"||", tok!"}", tok!"~",
        tok!"~=":
        return true;
    default:
        return false;
    }
}

/**
 * Returns: true if the given ID type is for a keyword.
*/
public bool isKeyword(IdType type) pure nothrow @safe @nogc
{
    // The basic type names (int, char, ...) are not in this list; they are
    // reported by isBasicType instead.
    switch (type)
    {
    case tok!"abstract", tok!"alias", tok!"align", tok!"asm", tok!"assert",
        tok!"auto", tok!"break", tok!"case", tok!"cast", tok!"catch",
        tok!"class", tok!"const", tok!"continue", tok!"debug", tok!"default",
        tok!"delegate", tok!"delete", tok!"deprecated", tok!"do", tok!"else",
        tok!"enum", tok!"export", tok!"extern", tok!"false", tok!"final",
        tok!"finally", tok!"for", tok!"foreach", tok!"foreach_reverse",
        tok!"function", tok!"goto", tok!"if", tok!"immutable", tok!"import",
        tok!"in", tok!"inout", tok!"interface", tok!"invariant", tok!"is",
        tok!"lazy", tok!"macro", tok!"mixin", tok!"module", tok!"new",
        tok!"nothrow", tok!"null", tok!"out", tok!"override", tok!"package",
        tok!"pragma", tok!"private", tok!"protected", tok!"public",
        tok!"pure", tok!"ref", tok!"return", tok!"scope", tok!"shared",
        tok!"static", tok!"struct", tok!"super", tok!"switch",
        tok!"synchronized", tok!"template", tok!"this", tok!"throw",
        tok!"true", tok!"try", tok!"typedef", tok!"typeid", tok!"typeof",
        tok!"union", tok!"unittest", tok!"version", tok!"volatile",
        tok!"while", tok!"with", tok!"__DATE__", tok!"__EOF__",
        tok!"__FILE__", tok!"__FUNCTION__", tok!"__gshared", tok!"__LINE__",
        tok!"__MODULE__", tok!"__parameters", tok!"__PRETTY_FUNCTION__",
        tok!"__TIME__", tok!"__TIMESTAMP__", tok!"__traits", tok!"__vector",
        tok!"__VENDOR__", tok!"__VERSION__":
        return true;
    default:
        return false;
    }
}

/**
 * Tests whether a token ID denotes a narrow, wide or dchar string literal.
 *
 * Returns: true if the given ID type is for a string literal.
 */
public bool isStringLiteral(IdType type) pure nothrow @safe @nogc
{
    switch (type)
    {
    case tok!"dstringLiteral", tok!"stringLiteral", tok!"wstringLiteral":
        return true;
    default:
        return false;
    }
}

/**
 * Tests whether a token ID denotes one of the visibility keywords.
 *
 * Returns: true if the given ID type is for a protection attribute.
 */
public bool isProtection(IdType type) pure nothrow @safe @nogc
{
    switch (type)
    {
    case tok!"export", tok!"package", tok!"private", tok!"public",
        tok!"protected":
        return true;
    default:
        return false;
    }
}

/**
 * Returns: an array of tokens lexed from the given source code. All
 * whitespace tokens are skipped and comments are attached to the token nearest
 * to them.
*/
const(Token)[] getTokensForParser(ubyte[] sourceCode, LexerConfig config,
    StringCache* cache)
{
    // Classification of a comment token by its first three characters.
    enum CommentType : ubyte
    {
        notDoc,
        line,
        block
    }

    static CommentType commentType(string comment) pure nothrow @safe
    {
        if (comment.length < 3)
            return CommentType.notDoc;
        if (comment[0 ..3] == "///")
            return CommentType.line;
        if (comment[0 ..3] == "/++" || comment[0 ..3] == "/**")
            return CommentType.block;
        return CommentType.notDoc;
    }

    // The parser never sees whitespace, whatever the caller configured.
    config.whitespaceBehavior = WhitespaceBehavior.skip;

    auto output = appender!(typeof(return))();
    auto lexer = DLexer(sourceCode, config, cache);
    // Doc comment text collected so far for the next real token.
    string blockComment;
    // Number of tokens already written to `output`.
    size_t tokenCount;
    loop: while (!lexer.empty) switch (lexer.front.type)
    {
    case tok!"specialTokenSequence":
    case tok!"whitespace":
        lexer.popFront();
        break;
    case tok!"comment":
        final switch (commentType(lexer.front.text))
        {
        case CommentType.block:
            // A doc comment on the same line as the previous token trails
            // that token; otherwise it is kept for the next token.
            if (tokenCount > 0 && lexer.front.line == output.data[tokenCount - 1].line)
            {
                (cast() output.data[tokenCount - 1]).trailingComment = lexer.front.text;
            }
            else
            {
                blockComment = lexer.front.text;
            }
            lexer.popFront();
            break;
        case CommentType.line:
            if (tokenCount > 0 && lexer.front.line == output.data[tokenCount - 1].line)
            {
                (cast() output.data[tokenCount - 1]).trailingComment = lexer.front.text;
            }
            else
            {
                string c = lexer.front.text[3 .. $]; // just take the /// off entirely
                if(blockComment.length == 0) {
                    blockComment = "/++" ~ c ~ "\n+/"; // just rewrite to this
                } else {
                    // Merge this line into the pending comment: split off the
                    // closing part, append the line, then re-attach the close.
                    import std.string;
                    auto l = blockComment.lastIndexOf("\n");
                    string replacement;
                    if(l != -1) {
                        replacement = blockComment[l .. $];
                        blockComment = blockComment[0 .. l + 1];
                    } else {
                        replacement = blockComment[$-2 .. $];
                        blockComment = blockComment[0 .. $-2]; // just cut off the */ or +/
                    }
                    if(blockComment[0 .. 3] == "/**")
                        blockComment ~= c ~ replacement;
                    else if(blockComment[0 .. 3] == "/++")
                        blockComment ~= c ~ replacement;
                    else assert(0);

                }
            }
            lexer.popFront();
            break;
        case CommentType.notDoc:
            lexer.popFront();
            break;
        }
        break;
    case tok!"__EOF__":
        break loop;
    default:
        // A real token: attach the pending doc comment and emit it.
        Token t = lexer.front;
        lexer.popFront();
        tokenCount++;
        t.comment = blockComment;
        blockComment = null;
        output.put(t);
        break;
    }
    return output.data;
}

/**
 * The D lexer struct.
 */
public struct DLexer
{
    mixin Lexer!(Token, lexIdentifier, isSeparating, operators, dynamicTokens,
        keywords, pseudoTokenHandlers);

    ///
    @disable this();

    /**
     * Params:
     *     range = the bytes that compose the source code that will be lexed.
     *     config = the lexer configuration to use.
     *     cache = the string interning cache for de-duplicating identifiers and
     *         other token text.
     *     haveSSE42 = whether the SSE4.2 fast paths may be used; defaults to
     *         what core.cpuid reports.
     */
    this(ubyte[] range, const LexerConfig config, StringCache* cache,
        bool haveSSE42 = sse42()) pure nothrow @safe
    {
        this.haveSSE42 = haveSSE42;
        // Skip a leading UTF-8 byte order mark (EF BB BF) if present.
        auto r = (range.length >= 3 && range[0] == 0xef && range[1] == 0xbb && range[2] == 0xbf)
            ? range[3 .. $] : range;
        this.range = LexerRange(r);
        this.config = config;
        this.cache = cache;
        popFront();
    }

    /// Advances to the next token, dropping whitespace tokens when the
    /// configuration asks for them to be skipped.
    public void popFront()() pure nothrow @safe
    {
        do
            _popFront();
        while (config.whitespaceBehavior == WhitespaceBehavior.skip
            && _front.type == tok!"whitespace");
    }

private pure nothrow @safe:

    /**
     * Returns: true if the byte sequence at the current position is a
     * whitespace character: space, CR, LF, tab, VT, FF, or the UTF-8 encoding
     * of U+2028 / U+2029 (E2 80 A8 / E2 80 A9).
     *
     * The caller must ensure the range is not at its end.
     */
    bool isWhitespace()
    {
        switch (range.bytes[range.index])
        {
        case ' ':
        case '\r':
        case '\n':
        case '\t':
        case '\v':
        case '\f':
            return true;
        case 0xe2:
            // Index the bytes directly, exactly as lexWhitespace does, so the
            // result does not depend on how many bytes a peek() call returns.
            return range.index + 2 < range.bytes.length
                && range.bytes[range.index + 1] == 0x80
                && (range.bytes[range.index + 2] == 0xa8
                    || range.bytes[range.index + 2] == 0xa9);
        default:
            return false;
        }
    }

    /**
     * Pops one character from the range and bumps the line counter when that
     * character is a line terminator (CR, LF, CRLF, U+2028 or U+2029).
     *
     * The caller must ensure the range is not at its end.
     */
    void popFrontWhitespaceAware()
    {
        switch (range.bytes[range.index])
        {
        case '\r':
            range.popFront();
            // CRLF counts as a single line break.
            if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n')
            {
                range.popFront();
                range.incrementLine();
            }
            else
                range.incrementLine();
            return;
        case '\n':
            range.popFront();
            range.incrementLine();
            return;
        case 0xe2:
            // E2 80 A8 / E2 80 A9: consume all three bytes as one line break.
            if (range.index + 2 < range.bytes.length
                && range.bytes[range.index + 1] == 0x80
                && (range.bytes[range.index + 2] == 0xa8
                    || range.bytes[range.index + 2] == 0xa9))
            {
                range.index+=3;
                range.column+=3;
                range.incrementLine();
                return;
            }
            else
            {
                range.popFront();
                return;
            }
        default:
            range.popFront();
            return;
        }
    }

    /// Lexes a run of whitespace into a single whitespace token.
    void lexWhitespace(ref Token token) @trusted
    {
        mixin (tokenStart);
        loop: do
        {
            version (iasm64NotWindows)
            {
                // Fast path, taken only while more than 16 bytes remain.
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    skip!(true, '\t', ' ', '\v', '\f')(range.bytes.ptr + range.index,
                        &range.index, &range.column);
                }
            }
            switch (range.bytes[range.index])
            {
            case '\r':
                range.popFront();
                if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n')
                {
                    range.popFront();
                }
range.column = 1;
                range.line += 1;
                break;
            case '\n':
                range.popFront();
                range.column = 1;
                range.line += 1;
                break;
            case ' ':
            case '\t':
            case '\v':
            case '\f':
                range.popFront();
                break;
            case 0xe2:
                // Possible U+2028 / U+2029 (E2 80 A8 / E2 80 A9); anything
                // else starting with E2 ends the whitespace run.
                if (range.index + 2 >= range.bytes.length)
                    break loop;
                if (range.bytes[range.index + 1] != 0x80)
                    break loop;
                if (range.bytes[range.index + 2] == 0xa8 || range.bytes[range.index + 2] == 0xa9)
                {
                    range.index += 3;
                    range.column += 3;
                    range.column = 1;
                    range.line += 1;
                    break;
                }
                break loop;
            default:
                break loop;
            }
        } while (!(range.index >= range.bytes.length));
    end:
        // The token text is only interned when whitespace is being kept.
        string text = config.whitespaceBehavior == WhitespaceBehavior.include
            ? cache.intern(range.slice(mark)) : "";
        token = Token(tok!"whitespace", text, line, column, index);
    }

    /// Lexes a numeric literal that starts with '0', dispatching on the
    /// following byte to the hexadecimal, binary or decimal lexer.
    void lexNumber(ref Token token)
    {
        // NOTE(review): mark/line/column/index are used below without a
        // visible declaration; presumably `tokenStart` declares them - confirm.
        mixin (tokenStart);
        if (range.bytes[range.index] == '0' && range.index + 1 < range.bytes.length)
        {
            auto ahead = range.bytes[range.index + 1];
            switch (ahead)
            {
            case 'x':
            case 'X':
                // Skip the "0x" prefix before lexing the digits.
                range.index += 2;
                range.column += 2;
                lexHex(token, mark, line, column, index);
                return;
            case 'b':
            case 'B':
                // Skip the "0b" prefix before lexing the digits.
                range.index += 2;
                range.column += 2;
                lexBinary(token, mark, line, column, index);
                return;
            default:
                lexDecimal(token, mark, line, column, index);
                return;
            }
        }
        else
            lexDecimal(token, mark, line, column, index);
    }

    /// Lexes hexadecimal digits starting at the current position.
    void lexHex(ref Token token)
    {
        mixin (tokenStart);
        lexHex(token, mark, line, column, index);
    }

    /**
     * Lexes the digits (and optional fraction, exponent and suffix) of a
     * hexadecimal literal.
     *
     * Params:
     *     token = receives the resulting token
     *     mark = start-of-token mark; the token text is range.slice(mark)
     *     line = line stored in the token
     *     column = column stored in the token
     *     index = index stored in the token
     */
    void lexHex(ref Token token, size_t mark, size_t line, size_t column,
        size_t index) @trusted
    {
        IdType type = tok!"intLiteral";
        bool foundDot;
        hexLoop: while (!(range.index >= range.bytes.length))
        {
            switch (range.bytes[range.index])
            {
            case 'a': .. case 'f':
            case 'A': .. case 'F':
            case '0': .. case '9':
            case '_':
                version (iasm64NotWindows)
                {
                    // Fast path, taken only while more than 16 bytes remain.
                    if (haveSSE42 && range.index + 16 < range.bytes.length)
                    {
                        immutable ulong i = rangeMatch!(false, '0', '9', 'a', 'f', 'A', 'F', '_', '_')
                            (range.bytes.ptr + range.index);
                        range.column += i;
                        range.index += i;
                    }
                    else
                        range.popFront();
                }
                else
                    range.popFront();
                break;
            case 'u':
            case 'U':
                lexIntSuffix(type);
                break hexLoop;
            case 'i':
                // An 'i' suffix is only consumed after a fractional part.
                if (foundDot)
                    lexFloatSuffix(type);
                break hexLoop;
            case 'L':
                if (foundDot)
                    lexFloatSuffix(type);
                else
                    lexIntSuffix(type);
                break hexLoop;
            case 'p':
            case 'P':
                lexExponent(type);
                break hexLoop;
            case '.':
                // Stop on a second dot, a dot at end of input, or "..".
                if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.')
                    break hexLoop;
                else
                {
                    // The following bit of silliness tries to tell the
                    // difference between "int dot identifier" and
                    // "double identifier".
                    if ((range.index + 1 < range.bytes.length))
                    {
                        switch (range.peekAt(1))
                        {
                        case '0': .. case '9':
                        case 'A': .. case 'F':
                        case 'a': .. case 'f':
                            goto doubleLiteral;
                        default:
                            break hexLoop;
                        }
                    }
                    else
                    {
                    doubleLiteral:
                        range.popFront();
                        foundDot = true;
                        type = tok!"doubleLiteral";
                    }
                }
                break;
            default:
                break hexLoop;
            }
        }
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /// Lexes binary digits starting at the current position.
    void lexBinary(ref Token token)
    {
        mixin (tokenStart);
        return lexBinary(token, mark, line, column, index);
    }

    /**
     * Lexes the digits and optional integer suffix of a binary literal.
     *
     * Params:
     *     token = receives the resulting token
     *     mark = start-of-token mark; the token text is range.slice(mark)
     *     line = line stored in the token
     *     column = column stored in the token
     *     index = index stored in the token
     */
    void lexBinary(ref Token token, size_t mark, size_t line, size_t column,
        size_t index) @trusted
    {
        IdType type = tok!"intLiteral";
        binaryLoop: while (!(range.index >= range.bytes.length))
        {
            switch (range.bytes[range.index])
            {
            case '0':
            case '1':
            case '_':
                version (iasm64NotWindows)
                {
                    // Fast path, taken only while more than 16 bytes remain.
                    if (haveSSE42 && range.index + 16 < range.bytes.length)
                    {
                        immutable ulong i = rangeMatch!(false, '0', '1', '_', '_')(
                            range.bytes.ptr + range.index);
                        range.column += i;
                        range.index += i;
                    }
                    else
                        range.popFront();
                }
                else
                    range.popFront();
                break;
            case 'u':
            case 'U':
            case 'L':
                lexIntSuffix(type);
                break binaryLoop;
            default:
                break binaryLoop;
            }
        }
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /// Lexes a decimal integer or floating point literal starting at the
    /// current position.
    void lexDecimal(ref Token token)
    {
        mixin (tokenStart);
        lexDecimal(token, mark, line, column, index);
    }

    /**
     * Lexes a decimal literal: digits, an optional fraction, an optional
     * exponent and an optional suffix.
     *
     * Params:
     *     token = receives the resulting token
     *     mark = start-of-token mark; the token text is range.slice(mark)
     *     line = line stored in the token
     *     column = column stored in the token
     *     index = index stored in the token
     */
    void lexDecimal(ref Token token, size_t mark, size_t line, size_t column,
        size_t index) @trusted
    {
        // A literal may begin with the dot itself (e.g. ".5").
        bool foundDot = range.bytes[range.index] == '.';
        IdType type = tok!"intLiteral";
        if (foundDot)
        {
            range.popFront();
            type = tok!"doubleLiteral";
        }

        decimalLoop: while (!(range.index >= range.bytes.length))
        {
            switch (range.bytes[range.index])
            {
            case '0': .. case '9':
            case '_':
                version (iasm64NotWindows)
                {
                    // Fast path, taken only while more than 16 bytes remain.
                    if (haveSSE42 && range.index + 16 < range.bytes.length)
                    {
                        ulong i = rangeMatch!(false, '0', '9', '_', '_')(range.bytes.ptr + range.index);
                        range.column += i;
                        range.index += i;
                    }
                    else
                        range.popFront();
                }
                else
                    range.popFront();
                break;
            case 'u':
            case 'U':
                // An unsigned suffix is not consumed after a fractional part.
                if (!foundDot)
                    lexIntSuffix(type);
                break decimalLoop;
            case 'i':
                lexFloatSuffix(type);
                break decimalLoop;
            case 'L':
                if (foundDot)
                    lexFloatSuffix(type);
                else
                    lexIntSuffix(type);
                break decimalLoop;
            case 'f':
            case 'F':
                lexFloatSuffix(type);
                break decimalLoop;
            case 'e':
            case 'E':
                lexExponent(type);
                break decimalLoop;
            case '.':
                // Stop on a second dot, a dot at end of input, or "..".
                if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.')
                    break decimalLoop;
                else
                {
                    // The following bit of silliness tries to tell the
                    // difference between "int dot identifier" and
                    // "double identifier".
if ((range.index + 1 < range.bytes.length))
                    {
                        // Punctuation, digits, '`' or '_' after the dot mean
                        // the dot belongs to the number; any other byte
                        // (e.g. a letter) ends the literal before the dot.
                        auto ch = range.peekAt(1);
                        if (ch <= 0x2f
                            || (ch >= '0' && ch <= '9')
                            || (ch >= ':' && ch <= '@')
                            || (ch >= '[' && ch <= '^')
                            || (ch >= '{' && ch <= '~')
                            || ch == '`' || ch == '_')
                        {
                            goto doubleLiteral;
                        }
                        else
                            break decimalLoop;
                    }
                    else
                    {
                    doubleLiteral:
                        range.popFront();
                        foundDot = true;
                        type = tok!"doubleLiteral";
                    }
                }
                break;
            default:
                break decimalLoop;
            }
        }
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /**
     * Consumes an integer suffix (u/U and/or L/l, in either order) and
     * updates `type` to the matching unsigned / long literal kind.
     *
     * The caller must ensure the range is not at its end.
     */
    void lexIntSuffix(ref IdType type)
    {
        // Set once an L has been followed by a U, so the U branch does not
        // look for yet another L.
        bool secondPass;
        if (range.bytes[range.index] == 'u' || range.bytes[range.index] == 'U')
        {
    U:
            if (type == tok!"intLiteral")
                type = tok!"uintLiteral";
            else
                type = tok!"ulongLiteral";
            range.popFront();
            if (secondPass)
                return;
            if (range.index < range.bytes.length && (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l'))
                goto L;
            return;
        }
        if (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l')
        {
    L:
            if (type == tok!"uintLiteral")
                type = tok!"ulongLiteral";
            else
                type = tok!"longLiteral";
            range.popFront();
            if (!secondPass && range.index < range.bytes.length && (range.bytes[range.index] == 'U' || range.bytes[range.index] == 'u'))
            {
                secondPass = true;
                goto U;
            }
            return;
        }
    }

    /**
     * Consumes a floating point suffix (`L`, `f`, `F`, optionally followed by
     * `i`) and updates `type` accordingly:
     * `f`/`F` gives floatLiteral, `L` gives realLiteral, and a trailing `i`
     * turns the result into the matching imaginary literal kind.
     *
     * The caller must ensure the range is not at its end.
     */
    void lexFloatSuffix(ref IdType type) pure nothrow @safe
    {
        switch (range.bytes[range.index])
        {
        case 'L':
            range.popFront();
            // The L suffix denotes a real literal, not a double.
            type = tok!"realLiteral";
            break;
        case 'f':
        case 'F':
            range.popFront();
            type = tok!"floatLiteral";
            break;
        default:
            break;
        }
        if (!(range.index >= range.bytes.length) && range.bytes[range.index] == 'i')
        {
            warning("Complex number literals are deprecated");
            range.popFront();
            if (type == tok!"floatLiteral")
                type = tok!"ifloatLiteral";
            else if (type == tok!"realLiteral")
                type = tok!"irealLiteral";
            else
                type = tok!"idoubleLiteral";
        }
    }

    /**
     * Consumes an exponent: the marker byte at the current position (the
     * callers pass e/E/p/P), an optional sign, digits, and an optional float
     * suffix.
     *
     * A literal with an exponent is floating point, so `type` becomes
     * doubleLiteral unless a suffix selects another floating point kind.
     */
    void lexExponent(ref IdType type) pure nothrow @safe
    {
        range.popFront();
        // e.g. `1e5` has no dot but is still a floating point literal.
        type = tok!"doubleLiteral";
        bool foundSign = false;
        bool foundDigit = false;
        while (!(range.index >= range.bytes.length))
        {
            switch (range.bytes[range.index])
            {
            case '-':
            case '+':
                // A sign is part of the exponent only directly after the
                // marker. Once a sign or a digit has been seen, a further
                // sign is an operator that follows the literal (`1e5+3`).
                if (foundSign || foundDigit)
                {
                    if (!foundDigit)
                        error("Expected an exponent");
                    return;
                }
                foundSign = true;
                range.popFront();
                break;
            case '0': .. case '9':
            case '_':
                foundDigit = true;
                range.popFront();
                break;
            case 'L':
            case 'f':
            case 'F':
            case 'i':
                lexFloatSuffix(type);
                return;
            default:
                if (!foundDigit)
                    error("Expected an exponent");
                return;
            }
        }
    }

    /// Lexes a `#!` script line: everything up to, but not including, the
    /// next newline.
    void lexScriptLine(ref Token token)
    {
        mixin (tokenStart);
        while (!(range.index >= range.bytes.length) && !isNewline)
        {
            range.popFront();
        }
        token = Token(tok!"scriptLine", cache.intern(range.slice(mark)),
            line, column, index);
    }

    /// Lexes a `#line` special token sequence: everything up to, but not
    /// including, the next newline.
    void lexSpecialTokenSequence(ref Token token)
    {
        mixin (tokenStart);
        while (!(range.index >= range.bytes.length) && !isNewline)
        {
            range.popFront();
        }
        token = Token(tok!"specialTokenSequence", cache.intern(range.slice(mark)),
            line, column, index);
    }

    /// Lexes a `/* ... */` comment. Reaching the end of input before the
    /// closing `*/` ends the token without an error.
    void lexSlashStarComment(ref Token token) @trusted
    {
        mixin (tokenStart);
        IdType type = tok!"comment";
        // Skip the opening "/*".
        range.popFrontN(2);
        while (range.index < range.bytes.length)
        {
            version (iasm64NotWindows)
            {
                // Fast path, taken only while more than 16 bytes remain.
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                    skip!(false, '\r', '\n', '/', '*', 0xe2)(range.bytes.ptr + range.index,
                        &range.index, &range.column);
            }
            if (range.bytes[range.index] == '*')
            {
                range.popFront();
                if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/')
                {
                    range.popFront();
                    break;
                }
            }
            else
                popFrontWhitespaceAware();
        }
    end:
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /// Lexes a `//` comment up to, but not including, the next CR or LF.
    void lexSlashSlashComment(ref Token token) @trusted
    {
        mixin (tokenStart);
        IdType type = tok!"comment";
        // Skip the opening "//".
        range.popFrontN(2);
        while (range.index < range.bytes.length)
        {
            version (iasm64NotWindows)
            {
                // Fast path, taken only while more than 16 bytes remain.
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    skip!(false, '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
                        &range.index, &range.column);
                }
            }
            if (range.bytes[range.index] == '\r' || range.bytes[range.index] == '\n')
                break;
            range.popFront();
        }
    end:
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /// Lexes a nesting `/+ ... +/` comment, tracking the nesting depth.
    void lexSlashPlusComment(ref Token token) @trusted
    {
        mixin (tokenStart);
        IdType type = tok!"comment";
        // Skip the opening "/+".
        range.index += 2;
        range.column += 2;
        int depth = 1;
        while (depth > 0 && !(range.index >= range.bytes.length))
        {
            version (iasm64NotWindows)
            {
                // Fast path, taken only while more than 16 bytes remain.
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    skip!(false, '+', '/', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
                        &range.index, &range.column);
                }
            }
            if (range.bytes[range.index] == '+')
            {
                range.popFront();
                if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/')
                {
                    range.popFront();
                    depth--;
                }
            }
            else if (range.bytes[range.index] == '/')
            {
                range.popFront();
                if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '+')
                {
                    range.popFront();
                    depth++;
                }
            }
            else
                popFrontWhitespaceAware();
        }
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    void
lexStringLiteral(ref Token token) @trusted
    {
        // Lexes a double-quoted string literal, including escape sequences
        // and an optional c/w/d suffix.
        mixin (tokenStart);
        // Skip the opening quote.
        range.popFront();
        while (true)
        {
            if ((range.index >= range.bytes.length))
            {
                error("Error: unterminated string literal");
                token = Token(tok!"");
                return;
            }
            version (iasm64NotWindows)
            {
                // Fast path, taken only while more than 16 bytes remain.
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    skip!(false, '"', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
                        &range.index, &range.column);
                }
            }
            if (range.bytes[range.index] == '"')
            {
                range.popFront();
                break;
            }
            else if (range.bytes[range.index] == '\\')
            {
                lexEscapeSequence();
            }
            else
                popFrontWhitespaceAware();
        }
        IdType type = tok!"stringLiteral";
        lexStringSuffix(type);
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /// Lexes a WYSIWYG string: either a backtick string or an `r"..."`
    /// string. No escape sequences are processed in either form.
    void lexWysiwygString(ref Token token) @trusted
    {
        mixin (tokenStart);
        IdType type = tok!"stringLiteral";
        bool backtick = range.bytes[range.index] == '`';
        if (backtick)
        {
            // Skip the opening backtick.
            range.popFront();
            while (true)
            {
                if ((range.index >= range.bytes.length))
                {
                    error("Error: unterminated string literal");
                    token = Token(tok!"");
                    return;
                }
                version (iasm64NotWindows)
                {
                    // Fast path, taken only while more than 16 bytes remain.
                    if (haveSSE42 && range.index + 16 < range.bytes.length)
                    {
                        skip!(false, '\r', '\n', 0xe2, '`')(range.bytes.ptr + range.index,
                            &range.index, &range.column);
                    }
                }
                if (range.bytes[range.index] == '`')
                {
                    range.popFront();
                    break;
                }
                else
                    popFrontWhitespaceAware();
            }
        }
        else
        {
            // Skip the 'r' ...
            range.popFront();
            if ((range.index >= range.bytes.length))
            {
                error("Error: unterminated string literal");
                token = Token(tok!"");
                return;
            }
            // ... and the opening quote.
            range.popFront();
            while (true)
            {
                if ((range.index >= range.bytes.length))
                {
                    error("Error: unterminated string literal");
                    token = Token(tok!"");
                    return;
                }
                else if (range.bytes[range.index] == '"')
                {
                    range.popFront();
                    break;
                }
                else
                    popFrontWhitespaceAware();
            }
        }
        lexStringSuffix(type);
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /**
     * Consumes an optional string suffix and sets `type` to match.
     *
     * Returns: the suffix byte that was consumed ('w', 'd' or 'c'), or 0 if
     * there was none.
     */
    private ubyte lexStringSuffix(ref IdType type) pure nothrow @safe
    {
        if ((range.index >= range.bytes.length))
        {
            type = tok!"stringLiteral";
            return 0;
        }
        else
        {
            switch (range.bytes[range.index])
            {
            case 'w': range.popFront(); type = tok!"wstringLiteral"; return 'w';
            case 'd': range.popFront(); type = tok!"dstringLiteral"; return 'd';
            case 'c': range.popFront(); type = tok!"stringLiteral"; return 'c';
            default: type = tok!"stringLiteral"; return 0;
            }
        }
    }

    /**
     * Lexes a `q"..."` delimited string. A bracket character after `q"`
     * selects a nesting-delimiter string; anything else is treated as a
     * heredoc identifier.
     */
    void lexDelimitedString(ref Token token)
    {
        mixin (tokenStart);
        // Skip the opening q".
        range.index += 2;
        range.column += 2;
        // The input may end right after q"; do not read past the end.
        if (range.index >= range.bytes.length)
        {
            error("Error: unterminated string literal");
            token = Token(tok!"");
            return;
        }
        ubyte open;
        ubyte close;
        switch (range.bytes[range.index])
        {
        case '<':
            open = '<';
            close = '>';
            break;
        case '{':
            open = '{';
            close = '}';
            break;
        case '[':
            open = '[';
            close = ']';
            break;
        case '(':
            open = '(';
            close = ')';
            break;
        default:
            lexHeredocString(token, mark, line, column, index);
            return;
        }
        // Skip the opening delimiter.
        range.popFront();
        lexNormalDelimitedString(token, mark, line, column, index, open, close);
    }

    /**
     * Lexes the body of a nesting-delimiter string; the opening delimiter has
     * already been consumed.
     *
     * Params:
     *     token = receives the resulting token
     *     mark = start-of-token mark; the token text is range.slice(mark)
     *     line = line stored in the token
     *     column = column stored in the token
     *     index = index stored in the token
     *     open = opening delimiter byte (increases nesting depth)
     *     close = closing delimiter byte (decreases nesting depth)
     */
    void lexNormalDelimitedString(ref Token token, size_t mark, size_t line, size_t column,
        size_t index, ubyte open, ubyte close)
    {
        int depth = 1;
        while (!(range.index >= range.bytes.length) && depth > 0)
        {
            if (range.bytes[range.index] == open)
            {
                depth++;
                range.popFront();
            }
            else if (range.bytes[range.index] == close)
            {
                depth--;
                range.popFront();
                if (depth <= 0)
                {
                    // The closing delimiter may be the last byte of the
                    // input, so check the bounds before looking for the quote.
                    if (range.index < range.bytes.length && range.bytes[range.index] == '"')
                    {
                        range.popFront();
                    }
                    else
                    {
                        error("Error: \" expected to end delimited string literal");
                        token = Token(tok!"");
                        return;
                    }
                }
            }
            else
                popFrontWhitespaceAware();
        }
        // The input ended before the delimiters balanced.
        if (depth > 0)
        {
            error("Error: unterminated string literal");
            token = Token(tok!"");
            return;
        }
        IdType type = tok!"stringLiteral";
        lexStringSuffix(type);
        token = Token(type, cache.intern(range.slice(mark)), line, column, index);
    }

    /**
     * Lexes a heredoc string: `q"IDENT`, a newline, the text, then a line
     * starting with IDENT followed by the closing quote.
     *
     * Params:
     *     token = receives the resulting token
     *     mark = start-of-token mark; the token text is range.slice(mark)
     *     line = line stored in the token
     *     column = column stored in the token
     *     index = index stored in the token
     */
    void lexHeredocString(ref Token token, size_t mark, size_t line, size_t column, size_t index)
    {
        Token ident;
        lexIdentifier(ident);
        // Only look for the newline while input remains.
        if (!(range.index >= range.bytes.length) && isNewline())
            popFrontWhitespaceAware();
        else
            error("Newline expected");
        while (!(range.index >= range.bytes.length))
        {
            if (isNewline())
            {
                popFrontWhitespaceAware();
                if (!range.canPeek(ident.text.length))
                {
                    error(ident.text ~ " expected");
                    break;
                }
                // The terminating identifier must start the line.
                if (range.peek(ident.text.length - 1) == ident.text)
                {
                    range.popFrontN(ident.text.length);
                    break;
                }
            }
            else
            {
                range.popFront();
            }
        }
        if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '"')
        {
            range.popFront();
        }
        else
            error(`" expected`);
        IdType type = tok!"stringLiteral";
        lexStringSuffix(type);
        token = Token(type, cache.intern(range.slice(mark)), line, column, index);
    }

    /// Lexes a `q{ ... }` token string by lexing the tokens inside it and
    /// concatenating their text.
    void lexTokenString(ref Token token)
    {
        mixin (tokenStart);
        assert (range.bytes[range.index] == 'q');
        range.popFront();
        assert (range.bytes[range.index] == '{');
        range.popFront();
        auto app = appender!string();
        app.put("q{");
int depth = 1; 1436 1437 immutable WhitespaceBehavior oldWhitespace = config.whitespaceBehavior; 1438 immutable StringBehavior oldString = config.stringBehavior; 1439 config.whitespaceBehavior = WhitespaceBehavior.include; 1440 config.stringBehavior = StringBehavior.source; 1441 scope (exit) 1442 { 1443 config.whitespaceBehavior = oldWhitespace; 1444 config.stringBehavior = oldString; 1445 } 1446 1447 advance(_front); 1448 while (depth > 0 && !empty) 1449 { 1450 auto t = front(); 1451 if (t.text is null) 1452 app.put(str(t.type)); 1453 else 1454 app.put(t.text); 1455 if (t.type == tok!"}") 1456 { 1457 depth--; 1458 if (depth > 0) 1459 popFront(); 1460 } 1461 else if (t.type == tok!"{") 1462 { 1463 depth++; 1464 popFront(); 1465 } 1466 else 1467 popFront(); 1468 } 1469 IdType type = tok!"stringLiteral"; 1470 auto b = lexStringSuffix(type); 1471 if (b != 0) 1472 app.put(b); 1473 token = Token(type, cache.intern(cast(const(ubyte)[]) app.data), line, 1474 column, index); 1475 } 1476 1477 void lexHexString(ref Token token) 1478 { 1479 mixin (tokenStart); 1480 range.index += 2; 1481 range.column += 2; 1482 1483 loop: while (true) 1484 { 1485 if ((range.index >= range.bytes.length)) 1486 { 1487 error("Error: unterminated hex string literal"); 1488 token = Token(tok!""); 1489 return; 1490 } 1491 else if (isWhitespace()) 1492 popFrontWhitespaceAware(); 1493 else switch (range.bytes[range.index]) 1494 { 1495 case '0': .. case '9': 1496 case 'A': .. case 'F': 1497 case 'a': .. 
case 'f': 1498 range.popFront(); 1499 break; 1500 case '"': 1501 range.popFront(); 1502 break loop; 1503 default: 1504 error("Error: invalid character in hex string"); 1505 token = Token(tok!""); 1506 return; 1507 } 1508 } 1509 1510 IdType type = tok!"stringLiteral"; 1511 lexStringSuffix(type); 1512 token = Token(type, cache.intern(range.slice(mark)), line, column, 1513 index); 1514 } 1515 1516 bool lexEscapeSequence() 1517 { 1518 range.popFront(); 1519 if ((range.index >= range.bytes.length)) 1520 { 1521 error("Error: non-terminated character escape sequence."); 1522 return false; 1523 } 1524 switch (range.bytes[range.index]) 1525 { 1526 case '\'': 1527 case '"': 1528 case '?': 1529 case '\\': 1530 case 'a': 1531 case 'b': 1532 case 'f': 1533 case 'n': 1534 case 'r': 1535 case 't': 1536 case 'v': 1537 range.popFront(); 1538 break; 1539 case 'x': 1540 range.popFront(); 1541 foreach (i; 0 .. 2) 1542 { 1543 if ((range.index >= range.bytes.length)) 1544 { 1545 error("Error: 2 hex digits expected."); 1546 return false; 1547 } 1548 switch (range.bytes[range.index]) 1549 { 1550 case '0': .. case '9': 1551 case 'a': .. case 'f': 1552 case 'A': .. case 'F': 1553 range.popFront(); 1554 break; 1555 default: 1556 error("Error: 2 hex digits expected."); 1557 return false; 1558 } 1559 } 1560 break; 1561 case '0': 1562 if (!(range.index + 1 < range.bytes.length) || ((range.index + 1 < range.bytes.length) && range.peekAt(1) == '\'')) 1563 { 1564 range.popFront(); 1565 break; 1566 } 1567 goto case; 1568 case '1': .. case '7': 1569 for (size_t i = 0; i < 3 && !(range.index >= range.bytes.length) && range.bytes[range.index] >= '0' && range.bytes[range.index] <= '7'; i++) 1570 range.popFront(); 1571 break; 1572 case 'u': 1573 range.popFront(); 1574 foreach (i; 0 .. 4) 1575 { 1576 if ((range.index >= range.bytes.length)) 1577 { 1578 error("Error: at least 4 hex digits expected."); 1579 return false; 1580 } 1581 switch (range.bytes[range.index]) 1582 { 1583 case '0': .. 
case '9': 1584 case 'a': .. case 'f': 1585 case 'A': .. case 'F': 1586 range.popFront(); 1587 break; 1588 default: 1589 error("Error: at least 4 hex digits expected."); 1590 return false; 1591 } 1592 } 1593 break; 1594 case 'U': 1595 range.popFront(); 1596 foreach (i; 0 .. 8) 1597 { 1598 if ((range.index >= range.bytes.length)) 1599 { 1600 error("Error: at least 8 hex digits expected."); 1601 return false; 1602 } 1603 switch (range.bytes[range.index]) 1604 { 1605 case '0': .. case '9': 1606 case 'a': .. case 'f': 1607 case 'A': .. case 'F': 1608 range.popFront(); 1609 break; 1610 default: 1611 error("Error: at least 8 hex digits expected."); 1612 return false; 1613 } 1614 } 1615 break; 1616 default: 1617 while (true) 1618 { 1619 if ((range.index >= range.bytes.length)) 1620 { 1621 error("Error: non-terminated character escape sequence."); 1622 return false; 1623 } 1624 if (range.bytes[range.index] == ';') 1625 { 1626 range.popFront(); 1627 break; 1628 } 1629 else 1630 { 1631 range.popFront(); 1632 } 1633 } 1634 } 1635 return true; 1636 } 1637 1638 void lexCharacterLiteral(ref Token token) 1639 { 1640 mixin (tokenStart); 1641 range.popFront(); 1642 if (range.bytes[range.index] == '\\') 1643 { 1644 lexEscapeSequence(); 1645 goto close; 1646 } 1647 else if (range.bytes[range.index] == '\'') 1648 { 1649 range.popFront(); 1650 token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)), 1651 line, column, index); 1652 } 1653 else if (range.bytes[range.index] & 0x80) 1654 { 1655 while (range.bytes[range.index] & 0x80) 1656 { 1657 range.popFront(); 1658 } 1659 goto close; 1660 } 1661 else 1662 { 1663 popFrontWhitespaceAware(); 1664 goto close; 1665 } 1666 close: 1667 if (range.index < range.bytes.length && range.bytes[range.index] == '\'') 1668 { 1669 range.popFront(); 1670 token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)), 1671 line, column, index); 1672 } 1673 else 1674 { 1675 error("Error: Expected ' to end character literal"); 1676 token 
= Token(tok!""); 1677 } 1678 } 1679 1680 void lexIdentifier(ref Token token) @trusted 1681 { 1682 mixin (tokenStart); 1683 if (isSeparating(0)) 1684 { 1685 error("Invalid identifier"); 1686 range.popFront(); 1687 } 1688 while (true) 1689 { 1690 version (iasm64NotWindows) 1691 { 1692 if (haveSSE42 && range.index + 16 < range.bytes.length) 1693 { 1694 immutable ulong i = rangeMatch!(false, 'a', 'z', 'A', 'Z', '_', '_') 1695 (range.bytes.ptr + range.index); 1696 range.column += i; 1697 range.index += i; 1698 } 1699 } 1700 if (isSeparating(0)) 1701 break; 1702 else 1703 range.popFront(); 1704 } 1705 token = Token(tok!"identifier", cache.intern(range.slice(mark)), line, 1706 column, index); 1707 } 1708 1709 void lexDot(ref Token token) 1710 { 1711 mixin (tokenStart); 1712 if (!(range.index + 1 < range.bytes.length)) 1713 { 1714 range.popFront(); 1715 token = Token(tok!".", null, line, column, index); 1716 return; 1717 } 1718 switch (range.peekAt(1)) 1719 { 1720 case '0': .. case '9': 1721 lexNumber(token); 1722 return; 1723 case '.': 1724 range.popFront(); 1725 range.popFront(); 1726 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '.') 1727 { 1728 range.popFront(); 1729 token = Token(tok!"...", null, line, column, index); 1730 } 1731 else 1732 token = Token(tok!"..", null, line, column, index); 1733 return; 1734 default: 1735 range.popFront(); 1736 token = Token(tok!".", null, line, column, index); 1737 return; 1738 } 1739 } 1740 1741 void lexLongNewline(ref Token token) @nogc 1742 { 1743 mixin (tokenStart); 1744 range.popFront(); 1745 range.popFront(); 1746 range.popFront(); 1747 range.incrementLine(); 1748 string text = config.whitespaceBehavior == WhitespaceBehavior.include 1749 ? 
cache.intern(range.slice(mark)) : ""; 1750 token = Token(tok!"whitespace", text, line, 1751 column, index); 1752 } 1753 1754 bool isNewline() @nogc 1755 { 1756 if (range.bytes[range.index] == '\n') return true; 1757 if (range.bytes[range.index] == '\r') return true; 1758 return (range.bytes[range.index] & 0x80) && (range.index + 2 < range.bytes.length) 1759 && (range.peek(2) == "\u2028" || range.peek(2) == "\u2029"); 1760 } 1761 1762 bool isSeparating(size_t offset) @nogc 1763 { 1764 enum : ubyte 1765 { 1766 n, y, m // no, yes, maybe 1767 } 1768 1769 if (range.index + offset >= range.bytes.length) 1770 return true; 1771 auto c = range.bytes[range.index + offset]; 1772 static immutable ubyte[256] LOOKUP_TABLE = [ 1773 y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, 1774 y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, 1775 y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, 1776 n, n, n, n, n, n, n, n, n, n, y, y, y, y, y, y, 1777 y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, 1778 n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, n, 1779 y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, 1780 n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, y, 1781 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1782 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1783 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1784 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1785 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1786 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1787 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1788 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m 1789 ]; 1790 immutable ubyte result = LOOKUP_TABLE[c]; 1791 if (result == n) 1792 return false; 1793 if (result == y) 1794 return true; 1795 if (result == m) 1796 { 1797 auto r = range; 1798 range.popFrontN(offset); 1799 return (r.canPeek(2) && (r.peek(2) == "\u2028" 1800 || r.peek(2) == "\u2029")); 1801 } 1802 assert (false); 1803 } 1804 1805 1806 1807 enum tokenStart = q{ 1808 size_t index = range.index; 1809 size_t column = 
range.column; 1810 size_t line = range.line; 1811 auto mark = range.mark(); 1812 }; 1813 1814 void error(string message) 1815 { 1816 messages ~= Message(range.line, range.column, message, true); 1817 } 1818 1819 void warning(string message) 1820 { 1821 messages ~= Message(range.line, range.column, message, false); 1822 assert (messages.length > 0); 1823 } 1824 1825 static struct Message 1826 { 1827 size_t line; 1828 size_t column; 1829 string message; 1830 bool isError; 1831 } 1832 1833 Message[] messages; 1834 StringCache* cache; 1835 LexerConfig config; 1836 bool haveSSE42; 1837 } 1838 1839 /** 1840 * Creates a token range from the given source code. Creates a default lexer 1841 * configuration and a GC-managed string cache. 1842 */ 1843 public auto byToken(ubyte[] range) 1844 { 1845 LexerConfig config; 1846 StringCache* cache = new StringCache(StringCache.defaultBucketCount); 1847 return DLexer(range, config, cache); 1848 } 1849 1850 /** 1851 * Creates a token range from the given source code. Uses the given string 1852 * cache. 1853 */ 1854 public auto byToken(ubyte[] range, StringCache* cache) 1855 { 1856 LexerConfig config; 1857 return DLexer(range, config, cache); 1858 } 1859 1860 /** 1861 * Creates a token range from the given source code. Uses the provided lexer 1862 * configuration and string cache. 1863 */ 1864 public auto byToken(ubyte[] range, const LexerConfig config, StringCache* cache) 1865 { 1866 return DLexer(range, config, cache); 1867 } 1868 1869 /** 1870 * Removes "decoration" such as leading whitespace, leading + and * characters, 1871 * and places the result into the given output range 1872 */ 1873 public void unDecorateComment(T)(string comment, auto ref T outputRange) 1874 if (isOutputRange!(T, string)) 1875 in 1876 { 1877 assert (comment.length >= 3); 1878 } 1879 do 1880 { 1881 switch (comment[0 .. 
3]) 1882 { 1883 case "///": 1884 size_t i = 3; 1885 if (i < comment.length) 1886 { 1887 again: 1888 while (i < comment.length && (comment[i] == ' ' || comment[i] == '\t')) 1889 i++; 1890 size_t j = i + 1; 1891 while (j < comment.length) 1892 { 1893 if (comment[j] == '\r') 1894 j++; 1895 if (j >= comment.length) 1896 break; 1897 if (comment[j] == '\n') 1898 { 1899 outputRange.put(comment[i .. j]); 1900 j++; 1901 while (j < comment.length && comment[j] == '/') 1902 j++; 1903 outputRange.put('\n'); 1904 i = j; 1905 goto again; 1906 } 1907 j++; 1908 } 1909 if (i < comment.length && j <= comment.length) 1910 outputRange.put(comment[i .. j]); 1911 } 1912 break; 1913 case "/++": 1914 case "/**": 1915 size_t i = 3; 1916 immutable char c = comment[1]; 1917 // Skip leading * and + characters 1918 while (comment[i] == c) i++; 1919 // Skip trailing * and + characters 1920 size_t j = comment.length - 2; 1921 while (j > i && comment[j] == c) 1922 j--; 1923 while (j > i && (comment[j] == ' ' || comment[j] == '\t')) 1924 j--; 1925 j++; 1926 size_t k = i; 1927 while (k < j) 1928 { 1929 if (comment[k] == '\n') 1930 { 1931 k++; 1932 break; 1933 } 1934 k++; 1935 } 1936 outputRange.put(comment[i .. k]); 1937 i = k; 1938 if (comment[i] == '\r') i++; 1939 if (comment[i] == '\n') i++; 1940 while (comment[i] == ' ' || comment[i] == '\t') i++; 1941 immutable bool skipBeginningChar = comment[i] == c; 1942 if (skipBeginningChar) 1943 i++; 1944 size_t whitespaceToSkip; 1945 while (comment[i] == ' ' || comment[i] == '\t') 1946 { 1947 whitespaceToSkip++; 1948 i++; 1949 } 1950 size_t l = i; 1951 while (i < j) 1952 { 1953 if (comment[i++] == '\n') 1954 break; 1955 } 1956 outputRange.put(comment[l .. 
i]); 1957 while (true) 1958 { 1959 if (skipBeginningChar) 1960 { 1961 while (i < j && (comment[i] == ' ' || comment[i] == '\t')) i++; 1962 if (i < j && comment[i] == c) i++; 1963 } 1964 for (size_t s = 0; (i < j) && (s < whitespaceToSkip) 1965 && (comment[i] == ' ' || comment[i] == '\t');) 1966 { 1967 s++; 1968 i++; 1969 } 1970 k = i; 1971 inner: while (k < j) 1972 { 1973 if (comment[k] == '\n') 1974 { 1975 k++; 1976 break inner; 1977 } 1978 k++; 1979 } 1980 outputRange.put(comment[i .. k]); 1981 i = k; 1982 if (i >= j) 1983 break; 1984 } 1985 break; 1986 default: 1987 outputRange.put(comment); 1988 break; 1989 } 1990 } 1991 1992 1993 /** 1994 * The string cache is used for string interning. 1995 * 1996 * It will only store a single copy of any string that it is asked to hold. 1997 * Interned strings can be compared for equality by comparing their $(B .ptr) 1998 * field. 1999 * 2000 * Default and postbilt constructors are disabled. When a StringCache goes out 2001 * of scope, the memory held by it is freed. 2002 * 2003 * See_also: $(LINK http://en.wikipedia.org/wiki/String_interning) 2004 */ 2005 struct StringCache 2006 { 2007 public pure nothrow @nogc: 2008 2009 @disable this(); 2010 @disable this(this); 2011 2012 /** 2013 * Params: bucketCount = the initial number of buckets. Must be a 2014 * power of two 2015 */ 2016 this(size_t bucketCount) nothrow @trusted @nogc 2017 in 2018 { 2019 import core.bitop : popcnt; 2020 static if (size_t.sizeof == 8) 2021 { 2022 immutable low = popcnt(cast(uint) bucketCount); 2023 immutable high = popcnt(cast(uint) (bucketCount >> 32)); 2024 assert ((low == 0 && high == 1) || (low == 1 && high == 0)); 2025 } 2026 else 2027 { 2028 static assert (size_t.sizeof == 4); 2029 assert (popcnt(cast(uint) bucketCount) == 1); 2030 } 2031 } 2032 do 2033 { 2034 buckets = (cast(Node**) calloc((Node*).sizeof, bucketCount))[0 .. 
bucketCount]; 2035 } 2036 2037 void freeItAll() 2038 { 2039 Block* current = rootBlock; 2040 while (current !is null) 2041 { 2042 Block* prev = current; 2043 current = current.next; 2044 free(cast(void*) prev); 2045 } 2046 foreach (nodePointer; buckets) 2047 { 2048 Node* currentNode = nodePointer; 2049 while (currentNode !is null) 2050 { 2051 if (currentNode.mallocated) 2052 free(currentNode.str.ptr); 2053 Node* prev = currentNode; 2054 currentNode = currentNode.next; 2055 free(prev); 2056 } 2057 } 2058 rootBlock = null; 2059 free(buckets.ptr); 2060 buckets = null; 2061 } 2062 2063 /** 2064 * Caches a string. 2065 */ 2066 string intern(const(ubyte)[] str) @safe 2067 { 2068 if (str is null || str.length == 0) 2069 return ""; 2070 return _intern(str); 2071 } 2072 2073 /** 2074 * ditto 2075 */ 2076 string intern(string str) @trusted 2077 { 2078 return intern(cast(ubyte[]) str); 2079 } 2080 2081 /** 2082 * The default bucket count for the string cache. 2083 */ 2084 static enum defaultBucketCount = 4096; 2085 2086 private: 2087 2088 string _intern(const(ubyte)[] bytes) @trusted 2089 { 2090 immutable uint hash = hashBytes(bytes); 2091 immutable size_t index = hash & (buckets.length - 1); 2092 Node* s = find(bytes, hash); 2093 if (s !is null) 2094 return cast(string) s.str; 2095 ubyte[] mem = void; 2096 bool mallocated = bytes.length > BIG_STRING; 2097 if (mallocated) 2098 mem = (cast(ubyte*) malloc(bytes.length))[0 .. 
bytes.length]; 2099 else 2100 mem = allocate(bytes.length); 2101 mem[] = bytes[]; 2102 Node* node = cast(Node*) malloc(Node.sizeof); 2103 node.str = mem; 2104 node.hash = hash; 2105 node.next = buckets[index]; 2106 node.mallocated = mallocated; 2107 buckets[index] = node; 2108 return cast(string) mem; 2109 } 2110 2111 Node* find(const(ubyte)[] bytes, uint hash) @trusted 2112 { 2113 import std.algorithm : equal; 2114 immutable size_t index = hash & (buckets.length - 1); 2115 Node* node = buckets[index]; 2116 while (node !is null) 2117 { 2118 if (node.hash == hash && bytes == cast(ubyte[]) node.str) 2119 return node; 2120 node = node.next; 2121 } 2122 return node; 2123 } 2124 2125 static uint hashBytes(const(ubyte)[] data) pure nothrow @trusted @nogc 2126 in 2127 { 2128 assert (data !is null); 2129 assert (data.length > 0); 2130 } 2131 do 2132 { 2133 immutable uint m = 0x5bd1e995; 2134 immutable int r = 24; 2135 uint h = cast(uint) data.length; 2136 while (data.length >= 4) 2137 { 2138 uint k = (cast(ubyte) data[3]) << 24 2139 | (cast(ubyte) data[2]) << 16 2140 | (cast(ubyte) data[1]) << 8 2141 | (cast(ubyte) data[0]); 2142 k *= m; 2143 k ^= k >> r; 2144 k *= m; 2145 h *= m; 2146 h ^= k; 2147 data = data[4 .. 
$]; 2148 } 2149 switch (data.length & 3) 2150 { 2151 case 3: 2152 h ^= data[2] << 16; 2153 goto case; 2154 case 2: 2155 h ^= data[1] << 8; 2156 goto case; 2157 case 1: 2158 h ^= data[0]; 2159 h *= m; 2160 break; 2161 default: 2162 break; 2163 } 2164 h ^= h >> 13; 2165 h *= m; 2166 h ^= h >> 15; 2167 return h; 2168 } 2169 2170 ubyte[] allocate(size_t numBytes) pure nothrow @trusted @nogc 2171 in 2172 { 2173 assert (numBytes != 0); 2174 } 2175 out (result) 2176 { 2177 assert (result.length == numBytes); 2178 } 2179 do 2180 { 2181 Block* r = rootBlock; 2182 size_t i = 0; 2183 while (i <= 3 && r !is null) 2184 { 2185 immutable size_t available = r.bytes.length; 2186 immutable size_t oldUsed = r.used; 2187 immutable size_t newUsed = oldUsed + numBytes; 2188 if (newUsed <= available) 2189 { 2190 r.used = newUsed; 2191 return r.bytes[oldUsed .. newUsed]; 2192 } 2193 i++; 2194 r = r.next; 2195 } 2196 Block* b = cast(Block*) calloc(Block.sizeof, 1); 2197 b.used = numBytes; 2198 b.next = rootBlock; 2199 rootBlock = b; 2200 return b.bytes[0 .. numBytes]; 2201 } 2202 2203 static struct Node 2204 { 2205 ubyte[] str = void; 2206 Node* next = void; 2207 uint hash = void; 2208 bool mallocated = void; 2209 } 2210 2211 static struct Block 2212 { 2213 Block* next; 2214 size_t used; 2215 enum BLOCK_CAPACITY = BLOCK_SIZE - size_t.sizeof - (void*).sizeof; 2216 ubyte[BLOCK_CAPACITY] bytes; 2217 } 2218 2219 static assert (BLOCK_SIZE == Block.sizeof); 2220 2221 enum BLOCK_SIZE = 1024 * 16; 2222 2223 // If a string would take up more than 1/4 of a block, allocate it outside 2224 // of the block. 
2225 enum BIG_STRING = BLOCK_SIZE / 4; 2226 2227 Node*[] buckets; 2228 Block* rootBlock; 2229 } 2230 2231 private extern(C) void* calloc(size_t, size_t) nothrow pure @nogc @trusted; 2232 private extern(C) void* malloc(size_t) nothrow pure @nogc @trusted; 2233 private extern(C) void free(void*) nothrow pure @nogc @trusted; 2234 2235 unittest 2236 { 2237 auto source = cast(ubyte[]) q{ import std.stdio;}}; 2238 auto tokens = getTokensForParser(source, LexerConfig(), 2239 new StringCache(StringCache.defaultBucketCount)); 2240 assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".", 2241 tok!"identifier", tok!";"])); 2242 } 2243 2244 /// Test \x char sequence 2245 unittest 2246 { 2247 auto toks = (string s) => byToken(cast(ubyte[])s); 2248 2249 // valid 2250 enum hex = ['0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f','A','B','C','D','E','F']; 2251 auto source = ""; 2252 foreach (h1; hex) 2253 foreach (h2; hex) 2254 source ~= "'\\x" ~ h1 ~ h2 ~ "'"; 2255 assert (toks(source).filter!(t => t.type != tok!"characterLiteral").empty); 2256 2257 // invalid 2258 assert (toks(`'\x'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true)); 2259 assert (toks(`'\x_'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true)); 2260 assert (toks(`'\xA'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true)); 2261 assert (toks(`'\xAY'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true)); 2262 assert (toks(`'\xXX'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true)); 2263 } 2264 2265 version (iasm64NotWindows) 2266 { 2267 /** 2268 * Returns: 2269 */ 2270 ushort newlineMask(const ubyte*) pure nothrow @trusted @nogc 2271 { 2272 asm pure nothrow @nogc 2273 { 2274 naked; 2275 movdqu XMM1, [RDI]; 2276 mov RAX, 3; 2277 mov RDX, 16; 2278 mov R8, 0x0d0d0d0d0d0d0d0dL; 2279 movq XMM2, R8; 2280 shufpd XMM2, XMM2, 0; 2281 pcmpeqb XMM2, XMM1; 2282 mov R9, 
0x0a0a0a0a0a0a0a0aL; 2283 movq XMM3, R9; 2284 shufpd XMM3, XMM3, 0; 2285 pcmpeqb XMM3, XMM1; 2286 mov R10, 0xe280a8L; 2287 movq XMM4, R10; 2288 pcmpestrm XMM4, XMM1, 0b01001100; 2289 movdqa XMM4, XMM0; 2290 mov R11, 0xe280a9L; 2291 movq XMM5, R11; 2292 pcmpestrm XMM5, XMM1, 0b01001100; 2293 movdqa XMM5, XMM0; 2294 mov RCX, 0x0a0d; 2295 dec RAX; 2296 movq XMM6, RCX; 2297 pcmpestrm XMM6, XMM1, 0b01001100; 2298 movdqa XMM6, XMM0; 2299 movdqa XMM7, XMM6; 2300 pslldq XMM7, 1; 2301 movdqa XMM0, XMM4; 2302 por XMM0, XMM5; 2303 por XMM7, XMM6; 2304 movdqa XMM1, XMM2; 2305 por XMM1, XMM3; 2306 pxor XMM7, XMM1; 2307 por XMM7, XMM0; 2308 por XMM7, XMM6; 2309 pmovmskb RAX, XMM7; 2310 and RAX, 0b0011_1111_1111_1111; 2311 ret; 2312 } 2313 } 2314 2315 /** 2316 * Skips between 0 and 16 bytes that match (or do not match) one of the 2317 * given $(B chars). 2318 */ 2319 void skip(bool matching, chars...)(const ubyte*, ulong*, ulong*) pure nothrow 2320 @trusted @nogc if (chars.length <= 8) 2321 { 2322 enum constant = ByteCombine!chars; 2323 enum charsLength = chars.length; 2324 static if (matching) 2325 enum flags = 0b0001_0000; 2326 else 2327 enum flags = 0b0000_0000; 2328 asm pure nothrow @nogc 2329 { 2330 naked; 2331 movdqu XMM1, [RDX]; 2332 mov R10, constant; 2333 movq XMM2, R10; 2334 mov RAX, charsLength; 2335 mov RDX, 16; 2336 pcmpestri XMM2, XMM1, flags; 2337 add [RSI], RCX; 2338 add [RDI], RCX; 2339 ret; 2340 } 2341 } 2342 2343 /** 2344 * Returns: the number of bytes starting at the given location that match 2345 * (or do not match if $(B invert) is true) the byte ranges in $(B chars). 
2346 */ 2347 ulong rangeMatch(bool invert, chars...)(const ubyte*) pure nothrow @trusted @nogc 2348 { 2349 static assert (chars.length % 2 == 0); 2350 enum constant = ByteCombine!chars; 2351 static if (invert) 2352 enum rangeMatchFlags = 0b0000_0100; 2353 else 2354 enum rangeMatchFlags = 0b0001_0100; 2355 enum charsLength = chars.length; 2356 asm pure nothrow @nogc 2357 { 2358 naked; 2359 movdqu XMM1, [RDI]; 2360 mov R10, constant; 2361 movq XMM2, R10; 2362 mov RAX, charsLength; 2363 mov RDX, 16; 2364 pcmpestri XMM2, XMM1, rangeMatchFlags; 2365 mov RAX, RCX; 2366 ret; 2367 } 2368 } 2369 2370 template ByteCombine(c...) 2371 { 2372 static assert (c.length <= 8); 2373 static if (c.length > 1) 2374 enum ulong ByteCombine = c[0] | (ByteCombine!(c[1..$]) << 8); 2375 else 2376 enum ulong ByteCombine = c[0]; 2377 } 2378 }