module dparse.lexer;

import std.typecons;
import std.typetuple;
import std.array;
import std.algorithm;
import std.range;
import std.experimental.lexer;
import core.cpuid : sse42;
// iasm64NotWindows is defined when x86-64 inline assembly is available and the
// target is not Windows. The lexer's SSE 4.2 fast paths are compiled only
// under this version identifier.
version (D_InlineAsm_X86_64)
{
    version (Windows) {}
    else version = iasm64NotWindows;
}

/// Operators
private enum operators = [
    ",", ".", "..", "...", "/", "/=", "!", "!<", "!<=", "!<>", "!<>=", "!=",
    "!>", "!>=", "$", "%", "%=", "&", "&&", "&=", "(", ")", "*", "*=", "+", "++",
    "+=", "-", "--", "-=", ":", ";", "<", "<<", "<<=", "<=", "<>", "<>=", "=",
    "==", "=>", ">", ">=", ">>", ">>=", ">>>", ">>>=", "?", "@", "[", "]", "^",
    "^=", "^^", "^^=", "{", "|", "|=", "||", "}", "~", "~="
];

/// Keywords
private enum keywords = [
    "abstract", "alias", "align", "asm", "assert", "auto", "body", "bool",
    "break", "byte", "case", "cast", "catch", "cdouble", "cent", "cfloat",
    "char", "class", "const", "continue", "creal", "dchar", "debug", "default",
    "delegate", "delete", "deprecated", "do", "double", "else", "enum",
    "export", "extern", "false", "final", "finally", "float", "for", "foreach",
    "foreach_reverse", "function", "goto", "idouble", "if", "ifloat",
    "immutable", "import", "in", "inout", "int", "interface", "invariant",
    "ireal", "is", "lazy", "long", "macro", "mixin", "module", "new", "nothrow",
    "null", "out", "override", "package", "pragma", "private", "protected",
    "public", "pure", "real", "ref", "return", "scope", "shared", "short",
    "static", "struct", "super", "switch", "synchronized", "template", "this",
    "throw", "true", "try", "typedef", "typeid", "typeof", "ubyte", "ucent",
    "uint", "ulong", "union", "unittest", "ushort", "version", "void",
    "volatile", "wchar", "while", "with", "__DATE__", "__EOF__", "__FILE__",
    "__FUNCTION__", "__gshared", "__LINE__", "__MODULE__", "__parameters",
    "__PRETTY_FUNCTION__", "__TIME__", "__TIMESTAMP__", "__traits", "__vector",
    "__VENDOR__", "__VERSION__"
];

/// Other tokens
private enum dynamicTokens = [
    "specialTokenSequence", "comment", "identifier", "scriptLine",
    "whitespace", "doubleLiteral", "floatLiteral", "idoubleLiteral",
    "ifloatLiteral", "intLiteral", "longLiteral", "realLiteral",
    "irealLiteral", "uintLiteral", "ulongLiteral", "characterLiteral",
    "dstringLiteral", "stringLiteral", "wstringLiteral"
];

// Flat list of pairs: a source-text prefix followed by the name of the DLexer
// member function that lexes tokens starting with that prefix. The list is
// passed to the Lexer mixin in DLexer.
// NOTE(review): the dispatch itself lives in std.experimental.lexer, which is
// not part of this file - confirm the exact matching rules there.
private enum pseudoTokenHandlers = [
    "\"", "lexStringLiteral",
    "`", "lexWysiwygString",
    "//", "lexSlashSlashComment",
    "/*", "lexSlashStarComment",
    "/+", "lexSlashPlusComment",
    ".", "lexDot",
    "'", "lexCharacterLiteral",
    "0", "lexNumber",
    "1", "lexDecimal",
    "2", "lexDecimal",
    "3", "lexDecimal",
    "4", "lexDecimal",
    "5", "lexDecimal",
    "6", "lexDecimal",
    "7", "lexDecimal",
    "8", "lexDecimal",
    "9", "lexDecimal",
    "q\"", "lexDelimitedString",
    "q{", "lexTokenString",
    "r\"", "lexWysiwygString",
    "x\"", "lexHexString",
    " ", "lexWhitespace",
    "\t", "lexWhitespace",
    "\r", "lexWhitespace",
    "\n", "lexWhitespace",
    "\v", "lexWhitespace",
    "\f", "lexWhitespace",
    "\u2028", "lexLongNewline",
    "\u2029", "lexLongNewline",
    "#!", "lexScriptLine",
    "#line", "lexSpecialTokenSequence"
];

/// Token ID type for the D lexer.
public alias IdType = TokenIdType!(operators, dynamicTokens, keywords);

/**
 * Function used for converting an IdType to a string.
 *
 * Examples:
 * ---
 * IdType c = tok!"case";
 * assert (str(c) == "case");
 * ---
 */
public alias str = tokenStringRepresentation!(IdType, operators, dynamicTokens, keywords);

/**
 * Template used to refer to D token types.
 *
 * See the $(B operators), $(B keywords), and $(B dynamicTokens) enums for
 * values that can be passed to this template.
* Example:
 * ---
 * import dparse.lexer;
 * IdType t = tok!"floatLiteral";
 * ---
 */
public template tok(string token)
{
    alias tok = TokenId!(IdType, operators, dynamicTokens, keywords, token);
}

// Source text of the extra members given to the Token type below: two comment
// strings (filled in by getTokensForParser) and opCmp overloads that order
// tokens by their `index` field.
private enum extraFields = q{
    string comment;
    string trailingComment;

    int opCmp(size_t i) const pure nothrow @safe {
        if (index < i) return -1;
        if (index > i) return 1;
        return 0;
    }

    int opCmp(ref const typeof(this) other) const pure nothrow @safe {
        return opCmp(other.index);
    }
};

/// The token type in the D lexer
public alias Token = std.experimental.lexer.TokenStructure!(IdType, extraFields);

/**
 * Configure whitespace handling
 */
public enum WhitespaceBehavior : ubyte
{
    /// Whitespace tokens are produced and carry their source text
    include = 0b0000_0000,
    /// Whitespace tokens are skipped by DLexer.popFront
    skip = 0b0000_0001,
}

/**
 * Configure string lexing behavior
 */
public enum StringBehavior : ubyte
{
    /// Do not include quote characters, process escape sequences
    compiler = 0b0000_0000,
    /// Opening quotes, closing quotes, and string suffixes are included in the
    /// string token
    includeQuoteChars = 0b0000_0001,
    /// String escape sequences are not replaced
    notEscaped = 0b0000_0010,
    /// Not modified at all. Useful for formatters or highlighters
    source = includeQuoteChars | notEscaped
}

/**
 * Lexer configuration struct
 */
public struct LexerConfig
{
    /// Name of the file being lexed.
    /// NOTE(review): not read anywhere in the visible part of this module.
    string fileName;
    /// How string literal tokens are represented
    StringBehavior stringBehavior;
    /// Whether whitespace tokens are produced or skipped
    WhitespaceBehavior whitespaceBehavior;
}

/**
 * Returns: true if the given ID is for a basic type.
*/
public bool isBasicType(IdType type) nothrow pure @safe @nogc
{
    switch (type)
    {
    // integral types
    case tok!"byte", tok!"ubyte", tok!"short", tok!"ushort", tok!"int",
        tok!"uint", tok!"long", tok!"ulong", tok!"cent", tok!"ucent":
    // character types, bool and void
    case tok!"char", tok!"wchar", tok!"dchar", tok!"bool", tok!"void":
    // real, imaginary and complex floating point types
    case tok!"float", tok!"double", tok!"real":
    case tok!"ifloat", tok!"idouble", tok!"ireal":
    case tok!"cfloat", tok!"cdouble", tok!"creal":
        return true;
    default:
        return false;
    }
}

/**
 * Returns: true if the given ID type is for a number literal.
 */
public bool isNumberLiteral(IdType type) nothrow pure @safe @nogc
{
    switch (type)
    {
    // integer literals
    case tok!"intLiteral", tok!"uintLiteral", tok!"longLiteral", tok!"ulongLiteral":
    // floating point literals
    case tok!"floatLiteral", tok!"doubleLiteral", tok!"realLiteral":
    // imaginary literals
    case tok!"ifloatLiteral", tok!"idoubleLiteral", tok!"irealLiteral":
        return true;
    default:
        return false;
    }
}

/**
 * Returns: true if the given ID type is for an operator.
*/
public bool isOperator(IdType type) nothrow pure @safe @nogc
{
    // One label per entry of the `operators` list, in the same order.
    switch (type)
    {
    case tok!",", tok!".", tok!"..", tok!"...", tok!"/", tok!"/=":
    case tok!"!", tok!"!<", tok!"!<=", tok!"!<>", tok!"!<>=", tok!"!=":
    case tok!"!>", tok!"!>=", tok!"$", tok!"%", tok!"%=":
    case tok!"&", tok!"&&", tok!"&=", tok!"(", tok!")":
    case tok!"*", tok!"*=", tok!"+", tok!"++", tok!"+=":
    case tok!"-", tok!"--", tok!"-=", tok!":", tok!";":
    case tok!"<", tok!"<<", tok!"<<=", tok!"<=", tok!"<>", tok!"<>=":
    case tok!"=", tok!"==", tok!"=>":
    case tok!">", tok!">=", tok!">>", tok!">>=", tok!">>>", tok!">>>=":
    case tok!"?", tok!"@", tok!"[", tok!"]":
    case tok!"^", tok!"^=", tok!"^^", tok!"^^=":
    case tok!"{", tok!"|", tok!"|=", tok!"||", tok!"}":
    case tok!"~", tok!"~=":
        return true;
    default:
        return false;
    }
}

/**
 * Returns: true if the given ID type is for a keyword.
*/
public bool isKeyword(IdType type) pure nothrow @safe @nogc
{
    // Basic type names (int, char, void, ...) are deliberately absent here;
    // they are reported by isBasicType instead.
    switch (type)
    {
    case tok!"abstract", tok!"alias", tok!"align", tok!"asm", tok!"assert", tok!"auto":
    case tok!"body", tok!"break", tok!"case", tok!"cast", tok!"catch", tok!"class":
    case tok!"const", tok!"continue", tok!"debug", tok!"default", tok!"delegate", tok!"delete":
    case tok!"deprecated", tok!"do", tok!"else", tok!"enum", tok!"export", tok!"extern":
    case tok!"false", tok!"final", tok!"finally", tok!"for", tok!"foreach", tok!"foreach_reverse":
    case tok!"function", tok!"goto", tok!"if", tok!"immutable", tok!"import", tok!"in":
    case tok!"inout", tok!"interface", tok!"invariant", tok!"is", tok!"lazy", tok!"macro":
    case tok!"mixin", tok!"module", tok!"new", tok!"nothrow", tok!"null", tok!"out":
    case tok!"override", tok!"package", tok!"pragma", tok!"private", tok!"protected", tok!"public":
    case tok!"pure", tok!"ref", tok!"return", tok!"scope", tok!"shared", tok!"static":
    case tok!"struct", tok!"super", tok!"switch", tok!"synchronized", tok!"template", tok!"this":
    case tok!"throw", tok!"true", tok!"try", tok!"typedef", tok!"typeid", tok!"typeof":
    case tok!"union", tok!"unittest", tok!"version", tok!"volatile", tok!"while", tok!"with":
    case tok!"__DATE__", tok!"__EOF__", tok!"__FILE__", tok!"__FUNCTION__", tok!"__gshared", tok!"__LINE__":
    case tok!"__MODULE__", tok!"__parameters", tok!"__PRETTY_FUNCTION__", tok!"__TIME__",
        tok!"__TIMESTAMP__", tok!"__traits":
    case tok!"__vector", tok!"__VENDOR__", tok!"__VERSION__":
        return true;
    default:
        return false;
    }
}

/**
 * Returns: true if the given ID type is for a string literal.
 */
public bool isStringLiteral(IdType type) pure nothrow @safe @nogc
{
    return type == tok!"dstringLiteral"
        || type == tok!"stringLiteral"
        || type == tok!"wstringLiteral";
}

/**
 * Returns: true if the given ID type is for a protection attribute.
 */
public bool isProtection(IdType type) pure nothrow @safe @nogc
{
    return type == tok!"export"
        || type == tok!"package"
        || type == tok!"private"
        || type == tok!"public"
        || type == tok!"protected";
}

/**
 * Returns: an array of tokens lexed from the given source code. All
 * whitespace tokens are skipped and comments are attached to the token nearest
 * to them.
*/
const(Token)[] getTokensForParser(ubyte[] sourceCode, LexerConfig config,
    StringCache* cache)
{
    // Classification of a comment by its first three characters.
    enum CommentType : ubyte
    {
        notDoc,
        line,
        block
    }

    static CommentType commentType(string comment) pure nothrow @safe
    {
        if (comment.length < 3)
            return CommentType.notDoc;
        if (comment[0 ..3] == "///")
            return CommentType.line;
        if (comment[0 ..3] == "/++" || comment[0 ..3] == "/**")
            return CommentType.block;
        return CommentType.notDoc;
    }

    // The parser never sees whitespace, whatever the caller configured.
    config.whitespaceBehavior = WhitespaceBehavior.skip;

    auto output = appender!(typeof(return))();
    auto lexer = DLexer(sourceCode, config, cache);
    // Documentation comment waiting to be attached to the next emitted token.
    string blockComment;
    // Number of tokens put into `output` so far.
    size_t tokenCount;
    loop: while (!lexer.empty) switch (lexer.front.type)
    {
    case tok!"specialTokenSequence":
    case tok!"whitespace":
        lexer.popFront();
        break;
    case tok!"comment":
        final switch (commentType(lexer.front.text))
        {
        case CommentType.block:
            // A doc comment on the same line as the previously emitted token
            // trails that token; otherwise it documents the next one.
            if (tokenCount > 0 && lexer.front.line == output.data[tokenCount - 1].line)
            {
                (cast() output.data[tokenCount - 1]).trailingComment = lexer.front.text;
            }
            else
            {
                blockComment = lexer.front.text;
            }
            lexer.popFront();
            break;
        case CommentType.line:
            if (tokenCount > 0 && lexer.front.line == output.data[tokenCount - 1].line)
            {
                (cast() output.data[tokenCount - 1]).trailingComment = lexer.front.text;
            }
            else
            {
                string c = lexer.front.text[3 .. $]; // just take the /// off entirely
                if(blockComment.length == 0) {
                    blockComment = "/++" ~ c ~ "\n+/"; // just rewrite to this
                } else {
                    // Merge this line into the pending comment by inserting it
                    // in front of the pending comment's final line (or, for a
                    // one-line comment, in front of its closing delimiter).
                    import std.string;
                    auto l = blockComment.lastIndexOf("\n");
                    string replacement;
                    if(l != -1) {
                        replacement = blockComment[l .. $];
                        blockComment = blockComment[0 .. l + 1];
                    } else {
                        replacement = blockComment[$-2 .. $];
                        blockComment = blockComment[0 .. $-2]; // just cut off the */ or +/
                    }
                    if(blockComment[0 .. 3] == "/**")
                        blockComment ~= c ~ replacement;
                    else if(blockComment[0 .. 3] == "/++")
                        blockComment ~= c ~ replacement;
                    else assert(0);

                }
            }
            lexer.popFront();
            break;
        case CommentType.notDoc:
            lexer.popFront();
            break;
        }
        break;
    case tok!"__EOF__":
        break loop;
    default:
        // Ordinary token: attach the pending doc comment, then clear it.
        Token t = lexer.front;
        lexer.popFront();
        tokenCount++;
        t.comment = blockComment;
        blockComment = null;
        output.put(t);
        break;
    }
    return output.data;
}

/**
 * The D lexer struct.
 */
public struct DLexer
{
    mixin Lexer!(Token, lexIdentifier, isSeparating, operators, dynamicTokens,
        keywords, pseudoTokenHandlers);

    ///
    @disable this();

    /**
     * Params:
     *     range = the bytes that compose the source code that will be lexed.
     *     config = the lexer configuration to use.
     *     cache = the string interning cache for de-duplicating identifiers and
     *         other token text.
     *     haveSSE42 = whether the SSE 4.2 fast paths may be used. Defaults to
     *         what core.cpuid reports for the current CPU.
     */
    this(ubyte[] range, const LexerConfig config, StringCache* cache,
        bool haveSSE42 = sse42()) pure nothrow @safe
    {
        this.haveSSE42 = haveSSE42;
        // Drop a leading UTF-8 byte order mark (EF BB BF) if there is one.
        auto r = (range.length >= 3 && range[0] == 0xef && range[1] == 0xbb && range[2] == 0xbf)
            ? range[3 .. $] : range;
        this.range = LexerRange(r);
        this.config = config;
        this.cache = cache;
        popFront();
    }

    /// Advances to the next token. When the configuration asks for whitespace
    /// to be skipped, keeps advancing past whitespace tokens.
    public void popFront()() pure nothrow @safe
    {
        do
            _popFront();
        while (config.whitespaceBehavior == WhitespaceBehavior.skip
            && _front.type == tok!"whitespace");
    }

private pure nothrow @safe:

    /**
     * Returns: true if the byte at the current position starts a whitespace
     * character: space, CR, LF, tab, vertical tab, form feed, or the UTF-8
     * encoding of U+2028 / U+2029 (E2 80 A8 / E2 80 A9).
     */
    bool isWhitespace()
    {
        switch (range.bytes[range.index])
        {
        case ' ':
        case '\r':
        case '\n':
        case '\t':
        case '\v':
        case '\f':
            return true;
        case 0xe2:
            // Look at the two bytes after the E2 lead byte directly, the same
            // way lexWhitespace does. The previous code compared the wrong
            // offsets of a peeked slice and so never matched.
            return range.index + 2 < range.bytes.length
                && range.bytes[range.index + 1] == 0x80
                && (range.bytes[range.index + 2] == 0xa8
                    || range.bytes[range.index + 2] == 0xa9);
        default:
            return false;
        }
    }

    /**
     * Advances past one character, updating the line count when that
     * character is a line ending (CR, LF, CR LF, U+2028 or U+2029).
     */
    void popFrontWhitespaceAware()
    {
        switch (range.bytes[range.index])
        {
        case '\r':
            range.popFront();
            // CR LF counts as a single line ending.
            if (range.index < range.bytes.length && range.bytes[range.index] == '\n')
                range.popFront();
            range.incrementLine();
            return;
        case '\n':
            range.popFront();
            range.incrementLine();
            return;
        case 0xe2:
            // E2 80 A8 / E2 80 A9. Checked by direct indexing so the separator
            // is recognised anywhere in the input, not only when exactly three
            // bytes remain.
            if (range.index + 2 < range.bytes.length
                && range.bytes[range.index + 1] == 0x80
                && (range.bytes[range.index + 2] == 0xa8
                    || range.bytes[range.index + 2] == 0xa9))
            {
                range.index += 3;
                range.column += 3;
                range.incrementLine();
                return;
            }
            else
            {
                range.popFront();
                return;
            }
        default:
            range.popFront();
            return;
        }
    }

    void lexWhitespace(ref Token token) @trusted
    {
        mixin (tokenStart);
        loop: do
        {
            version (iasm64NotWindows)
            {
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    skip!(true, '\t', ' ', '\v', '\f')(range.bytes.ptr + range.index,
                        &range.index, &range.column);
                }
            }
            switch (range.bytes[range.index])
            {
            case '\r':
                range.popFront();
                if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n')
                {
                    range.popFront();
                }
range.column = 1;
                range.line += 1;
                break;
            case '\n':
                range.popFront();
                range.column = 1;
                range.line += 1;
                break;
            case ' ':
            case '\t':
            case '\v':
            case '\f':
                range.popFront();
                break;
            case 0xe2:
                // E2 80 A8 and E2 80 A9 are the UTF-8 encodings of U+2028 and
                // U+2029; any other sequence starting with E2 ends the run.
                if (range.index + 2 >= range.bytes.length)
                    break loop;
                if (range.bytes[range.index + 1] != 0x80)
                    break loop;
                if (range.bytes[range.index + 2] == 0xa8 || range.bytes[range.index + 2] == 0xa9)
                {
                    range.index += 3;
                    range.column += 3;
                    range.column = 1;
                    range.line += 1;
                    break;
                }
                break loop;
            default:
                break loop;
            }
        } while (!(range.index >= range.bytes.length));
    end:
        // The text is only interned when whitespace tokens are being kept.
        string text = config.whitespaceBehavior == WhitespaceBehavior.include
            ? cache.intern(range.slice(mark)) : "";
        token = Token(tok!"whitespace", text, line, column, index);
    }

    /**
     * Lexes a numeric literal that starts with '0'. A following 'x'/'X' or
     * 'b'/'B' selects lexHex or lexBinary (after skipping the two-character
     * prefix); anything else is handled by lexDecimal.
     */
    void lexNumber(ref Token token)
    {
        mixin (tokenStart);
        if (range.bytes[range.index] == '0' && range.index + 1 < range.bytes.length)
        {
            auto ahead = range.bytes[range.index + 1];
            switch (ahead)
            {
            case 'x':
            case 'X':
                range.index += 2;
                range.column += 2;
                lexHex(token, mark, line, column, index);
                return;
            case 'b':
            case 'B':
                range.index += 2;
                range.column += 2;
                lexBinary(token, mark, line, column, index);
                return;
            default:
                lexDecimal(token, mark, line, column, index);
                return;
            }
        }
        else
            lexDecimal(token, mark, line, column, index);
    }

    /// Lexes hexadecimal digits, taking the token start from the current position.
    void lexHex(ref Token token)
    {
        mixin (tokenStart);
        lexHex(token, mark, line, column, index);
    }

    /**
     * Lexes the digits and suffix of a hexadecimal literal.
     *
     * Params:
     *     token = receives the resulting token
     *     mark = start of the token text, as used by range.slice
     *     line = line stored in the token
     *     column = column stored in the token
     *     index = index stored in the token
     */
    void lexHex(ref Token token, size_t mark, size_t line, size_t column,
        size_t index) @trusted
    {
        IdType type = tok!"intLiteral";
        bool foundDot;
        hexLoop: while (!(range.index >= range.bytes.length))
        {
            switch (range.bytes[range.index])
            {
            case 'a': .. case 'f':
            case 'A': .. case 'F':
            case '0': .. case '9':
            case '_':
                version (iasm64NotWindows)
                {
                    // Fast path, taken only when more than 16 bytes remain.
                    if (haveSSE42 && range.index + 16 < range.bytes.length)
                    {
                        immutable ulong i = rangeMatch!(false, '0', '9', 'a', 'f', 'A', 'F', '_', '_')
                            (range.bytes.ptr + range.index);
                        range.column += i;
                        range.index += i;
                    }
                    else
                        range.popFront();
                }
                else
                    range.popFront();
                break;
            case 'u':
            case 'U':
                lexIntSuffix(type);
                break hexLoop;
            case 'i':
                if (foundDot)
                    lexFloatSuffix(type);
                break hexLoop;
            case 'L':
                if (foundDot)
                    lexFloatSuffix(type);
                else
                    lexIntSuffix(type);
                break hexLoop;
            case 'p':
            case 'P':
                lexExponent(type);
                break hexLoop;
            case '.':
                // A second dot, a dot at the end of input, or ".." ends the literal.
                if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.')
                    break hexLoop;
                else
                {
                    // The following bit of silliness tries to tell the
                    // difference between "int dot identifier" and
                    // "double identifier".
                    if ((range.index + 1 < range.bytes.length))
                    {
                        switch (range.peekAt(1))
                        {
                        case '0': .. case '9':
                        case 'A': .. case 'F':
                        case 'a': .. case 'f':
                            goto doubleLiteral;
                        default:
                            break hexLoop;
                        }
                    }
                    else
                    {
                        // Reached only through the goto above.
                    doubleLiteral:
                        range.popFront();
                        foundDot = true;
                        type = tok!"doubleLiteral";
                    }
                }
                break;
            default:
                break hexLoop;
            }
        }
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /// Lexes binary digits, taking the token start from the current position.
    void lexBinary(ref Token token)
    {
        mixin (tokenStart);
        return lexBinary(token, mark, line, column, index);
    }

    /**
     * Lexes the digits ('0', '1', '_') and integer suffix of a binary literal.
     *
     * Params:
     *     token = receives the resulting token
     *     mark = start of the token text, as used by range.slice
     *     line = line stored in the token
     *     column = column stored in the token
     *     index = index stored in the token
     */
    void lexBinary(ref Token token, size_t mark, size_t line, size_t column,
        size_t index) @trusted
    {
        IdType type = tok!"intLiteral";
        binaryLoop: while (!(range.index >= range.bytes.length))
        {
            switch (range.bytes[range.index])
            {
            case '0':
            case '1':
            case '_':
                version (iasm64NotWindows)
                {
                    if (haveSSE42 && range.index + 16 < range.bytes.length)
                    {
                        immutable ulong i = rangeMatch!(false, '0', '1', '_', '_')(
                            range.bytes.ptr + range.index);
                        range.column += i;
                        range.index += i;
                    }
                    else
                        range.popFront();
                }
                else
                    range.popFront();
                break;
            case 'u':
            case 'U':
            case 'L':
                lexIntSuffix(type);
                break binaryLoop;
            default:
                break binaryLoop;
            }
        }
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /// Lexes a decimal literal, taking the token start from the current position.
    void lexDecimal(ref Token token)
    {
        mixin (tokenStart);
        lexDecimal(token, mark, line, column, index);
    }

    /**
     * Lexes the digits, optional fraction, exponent and suffix of a decimal
     * literal. The current byte may be a leading '.', in which case the
     * literal starts out as a double.
     *
     * Params:
     *     token = receives the resulting token
     *     mark = start of the token text, as used by range.slice
     *     line = line stored in the token
     *     column = column stored in the token
     *     index = index stored in the token
     */
    void lexDecimal(ref Token token, size_t mark, size_t line, size_t column,
        size_t index) @trusted
    {
        bool foundDot = range.bytes[range.index] == '.';
        IdType type = tok!"intLiteral";
        if (foundDot)
        {
            range.popFront();
            type = tok!"doubleLiteral";
        }

        decimalLoop: while (!(range.index >= range.bytes.length))
        {
            switch (range.bytes[range.index])
            {
            case '0': .. case '9':
            case '_':
                version (iasm64NotWindows)
                {
                    if (haveSSE42 && range.index + 16 < range.bytes.length)
                    {
                        ulong i = rangeMatch!(false, '0', '9', '_', '_')(range.bytes.ptr + range.index);
                        range.column += i;
                        range.index += i;
                    }
                    else
                        range.popFront();
                }
                else
                    range.popFront();
                break;
            case 'u':
            case 'U':
                if (!foundDot)
                    lexIntSuffix(type);
                break decimalLoop;
            case 'i':
                lexFloatSuffix(type);
                break decimalLoop;
            case 'L':
                if (foundDot)
                    lexFloatSuffix(type);
                else
                    lexIntSuffix(type);
                break decimalLoop;
            case 'f':
            case 'F':
                lexFloatSuffix(type);
                break decimalLoop;
            case 'e':
            case 'E':
                lexExponent(type);
                break decimalLoop;
            case '.':
                // A second dot, a dot at the end of input, or ".." ends the literal.
                if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.')
                    break decimalLoop;
                else
                {
                    // The following bit of silliness tries to tell the
                    // difference between "int dot identifier" and
                    // "double identifier".
if ((range.index + 1 < range.bytes.length))
                    {
                        auto ch = range.peekAt(1);
                        if (ch <= 0x2f
                            || (ch >= '0' && ch <= '9')
                            || (ch >= ':' && ch <= '@')
                            || (ch >= '[' && ch <= '^')
                            || (ch >= '{' && ch <= '~')
                            || ch == '`' || ch == '_')
                        {
                            goto doubleLiteral;
                        }
                        else
                            break decimalLoop;
                    }
                    else
                    {
                    doubleLiteral:
                        range.popFront();
                        foundDot = true;
                        type = tok!"doubleLiteral";
                    }
                }
                break;
            default:
                break decimalLoop;
            }
        }
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /**
     * Consumes an integer suffix made of 'u'/'U' and 'L'/'l' and updates
     * `type` accordingly. The current byte is expected to be one of those
     * characters; otherwise nothing is consumed.
     */
    void lexIntSuffix(ref IdType type)
    {
        bool secondPass;
        if (range.bytes[range.index] == 'u' || range.bytes[range.index] == 'U')
        {
    U:
            if (type == tok!"intLiteral")
                type = tok!"uintLiteral";
            else
                type = tok!"ulongLiteral";
            range.popFront();
            if (secondPass)
                return;
            if (range.index < range.bytes.length && (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l'))
                goto L;
            return;
        }
        if (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l')
        {
    L:
            if (type == tok!"uintLiteral")
                type = tok!"ulongLiteral";
            else
                type = tok!"longLiteral";
            range.popFront();
            if (!secondPass && range.index < range.bytes.length && (range.bytes[range.index] == 'U' || range.bytes[range.index] == 'u'))
            {
                secondPass = true;
                goto U;
            }
            return;
        }
    }

    /**
     * Consumes a floating point suffix and updates `type`:
     * 'f'/'F' gives a float literal and 'L' gives a real literal. A following
     * 'i' turns the literal into the matching imaginary type (ifloat, ireal,
     * or idouble for everything else) and emits a deprecation warning.
     *
     * The current byte must exist; callers invoke this only when it is one of
     * the suffix characters.
     */
    void lexFloatSuffix(ref IdType type) pure nothrow @safe
    {
        switch (range.bytes[range.index])
        {
        case 'L':
            range.popFront();
            // The L suffix denotes `real`, not `double`.
            type = tok!"realLiteral";
            break;
        case 'f':
        case 'F':
            range.popFront();
            type = tok!"floatLiteral";
            break;
        default:
            break;
        }
        if (!(range.index >= range.bytes.length) && range.bytes[range.index] == 'i')
        {
            warning("Complex number literals are deprecated");
            range.popFront();
            if (type == tok!"floatLiteral")
                type = tok!"ifloatLiteral";
            else if (type == tok!"realLiteral")
                type = tok!"irealLiteral";
            else
                type = tok!"idoubleLiteral";
        }
    }

    /**
     * Consumes an exponent: the marker character at the current position, an
     * optional sign directly after it, digits, and an optional float suffix.
     *
     * A literal with an exponent is floating point, so `type` becomes a double
     * literal unless a suffix selects another floating point type. An error is
     * reported when no exponent digit is present.
     */
    void lexExponent(ref IdType type) pure nothrow @safe
    {
        range.popFront(); // the exponent marker itself
        type = tok!"doubleLiteral";
        bool foundSign = false;
        bool foundDigit = false;
        while (!(range.index >= range.bytes.length))
        {
            switch (range.bytes[range.index])
            {
            case '-':
            case '+':
                // A sign is only part of the exponent directly after the
                // marker. Once digits were seen, as in `1e5+3`, the sign
                // belongs to the next token.
                if (foundSign || foundDigit)
                {
                    if (!foundDigit)
                        error("Expected an exponent");
                    return;
                }
                foundSign = true;
                range.popFront();
                break;
            case '0': .. case '9':
            case '_':
                foundDigit = true;
                range.popFront();
                break;
            case 'L':
            case 'f':
            case 'F':
            case 'i':
                if (!foundDigit)
                    error("Expected an exponent");
                lexFloatSuffix(type);
                return;
            default:
                if (!foundDigit)
                    error("Expected an exponent");
                return;
            }
        }
        // Input ended inside the exponent.
        if (!foundDigit)
            error("Expected an exponent");
    }

    /// Lexes a script line: everything from the current position up to, but
    /// not including, the next newline.
    void lexScriptLine(ref Token token)
    {
        mixin (tokenStart);
        while (!(range.index >= range.bytes.length) && !isNewline)
        {
            range.popFront();
        }
        token = Token(tok!"scriptLine", cache.intern(range.slice(mark)),
            line, column, index);
    }

    /// Lexes a special token sequence: everything from the current position
    /// up to, but not including, the next newline.
    void lexSpecialTokenSequence(ref Token token)
    {
        mixin (tokenStart);
        while (!(range.index >= range.bytes.length) && !isNewline)
        {
            range.popFront();
        }
        token = Token(tok!"specialTokenSequence", cache.intern(range.slice(mark)),
            line, column, index);
    }

    /// Lexes a slash-star block comment, including its delimiters. An
    /// unterminated comment runs to the end of the input.
    void lexSlashStarComment(ref Token token) @trusted
    {
        mixin (tokenStart);
        IdType type = tok!"comment";
        range.popFrontN(2);
        while (range.index < range.bytes.length)
        {
            version (iasm64NotWindows)
            {
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                    skip!(false, '\r', '\n', '/', '*', 0xe2)(range.bytes.ptr + range.index,
                        &range.index, &range.column);
            }
            if (range.bytes[range.index] == '*')
            {
                range.popFront();
                if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/')
                {
                    range.popFront();
                    break;
                }
            }
            else
                popFrontWhitespaceAware();
        }
    end:
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /// Lexes a double-slash line comment up to, but not including, the next
    /// CR or LF.
    void lexSlashSlashComment(ref Token token) @trusted
    {
        mixin (tokenStart);
        IdType type = tok!"comment";
        range.popFrontN(2);
        while (range.index < range.bytes.length)
        {
            version (iasm64NotWindows)
            {
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    skip!(false, '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
                        &range.index, &range.column);
                }
            }
            if (range.bytes[range.index] == '\r' || range.bytes[range.index] == '\n')
                break;
            range.popFront();
        }
    end:
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /// Lexes a nesting slash-plus comment, tracking the nesting depth. An
    /// unterminated comment runs to the end of the input.
    void lexSlashPlusComment(ref Token token) @trusted
    {
        mixin (tokenStart);
        IdType type = tok!"comment";
        range.index += 2;
        range.column += 2;
        int depth = 1;
        while (depth > 0 && !(range.index >= range.bytes.length))
        {
            version (iasm64NotWindows)
            {
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    skip!(false, '+', '/', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
                        &range.index, &range.column);
                }
            }
            if (range.bytes[range.index] == '+')
            {
                range.popFront();
                if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/')
                {
                    range.popFront();
                    depth--;
                }
            }
            else if (range.bytes[range.index] == '/')
            {
                range.popFront();
                if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '+')
                {
                    range.popFront();
                    depth++;
                }
            }
            else
                popFrontWhitespaceAware();
        }
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

void lexStringLiteral(ref Token token) @trusted
    {
        // Lexes a double-quoted string literal, including escape sequences and
        // an optional c/w/d suffix. Reports an error and yields an empty-typed
        // token when the closing quote is missing.
        mixin (tokenStart);
        range.popFront();
        while (true)
        {
            if ((range.index >= range.bytes.length))
            {
                error("Error: unterminated string literal");
                token = Token(tok!"");
                return;
            }
            version (iasm64NotWindows)
            {
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    skip!(false, '"', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
                        &range.index, &range.column);
                }
            }
            if (range.bytes[range.index] == '"')
            {
                range.popFront();
                break;
            }
            else if (range.bytes[range.index] == '\\')
            {
                lexEscapeSequence();
            }
            else
                popFrontWhitespaceAware();
        }
        IdType type = tok!"stringLiteral";
        lexStringSuffix(type);
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /// Lexes a wysiwyg string: either a backtick-delimited string or an
    /// r-prefixed double-quoted string. No escape sequences are processed.
    void lexWysiwygString(ref Token token) @trusted
    {
        mixin (tokenStart);
        IdType type = tok!"stringLiteral";
        bool backtick = range.bytes[range.index] == '`';
        if (backtick)
        {
            range.popFront();
            while (true)
            {
                if ((range.index >= range.bytes.length))
                {
                    error("Error: unterminated string literal");
                    token = Token(tok!"");
                    return;
                }
                version (iasm64NotWindows)
                {
                    if (haveSSE42 && range.index + 16 < range.bytes.length)
                    {
                        skip!(false, '\r', '\n', 0xe2, '`')(range.bytes.ptr + range.index,
                            &range.index, &range.column);
                    }
                }
                if (range.bytes[range.index] == '`')
                {
                    range.popFront();
                    break;
                }
                else
                    popFrontWhitespaceAware();
            }
        }
        else
        {
            // Skip the 'r' and then the opening quote.
            range.popFront();
            if ((range.index >= range.bytes.length))
            {
                error("Error: unterminated string literal");
                token = Token(tok!"");
                return;
            }
            range.popFront();
            while (true)
            {
                if ((range.index >= range.bytes.length))
                {
                    error("Error: unterminated string literal");
                    token = Token(tok!"");
                    return;
                }
                else if (range.bytes[range.index] == '"')
                {
                    range.popFront();
                    break;
                }
                else
                    popFrontWhitespaceAware();
            }
        }
        lexStringSuffix(type);
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /**
     * Consumes an optional string suffix and sets `type` to match.
     *
     * Returns: the suffix character ('w', 'd' or 'c'), or 0 when there is none.
     */
    private ubyte lexStringSuffix(ref IdType type) pure nothrow @safe
    {
        if ((range.index >= range.bytes.length))
        {
            type = tok!"stringLiteral";
            return 0;
        }
        else
        {
            switch (range.bytes[range.index])
            {
            case 'w': range.popFront(); type = tok!"wstringLiteral"; return 'w';
            case 'd': range.popFront(); type = tok!"dstringLiteral"; return 'd';
            case 'c': range.popFront(); type = tok!"stringLiteral"; return 'c';
            default: type = tok!"stringLiteral"; return 0;
            }
        }
    }

    /**
     * Lexes a delimited string. A bracket-like opening delimiter selects
     * nesting-delimiter lexing; anything else is treated as a heredoc
     * identifier.
     */
    void lexDelimitedString(ref Token token)
    {
        mixin (tokenStart);
        // Skip the two-character prefix.
        range.index += 2;
        range.column += 2;
        // Input that stops right after the prefix has no delimiter to inspect.
        if (range.index >= range.bytes.length)
        {
            error("Error: unterminated string literal");
            token = Token(tok!"");
            return;
        }
        immutable ubyte open = range.bytes[range.index];
        ubyte close;
        switch (open)
        {
        case '<': close = '>'; break;
        case '{': close = '}'; break;
        case '[': close = ']'; break;
        case '(': close = ')'; break;
        default:
            lexHeredocString(token, mark, line, column, index);
            return;
        }
        range.popFront();
        lexNormalDelimitedString(token, mark, line, column, index, open, close);
    }

    /**
     * Lexes the body of a delimited string whose delimiters nest.
     *
     * Params:
     *     token = receives the resulting token
     *     mark = start of the token text, as used by range.slice
     *     line = line stored in the token
     *     column = column stored in the token
     *     index = index stored in the token
     *     open = the opening delimiter byte
     *     close = the closing delimiter byte
     */
    void lexNormalDelimitedString(ref Token token, size_t mark, size_t line, size_t column,
        size_t index, ubyte open, ubyte close)
    {
        int depth = 1;
        while (!(range.index >= range.bytes.length) && depth > 0)
        {
            if (range.bytes[range.index] == open)
            {
                depth++;
                range.popFront();
            }
            else if (range.bytes[range.index] == close)
            {
                depth--;
                range.popFront();
                if (depth <= 0)
                {
                    // The closing delimiter must be followed by a quote; the
                    // input may also end right here.
                    if (range.index < range.bytes.length && range.bytes[range.index] == '"')
                    {
                        range.popFront();
                    }
                    else
                    {
                        error("Error: \" expected to end delimited string literal");
                        token = Token(tok!"");
                        return;
                    }
                }
            }
            else
                popFrontWhitespaceAware();
        }
        // Input ended before the outermost delimiter was closed.
        if (depth > 0)
        {
            error("Error: unterminated string literal");
            token = Token(tok!"");
            return;
        }
        IdType type = tok!"stringLiteral";
        lexStringSuffix(type);
        token = Token(type, cache.intern(range.slice(mark)), line, column, index);
    }

    /**
     * Lexes a heredoc-style delimited string: an identifier, a newline, the
     * text, and finally the identifier at the start of a line followed by a
     * closing quote.
     */
    void lexHeredocString(ref Token token, size_t mark, size_t line, size_t column, size_t index)
    {
        Token ident;
        lexIdentifier(ident);
        if (isNewline())
            popFrontWhitespaceAware();
        else
            error("Newline expected");
        while (!(range.index >= range.bytes.length))
        {
            if (isNewline())
            {
                popFrontWhitespaceAware();
                if (!range.canPeek(ident.text.length))
                {
                    error(ident.text ~ " expected");
                    break;
                }
                if (range.peek(ident.text.length - 1) == ident.text)
                {
                    range.popFrontN(ident.text.length);
                    break;
                }
            }
            else
            {
                range.popFront();
            }
        }
        if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '"')
        {
            range.popFront();
        }
        else
            error(`" expected`);
        IdType type = tok!"stringLiteral";
        lexStringSuffix(type);
        token = Token(type, cache.intern(range.slice(mark)), line, column, index);
    }

    void lexTokenString(ref Token token)
    {
        mixin (tokenStart);
        assert (range.bytes[range.index] == 'q');
        range.popFront();
        assert (range.bytes[range.index] == '{');
        range.popFront();
        auto app = appender!string();
        app.put("q{");
int depth = 1;

        // The nested tokens are lexed with whitespace included and string
        // literals kept in source form; the previous settings are restored
        // when this function exits.
        immutable WhitespaceBehavior oldWhitespace = config.whitespaceBehavior;
        immutable StringBehavior oldString = config.stringBehavior;
        config.whitespaceBehavior = WhitespaceBehavior.include;
        config.stringBehavior = StringBehavior.source;
        scope (exit)
        {
            config.whitespaceBehavior = oldWhitespace;
            config.stringBehavior = oldString;
        }

        advance(_front);
        while (depth > 0 && !empty)
        {
            auto t = front();
            // Tokens without text are rendered through their type's string form.
            if (t.text is null)
                app.put(str(t.type));
            else
                app.put(t.text);
            if (t.type == tok!"}")
            {
                depth--;
                // The final closing brace is not popped.
                if (depth > 0)
                    popFront();
            }
            else if (t.type == tok!"{")
            {
                depth++;
                popFront();
            }
            else
                popFront();
        }
        IdType type = tok!"stringLiteral";
        auto b = lexStringSuffix(type);
        if (b != 0)
            app.put(b);
        token = Token(type, cache.intern(cast(const(ubyte)[]) app.data), line,
            column, index);
    }

    /**
     * Lexes a hex string literal (x"..."). Hex digits and whitespace are
     * accepted up to the closing double quote; anything else, or end of
     * input, is reported as an error and yields an empty-typed token.
     */
    void lexHexString(ref Token token)
    {
        mixin (tokenStart);
        // Step over the two-byte x" prefix.
        range.index += 2;
        range.column += 2;

        loop: while (true)
        {
            if ((range.index >= range.bytes.length))
            {
                error("Error: unterminated hex string literal");
                token = Token(tok!"");
                return;
            }
            else if (isWhitespace())
                popFrontWhitespaceAware();
            else switch (range.bytes[range.index])
            {
            case '0': .. case '9':
            case 'A': .. case 'F':
            case 'a': .. case 'f':
                range.popFront();
                break;
            case '"':
                range.popFront();
                break loop;
            default:
                error("Error: invalid character in hex string");
                token = Token(tok!"");
                return;
            }
        }

        IdType type = tok!"stringLiteral";
        lexStringSuffix(type);
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /**
     * Consumes exactly count hexadecimal digits from the input.
     *
     * Params:
     *     count = number of digits required
     *     message = error text reported when a digit is missing
     * Returns: true if all digits were consumed, false (after reporting
     *     message) on end of input or a non-hex byte.
     */
    bool lexHexDigits(size_t count, string message)
    {
        foreach (i; 0 .. count)
        {
            if (range.index >= range.bytes.length)
            {
                error(message);
                return false;
            }
            switch (range.bytes[range.index])
            {
            case '0': .. case '9':
            case 'a': .. case 'f':
            case 'A': .. case 'F':
                range.popFront();
                break;
            default:
                error(message);
                return false;
            }
        }
        return true;
    }

    /**
     * Lexes an escape sequence. On entry the current byte is the backslash.
     *
     * Returns: true if a complete escape sequence was consumed, false if an
     *     error was reported.
     */
    bool lexEscapeSequence()
    {
        range.popFront();
        if ((range.index >= range.bytes.length))
        {
            error("Error: non-terminated character escape sequence.");
            return false;
        }
        switch (range.bytes[range.index])
        {
        case '\'':
        case '"':
        case '?':
        case '\\':
        case 'a':
        case 'b':
        case 'f':
        case 'n':
        case 'r':
        case 't':
        case 'v':
            range.popFront();
            break;
        case 'x':
            range.popFront();
            if (!lexHexDigits(2, "Error: 2 hex digits expected."))
                return false;
            break;
        case '0':
            // A lone 0 (last byte of input, or directly followed by a single
            // quote) is consumed on its own; otherwise it starts an octal run.
            if (!(range.index + 1 < range.bytes.length)
                || ((range.index + 1 < range.bytes.length) && range.peekAt(1) == '\''))
            {
                range.popFront();
                break;
            }
            goto case;
        case '1': .. case '7':
            // Up to three octal digits.
            for (size_t i = 0; i < 3 && !(range.index >= range.bytes.length)
                && range.bytes[range.index] >= '0' && range.bytes[range.index] <= '7'; i++)
                range.popFront();
            break;
        case 'u':
            range.popFront();
            if (!lexHexDigits(4, "Error: at least 4 hex digits expected."))
                return false;
            break;
        case 'U':
            range.popFront();
            if (!lexHexDigits(8, "Error: at least 8 hex digits expected."))
                return false;
            break;
        default:
            // Anything else is consumed up to and including the next ';'.
            while (true)
            {
                if ((range.index >= range.bytes.length))
                {
                    error("Error: non-terminated character escape sequence.");
                    return false;
                }
                if (range.bytes[range.index] == ';')
                {
                    range.popFront();
                    break;
                }
                else
                {
                    range.popFront();
                }
            }
        }
        return true;
    }

    /**
     * Lexes a character literal. On entry the current byte is the opening
     * single quote. A missing closing quote (including end of input) is
     * reported as an error and yields an empty-typed token.
     */
    void lexCharacterLiteral(ref Token token)
    {
        mixin (tokenStart);
        range.popFront();
        // Input that ends right after the opening quote goes straight to the
        // closing-quote check, which reports the error.
        if (range.index >= range.bytes.length)
            goto close;
        if (range.bytes[range.index] == '\\')
        {
            lexEscapeSequence();
            goto close;
        }
        else if (range.bytes[range.index] == '\'')
        {
            // Note: control still reaches the closing-quote check below after
            // this assignment.
            range.popFront();
            token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
                line, column, index);
        }
        else if (range.bytes[range.index] & 0x80)
        {
            // Consume every byte with the high bit set, stopping at end of input.
            while (range.index < range.bytes.length && (range.bytes[range.index] & 0x80))
            {
                range.popFront();
            }
            goto close;
        }
        else
        {
            popFrontWhitespaceAware();
            goto close;
        }
    close:
        if (range.index < range.bytes.length && range.bytes[range.index] == '\'')
        {
            range.popFront();
            token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
                line, column, index);
        }
        else
        {
            error("Error: Expected ' to end character literal");
            token
= Token(tok!"");
        }
    }

    /**
     * Lexes an identifier: consumes bytes until isSeparating(0) reports a
     * separator and interns the consumed text.
     */
    void lexIdentifier(ref Token token) @trusted
    {
        mixin (tokenStart);
        if (isSeparating(0))
        {
            error("Invalid identifier");
            // isSeparating(0) is also true at end of input; only consume a
            // byte when one exists so the slice below stays inside the buffer.
            if (range.index < range.bytes.length)
                range.popFront();
        }
        while (true)
        {
            version (iasm64NotWindows)
            {
                // Fast path: advance by the count rangeMatch returns for the
                // ranges a-z, A-Z and '_' when more than 16 bytes remain.
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    immutable ulong i = rangeMatch!(false, 'a', 'z', 'A', 'Z', '_', '_')
                        (range.bytes.ptr + range.index);
                    range.column += i;
                    range.index += i;
                }
            }
            if (isSeparating(0))
                break;
            else
                range.popFront();
        }
        token = Token(tok!"identifier", cache.intern(range.slice(mark)), line,
            column, index);
    }

    /**
     * Lexes a token that starts with '.': one of ".", "..", "...", or a
     * number with a leading dot (delegated to lexNumber).
     */
    void lexDot(ref Token token)
    {
        mixin (tokenStart);
        // A dot that is the last byte of the input is a plain "." token.
        if (!(range.index + 1 < range.bytes.length))
        {
            range.popFront();
            token = Token(tok!".", null, line, column, index);
            return;
        }
        switch (range.peekAt(1))
        {
        case '0': .. case '9':
            lexNumber(token);
            return;
        case '.':
            range.popFront();
            range.popFront();
            if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '.')
            {
                range.popFront();
                token = Token(tok!"...", null, line, column, index);
            }
            else
                token = Token(tok!"..", null, line, column, index);
            return;
        default:
            range.popFront();
            token = Token(tok!".", null, line, column, index);
            return;
        }
    }

    /**
     * Lexes a three-byte newline sequence (registered for "\u2028" and
     * "\u2029") as a whitespace token and advances the line counter.
     */
    void lexLongNewline(ref Token token) @nogc
    {
        mixin (tokenStart);
        range.popFront();
        range.popFront();
        range.popFront();
        range.incrementLine();
        string text = config.whitespaceBehavior == WhitespaceBehavior.include
            ?
cache.intern(range.slice(mark)) : "";
        token = Token(tok!"whitespace", text, line,
            column, index);
    }

    /**
     * Returns: true if the current position is at '\n', '\r', or the
     *     three-byte encoding of U+2028 / U+2029; false otherwise, including
     *     at end of input.
     */
    bool isNewline() @nogc
    {
        // Guard against reading past the buffer when called at end of input.
        if (range.index >= range.bytes.length) return false;
        if (range.bytes[range.index] == '\n') return true;
        if (range.bytes[range.index] == '\r') return true;
        return (range.bytes[range.index] & 0x80) && (range.index + 2 < range.bytes.length)
            && (range.peek(2) == "\u2028" || range.peek(2) == "\u2029");
    }

    /**
     * Params:
     *     offset = number of bytes past the current position to inspect
     * Returns: true if the byte at that position ends an identifier-like run
     *     (or lies beyond the end of input), false if it is an ASCII letter,
     *     digit or underscore. For bytes with the high bit set the result is
     *     true only when they start U+2028 or U+2029.
     */
    bool isSeparating(size_t offset) @nogc
    {
        enum : ubyte
        {
            n, y, m // no, yes, maybe
        }

        if (range.index + offset >= range.bytes.length)
            return true;
        auto c = range.bytes[range.index + offset];
        // One entry per byte value: digits, letters and '_' are n; all other
        // ASCII is y; everything >= 0x80 is m and needs the check below.
        static immutable ubyte[256] LOOKUP_TABLE = [
            y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
            y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
            y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
            n, n, n, n, n, n, n, n, n, n, y, y, y, y, y, y,
            y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n,
            n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, n,
            y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n,
            n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, y,
            m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
            m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
            m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
            m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
            m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
            m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
            m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
            m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m
        ];
        immutable ubyte result = LOOKUP_TABLE[c];
        if (result == n)
            return false;
        if (result == y)
            return true;
        if (result == m)
        {
            // Inspect a copy moved to the requested offset; the lexer's own
            // position must not change in a query function.
            auto r = range;
            r.popFrontN(offset);
            return (r.canPeek(2) && (r.peek(2) == "\u2028"
                || r.peek(2) == "\u2029"));
        }
        assert (false);
    }



    /// Mixed in at the start of lexing functions to record where the token begins.
    enum tokenStart = q{
        size_t index = range.index;
        size_t column =
range.column;
        size_t line = range.line;
        auto mark = range.mark();
    };

    /// Records an error message at the lexer's current line and column.
    void error(string message)
    {
        messages ~= Message(range.line, range.column, message, true);
    }

    /// Records a warning message at the lexer's current line and column.
    void warning(string message)
    {
        messages ~= Message(range.line, range.column, message, false);
        assert (messages.length > 0);
    }

    /// A diagnostic produced while lexing.
    static struct Message
    {
        /// Line on which the message was reported
        size_t line;
        /// Column at which the message was reported
        size_t column;
        /// Message text
        string message;
        /// true for errors, false for warnings
        bool isError;
    }

    /// Diagnostics collected so far
    Message[] messages;
    StringCache* cache;
    LexerConfig config;
    bool haveSSE42;
}

/**
 * Creates a token range from the given source code. Creates a default lexer
 * configuration and a GC-managed string cache.
 */
public auto byToken(ubyte[] range)
{
    LexerConfig config;
    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
    return DLexer(range, config, cache);
}

/**
 * Creates a token range from the given source code. Uses the given string
 * cache.
 */
public auto byToken(ubyte[] range, StringCache* cache)
{
    LexerConfig config;
    return DLexer(range, config, cache);
}

/**
 * Creates a token range from the given source code. Uses the provided lexer
 * configuration and string cache.
 */
public auto byToken(ubyte[] range, const LexerConfig config, StringCache* cache)
{
    return DLexer(range, config, cache);
}

/**
 * Removes "decoration" such as leading whitespace, leading + and * characters,
 * and places the result into the given output range.
 *
 * Comments that do not start with "///", "/++" or "/**" are copied unchanged.
 *
 * Params:
 *     comment = the comment text, at least three characters long
 *     outputRange = receives the undecorated text
 */
public void unDecorateComment(T)(string comment, auto ref T outputRange)
    if (isOutputRange!(T, string))
in
{
    assert (comment.length >= 3);
}
body
{
    switch (comment[0 .. 3])
    {
    case "///":
        size_t i = 3;
        if (i < comment.length)
        {
        again:
            // Skip the whitespace that follows the slashes.
            while (i < comment.length && (comment[i] == ' ' || comment[i] == '\t'))
                i++;
            size_t j = i + 1;
            while (j < comment.length)
            {
                if (comment[j] == '\r')
                    j++;
                if (j >= comment.length)
                    break;
                if (comment[j] == '\n')
                {
                    // Emit this line, then skip the slashes of the next one.
                    outputRange.put(comment[i .. j]);
                    j++;
                    while (j < comment.length && comment[j] == '/')
                        j++;
                    outputRange.put('\n');
                    i = j;
                    goto again;
                }
                j++;
            }
            if (i < comment.length && j <= comment.length)
                outputRange.put(comment[i .. j]);
        }
        break;
    case "/++":
    case "/**":
        // Every index into `comment` below is bounds-checked so that a
        // truncated comment (e.g. just "/**") cannot read past the end.
        size_t i = 3;
        immutable char c = comment[1];
        // Skip leading * and + characters
        while (i < comment.length && comment[i] == c) i++;
        // Skip trailing * and + characters
        size_t j = comment.length - 2;
        while (j > i && comment[j] == c)
            j--;
        while (j > i && (comment[j] == ' ' || comment[j] == '\t'))
            j--;
        j++;
        // Emit the first line as-is.
        size_t k = i;
        while (k < j)
        {
            if (comment[k] == '\n')
            {
                k++;
                break;
            }
            k++;
        }
        outputRange.put(comment[i .. k]);
        i = k;
        if (i < comment.length && comment[i] == '\r') i++;
        if (i < comment.length && comment[i] == '\n') i++;
        while (i < comment.length && (comment[i] == ' ' || comment[i] == '\t')) i++;
        // The second line decides whether lines carry a leading * or + and
        // how much whitespace follows it.
        immutable bool skipBeginningChar = i < comment.length && comment[i] == c;
        if (skipBeginningChar)
            i++;
        size_t whitespaceToSkip;
        while (i < comment.length && (comment[i] == ' ' || comment[i] == '\t'))
        {
            whitespaceToSkip++;
            i++;
        }
        size_t l = i;
        while (i < j)
        {
            if (comment[i++] == '\n')
                break;
        }
        outputRange.put(comment[l .. i]);
        // Remaining lines: strip the same decoration, then emit.
        while (true)
        {
            if (skipBeginningChar)
            {
                while (i < j && (comment[i] == ' ' || comment[i] == '\t')) i++;
                if (i < j && comment[i] == c) i++;
            }
            for (size_t s = 0; (i < j) && (s < whitespaceToSkip)
                && (comment[i] == ' ' || comment[i] == '\t');)
            {
                s++;
                i++;
            }
            k = i;
            inner: while (k < j)
            {
                if (comment[k] == '\n')
                {
                    k++;
                    break inner;
                }
                k++;
            }
            outputRange.put(comment[i .. k]);
            i = k;
            if (i >= j)
                break;
        }
        break;
    default:
        outputRange.put(comment);
        break;
    }
}


/**
 * The string cache is used for string interning.
 *
 * It will only store a single copy of any string that it is asked to hold.
 * Interned strings can be compared for equality by comparing their $(B .ptr)
 * field.
 *
 * Default and postblit constructors are disabled. The memory held by a
 * StringCache is released by calling $(B freeItAll); no destructor is
 * declared in this struct.
 *
 * See_also: $(LINK http://en.wikipedia.org/wiki/String_interning)
 */
struct StringCache
{
public pure nothrow @nogc:

    @disable this();
    @disable this(this);

    /**
     * Params: bucketCount = the initial number of buckets. Must be a
     * power of two
     */
    this(size_t bucketCount) nothrow @trusted @nogc
    in
    {
        import core.bitop : popcnt;
        static if (size_t.sizeof == 8)
        {
            // Exactly one bit may be set across the two 32-bit halves.
            immutable low = popcnt(cast(uint) bucketCount);
            immutable high = popcnt(cast(uint) (bucketCount >> 32));
            assert ((low == 0 && high == 1) || (low == 1 && high == 0));
        }
        else
        {
            static assert (size_t.sizeof == 4);
            assert (popcnt(cast(uint) bucketCount) == 1);
        }
    }
    body
    {
        buckets = (cast(Node**) calloc((Node*).sizeof, bucketCount))[0 ..
bucketCount];
    }

    /**
     * Frees every block, every node (and each node's separately allocated
     * string, if any) and the bucket array, then resets the cache's fields
     * to null.
     */
    void freeItAll()
    {
        Block* current = rootBlock;
        while (current !is null)
        {
            Block* prev = current;
            current = current.next;
            free(cast(void*) prev);
        }
        foreach (nodePointer; buckets)
        {
            Node* currentNode = nodePointer;
            while (currentNode !is null)
            {
                // Only strings that were malloc'ed on their own are freed
                // here; the others live inside the blocks freed above.
                if (currentNode.mallocated)
                    free(currentNode.str.ptr);
                Node* prev = currentNode;
                currentNode = currentNode.next;
                free(prev);
            }
        }
        rootBlock = null;
        free(buckets.ptr);
        buckets = null;
    }

    /**
     * Caches a string.
     *
     * Returns: the interned copy, or "" for null or empty input.
     */
    string intern(const(ubyte)[] str) @safe
    {
        if (str is null || str.length == 0)
            return "";
        return _intern(str);
    }

    /**
     * ditto
     */
    string intern(string str) @trusted
    {
        // A const view is all the other overload needs; immutability is not
        // cast away.
        return intern(cast(const(ubyte)[]) str);
    }

    /**
     * The default bucket count for the string cache.
     */
    static enum defaultBucketCount = 4096;

private:

    /// Returns the cached copy of bytes, inserting a new node if none exists.
    string _intern(const(ubyte)[] bytes) @trusted
    {
        immutable uint hash = hashBytes(bytes);
        immutable size_t index = hash & (buckets.length - 1);
        Node* s = find(bytes, hash);
        if (s !is null)
            return cast(string) s.str;
        ubyte[] mem = void;
        // Large strings get their own allocation instead of block space.
        bool mallocated = bytes.length > BIG_STRING;
        if (mallocated)
            mem = (cast(ubyte*) malloc(bytes.length))[0 .. bytes.length];
        else
            mem = allocate(bytes.length);
        mem[] = bytes[];
        // New nodes are pushed at the head of their bucket's list.
        Node* node = cast(Node*) malloc(Node.sizeof);
        node.str = mem;
        node.hash = hash;
        node.next = buckets[index];
        node.mallocated = mallocated;
        buckets[index] = node;
        return cast(string) mem;
    }

    /// Returns the node holding bytes, or null if it is not cached.
    Node* find(const(ubyte)[] bytes, uint hash) @trusted
    {
        immutable size_t index = hash & (buckets.length - 1);
        Node* node = buckets[index];
        while (node !is null)
        {
            if (node.hash == hash && bytes == node.str)
                return node;
            node = node.next;
        }
        return node;
    }

    /// Computes a 32-bit hash of data, which must be non-null and non-empty.
    static uint hashBytes(const(ubyte)[] data) pure nothrow @trusted @nogc
    in
    {
        assert (data !is null);
        assert (data.length > 0);
    }
    body
    {
        immutable uint m = 0x5bd1e995;
        immutable int r = 24;
        uint h = cast(uint) data.length;
        // Mix four bytes at a time, assembled least-significant byte first.
        while (data.length >= 4)
        {
            uint k = (cast(ubyte) data[3]) << 24
                | (cast(ubyte) data[2]) << 16
                | (cast(ubyte) data[1]) << 8
                | (cast(ubyte) data[0]);
            k *= m;
            k ^= k >> r;
            k *= m;
            h *= m;
            h ^= k;
            data = data[4 .. $];
        }
        // Fold in the remaining 0-3 bytes.
        switch (data.length & 3)
        {
        case 3:
            h ^= data[2] << 16;
            goto case;
        case 2:
            h ^= data[1] << 8;
            goto case;
        case 1:
            h ^= data[0];
            h *= m;
            break;
        default:
            break;
        }
        h ^= h >> 13;
        h *= m;
        h ^= h >> 15;
        return h;
    }

    /**
     * Returns a slice of numBytes bytes taken from one of the first few
     * blocks in the list, or from a newly allocated block that becomes the
     * new list head.
     */
    ubyte[] allocate(size_t numBytes) pure nothrow @trusted @nogc
    in
    {
        assert (numBytes != 0);
    }
    out (result)
    {
        assert (result.length == numBytes);
    }
    body
    {
        Block* r = rootBlock;
        size_t i = 0;
        // Only the first four blocks are searched for free space.
        while (i <= 3 && r !is null)
        {
            immutable size_t available = r.bytes.length;
            immutable size_t oldUsed = r.used;
            immutable size_t newUsed = oldUsed + numBytes;
            if (newUsed <= available)
            {
                r.used = newUsed;
                return r.bytes[oldUsed .. newUsed];
            }
            i++;
            r = r.next;
        }
        Block* b = cast(Block*) calloc(Block.sizeof, 1);
        b.used = numBytes;
        b.next = rootBlock;
        rootBlock = b;
        return b.bytes[0 .. numBytes];
    }

    /// Hash-bucket list entry for one interned string.
    static struct Node
    {
        ubyte[] str = void;
        Node* next = void;
        uint hash = void;
        /// true if str was allocated with malloc rather than taken from a Block
        bool mallocated = void;
    }

    /// Fixed-size chunk of storage from which small strings are carved.
    static struct Block
    {
        Block* next;
        size_t used;
        enum BLOCK_CAPACITY = BLOCK_SIZE - size_t.sizeof - (void*).sizeof;
        ubyte[BLOCK_CAPACITY] bytes;
    }

    static assert (BLOCK_SIZE == Block.sizeof);

    enum BLOCK_SIZE = 1024 * 16;

    // If a string would take up more than 1/4 of a block, allocate it outside
    // of the block.
2226 enum BIG_STRING = BLOCK_SIZE / 4; 2227 2228 Node*[] buckets; 2229 Block* rootBlock; 2230 } 2231 2232 private extern(C) void* calloc(size_t, size_t) nothrow pure @nogc @trusted; 2233 private extern(C) void* malloc(size_t) nothrow pure @nogc @trusted; 2234 private extern(C) void free(void*) nothrow pure @nogc @trusted; 2235 2236 unittest 2237 { 2238 auto source = cast(ubyte[]) q{ import std.stdio;}}; 2239 auto tokens = getTokensForParser(source, LexerConfig(), 2240 new StringCache(StringCache.defaultBucketCount)); 2241 assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".", 2242 tok!"identifier", tok!";"])); 2243 } 2244 2245 /// Test \x char sequence 2246 unittest 2247 { 2248 auto toks = (string s) => byToken(cast(ubyte[])s); 2249 2250 // valid 2251 enum hex = ['0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f','A','B','C','D','E','F']; 2252 auto source = ""; 2253 foreach (h1; hex) 2254 foreach (h2; hex) 2255 source ~= "'\\x" ~ h1 ~ h2 ~ "'"; 2256 assert (toks(source).filter!(t => t.type != tok!"characterLiteral").empty); 2257 2258 // invalid 2259 assert (toks(`'\x'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true)); 2260 assert (toks(`'\x_'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true)); 2261 assert (toks(`'\xA'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true)); 2262 assert (toks(`'\xAY'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true)); 2263 assert (toks(`'\xXX'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true)); 2264 } 2265 2266 version (iasm64NotWindows) 2267 { 2268 /** 2269 * Returns: 2270 */ 2271 ushort newlineMask(const ubyte*) pure nothrow @trusted @nogc 2272 { 2273 asm pure nothrow @nogc 2274 { 2275 naked; 2276 movdqu XMM1, [RDI]; 2277 mov RAX, 3; 2278 mov RDX, 16; 2279 mov R8, 0x0d0d0d0d0d0d0d0dL; 2280 movq XMM2, R8; 2281 shufpd XMM2, XMM2, 0; 2282 pcmpeqb XMM2, XMM1; 2283 mov R9, 
0x0a0a0a0a0a0a0a0aL; 2284 movq XMM3, R9; 2285 shufpd XMM3, XMM3, 0; 2286 pcmpeqb XMM3, XMM1; 2287 mov R10, 0xe280a8L; 2288 movq XMM4, R10; 2289 pcmpestrm XMM4, XMM1, 0b01001100; 2290 movdqa XMM4, XMM0; 2291 mov R11, 0xe280a9L; 2292 movq XMM5, R11; 2293 pcmpestrm XMM5, XMM1, 0b01001100; 2294 movdqa XMM5, XMM0; 2295 mov RCX, 0x0a0d; 2296 dec RAX; 2297 movq XMM6, RCX; 2298 pcmpestrm XMM6, XMM1, 0b01001100; 2299 movdqa XMM6, XMM0; 2300 movdqa XMM7, XMM6; 2301 pslldq XMM7, 1; 2302 movdqa XMM0, XMM4; 2303 por XMM0, XMM5; 2304 por XMM7, XMM6; 2305 movdqa XMM1, XMM2; 2306 por XMM1, XMM3; 2307 pxor XMM7, XMM1; 2308 por XMM7, XMM0; 2309 por XMM7, XMM6; 2310 pmovmskb RAX, XMM7; 2311 and RAX, 0b0011_1111_1111_1111; 2312 ret; 2313 } 2314 } 2315 2316 /** 2317 * Skips between 0 and 16 bytes that match (or do not match) one of the 2318 * given $(B chars). 2319 */ 2320 void skip(bool matching, chars...)(const ubyte*, ulong*, ulong*) pure nothrow 2321 @trusted @nogc if (chars.length <= 8) 2322 { 2323 enum constant = ByteCombine!chars; 2324 enum charsLength = chars.length; 2325 static if (matching) 2326 enum flags = 0b0001_0000; 2327 else 2328 enum flags = 0b0000_0000; 2329 asm pure nothrow @nogc 2330 { 2331 naked; 2332 movdqu XMM1, [RDX]; 2333 mov R10, constant; 2334 movq XMM2, R10; 2335 mov RAX, charsLength; 2336 mov RDX, 16; 2337 pcmpestri XMM2, XMM1, flags; 2338 add [RSI], RCX; 2339 add [RDI], RCX; 2340 ret; 2341 } 2342 } 2343 2344 /** 2345 * Returns: the number of bytes starting at the given location that match 2346 * (or do not match if $(B invert) is true) the byte ranges in $(B chars). 
2347 */ 2348 ulong rangeMatch(bool invert, chars...)(const ubyte*) pure nothrow @trusted @nogc 2349 { 2350 static assert (chars.length % 2 == 0); 2351 enum constant = ByteCombine!chars; 2352 static if (invert) 2353 enum rangeMatchFlags = 0b0000_0100; 2354 else 2355 enum rangeMatchFlags = 0b0001_0100; 2356 enum charsLength = chars.length; 2357 asm pure nothrow @nogc 2358 { 2359 naked; 2360 movdqu XMM1, [RDI]; 2361 mov R10, constant; 2362 movq XMM2, R10; 2363 mov RAX, charsLength; 2364 mov RDX, 16; 2365 pcmpestri XMM2, XMM1, rangeMatchFlags; 2366 mov RAX, RCX; 2367 ret; 2368 } 2369 } 2370 2371 template ByteCombine(c...) 2372 { 2373 static assert (c.length <= 8); 2374 static if (c.length > 1) 2375 enum ulong ByteCombine = c[0] | (ByteCombine!(c[1..$]) << 8); 2376 else 2377 enum ulong ByteCombine = c[0]; 2378 } 2379 }