module dparse.lexer;

import std.typecons;
import std.typetuple;
import std.array;
import std.algorithm;
import std.range;
import std.experimental.lexer;
import core.cpuid : sse42;
// `iasm64NotWindows` is set when 64-bit x86 inline assembly is available and
// the target is not Windows. It gates the SSE4.2 fast paths in DLexer.
version (D_InlineAsm_X86_64)
{
    version (Windows) {}
    else version = iasm64NotWindows;
}

/// Operators
private enum operators = [
    ",", ".", "..", "...", "/", "/=", "!", "!<", "!<=", "!<>", "!<>=", "!=",
    "!>", "!>=", "$", "%", "%=", "&", "&&", "&=", "(", ")", "*", "*=", "+", "++",
    "+=", "-", "--", "-=", ":", ";", "<", "<<", "<<=", "<=", "<>", "<>=", "=",
    "==", "=>", ">", ">=", ">>", ">>=", ">>>", ">>>=", "?", "@", "[", "]", "^",
    "^=", "^^", "^^=", "{", "|", "|=", "||", "}", "~", "~="
];

/// Keywords
private enum keywords = [
    "abstract", "alias", "align", "asm", "assert", "auto", "body", "bool",
    "break", "byte", "case", "cast", "catch", "cdouble", "cent", "cfloat",
    "char", "class", "const", "continue", "creal", "dchar", "debug", "default",
    "delegate", "delete", "deprecated", "do", "double", "else", "enum",
    "export", "extern", "false", "final", "finally", "float", "for", "foreach",
    "foreach_reverse", "function", "goto", "idouble", "if", "ifloat",
    "immutable", "import", "in", "inout", "int", "interface", "invariant",
    "ireal", "is", "lazy", "long", "macro", "mixin", "module", "new", "nothrow",
    "null", "out", "override", "package", "pragma", "private", "protected",
    "public", "pure", "real", "ref", "return", "scope", "shared", "short",
    "static", "struct", "super", "switch", "synchronized", "template", "this",
    "throw", "true", "try", "typedef", "typeid", "typeof", "ubyte", "ucent",
    "uint", "ulong", "union", "unittest", "ushort", "version", "void",
    "volatile", "wchar", "while", "with", "__DATE__", "__EOF__", "__FILE__",
    "__FUNCTION__", "__gshared", "__LINE__", "__MODULE__", "__parameters",
    "__PRETTY_FUNCTION__", "__TIME__", "__TIMESTAMP__", "__traits", "__vector",
    "__VENDOR__", "__VERSION__"
];

/// Other tokens
private enum dynamicTokens = [
    "specialTokenSequence", "comment", "identifier", "scriptLine",
    "whitespace", "doubleLiteral", "floatLiteral", "idoubleLiteral",
    "ifloatLiteral", "intLiteral", "longLiteral", "realLiteral",
    "irealLiteral", "uintLiteral", "ulongLiteral", "characterLiteral",
    "dstringLiteral", "stringLiteral", "wstringLiteral"
];

// Flat list of (source prefix, handler name) pairs. Each handler name is a
// DLexer method; the list is handed to the `Lexer` mixin in DLexer.
// NOTE(review): presumably the mixin dispatches to the named method when the
// input starts with the prefix - confirm against std.experimental.lexer.
private enum pseudoTokenHandlers = [
    "\"", "lexStringLiteral",
    "`", "lexWysiwygString",
    "//", "lexSlashSlashComment",
    "/*", "lexSlashStarComment",
    "/+", "lexSlashPlusComment",
    ".", "lexDot",
    "'", "lexCharacterLiteral",
    "0", "lexNumber",
    "1", "lexDecimal",
    "2", "lexDecimal",
    "3", "lexDecimal",
    "4", "lexDecimal",
    "5", "lexDecimal",
    "6", "lexDecimal",
    "7", "lexDecimal",
    "8", "lexDecimal",
    "9", "lexDecimal",
    "q\"", "lexDelimitedString",
    "q{", "lexTokenString",
    "r\"", "lexWysiwygString",
    "x\"", "lexHexString",
    " ", "lexWhitespace",
    "\t", "lexWhitespace",
    "\r", "lexWhitespace",
    "\n", "lexWhitespace",
    "\v", "lexWhitespace",
    "\f", "lexWhitespace",
    "\u2028", "lexLongNewline",
    "\u2029", "lexLongNewline",
    "#!", "lexScriptLine",
    "#line", "lexSpecialTokenSequence"
];

/// Token ID type for the D lexer.
public alias IdType = TokenIdType!(operators, dynamicTokens, keywords);

/**
 * Function used for converting an IdType to a string.
 *
 * Examples:
 * ---
 * IdType c = tok!"case";
 * assert (str(c) == "case");
 * ---
 */
public alias str = tokenStringRepresentation!(IdType, operators, dynamicTokens, keywords);

/**
 * Template used to refer to D token types.
 *
 * See the $(B operators), $(B keywords), and $(B dynamicTokens) enums for
 * values that can be passed to this template.
 * Example:
 * ---
 * import dparse.lexer;
 * IdType t = tok!"floatLiteral";
 * ---
 */
public template tok(string token)
{
    alias tok = TokenId!(IdType, operators, dynamicTokens, keywords, token);
}

// Source text of the extra members added to every Token: the documentation
// comment text attached by getTokensForParser (`comment`, `trailingComment`)
// and ordering of tokens by `index`.
// NOTE(review): `index` is not declared here; presumably TokenStructure
// provides it - confirm.
private enum extraFields = q{
    string comment;
    string trailingComment;

    int opCmp(size_t i) const pure nothrow @safe {
        if (index < i) return -1;
        if (index > i) return 1;
        return 0;
    }

    int opCmp(ref const typeof(this) other) const pure nothrow @safe {
        return opCmp(other.index);
    }
};

/// The token type in the D lexer
public alias Token = std.experimental.lexer.TokenStructure!(IdType, extraFields);

/**
 * Configure whitespace handling
 */
public enum WhitespaceBehavior : ubyte
{
    /// Whitespace tokens are produced and carry their source text
    include = 0b0000_0000,
    /// Whitespace tokens are skipped by DLexer.popFront
    skip = 0b0000_0001,
}

/**
 * Configure string lexing behavior
 */
public enum StringBehavior : ubyte
{
    /// Do not include quote characters, process escape sequences
    compiler = 0b0000_0000,
    /// Opening quotes, closing quotes, and string suffixes are included in the
    /// string token
    includeQuoteChars = 0b0000_0001,
    /// String escape sequences are not replaced
    notEscaped = 0b0000_0010,
    /// Not modified at all. Useful for formatters or highlighters
    source = includeQuoteChars | notEscaped
}

/**
 * Lexer configuration struct
 */
public struct LexerConfig
{
    /// NOTE(review): presumably the name of the file being lexed; it is not
    /// read anywhere in the visible part of this module - confirm.
    string fileName;
    /// How string literal tokens are represented
    StringBehavior stringBehavior;
    /// Whether whitespace tokens are produced or skipped
    WhitespaceBehavior whitespaceBehavior;
}

/**
 * Returns: true if the given ID is for a basic type.
174 */ 175 public bool isBasicType(IdType type) nothrow pure @safe @nogc 176 { 177 switch (type) 178 { 179 case tok!"int": 180 case tok!"uint": 181 case tok!"double": 182 case tok!"idouble": 183 case tok!"float": 184 case tok!"ifloat": 185 case tok!"short": 186 case tok!"ushort": 187 case tok!"long": 188 case tok!"ulong": 189 case tok!"char": 190 case tok!"wchar": 191 case tok!"dchar": 192 case tok!"bool": 193 case tok!"void": 194 case tok!"cent": 195 case tok!"ucent": 196 case tok!"real": 197 case tok!"ireal": 198 case tok!"byte": 199 case tok!"ubyte": 200 case tok!"cdouble": 201 case tok!"cfloat": 202 case tok!"creal": 203 return true; 204 default: 205 return false; 206 } 207 } 208 209 /** 210 * Returns: true if the given ID type is for a number literal. 211 */ 212 public bool isNumberLiteral(IdType type) nothrow pure @safe @nogc 213 { 214 switch (type) 215 { 216 case tok!"doubleLiteral": 217 case tok!"floatLiteral": 218 case tok!"idoubleLiteral": 219 case tok!"ifloatLiteral": 220 case tok!"intLiteral": 221 case tok!"longLiteral": 222 case tok!"realLiteral": 223 case tok!"irealLiteral": 224 case tok!"uintLiteral": 225 case tok!"ulongLiteral": 226 return true; 227 default: 228 return false; 229 } 230 } 231 232 /** 233 * Returns: true if the given ID type is for an operator. 
234 */ 235 public bool isOperator(IdType type) nothrow pure @safe @nogc 236 { 237 switch (type) 238 { 239 case tok!",": 240 case tok!".": 241 case tok!"..": 242 case tok!"...": 243 case tok!"/": 244 case tok!"/=": 245 case tok!"!": 246 case tok!"!<": 247 case tok!"!<=": 248 case tok!"!<>": 249 case tok!"!<>=": 250 case tok!"!=": 251 case tok!"!>": 252 case tok!"!>=": 253 case tok!"$": 254 case tok!"%": 255 case tok!"%=": 256 case tok!"&": 257 case tok!"&&": 258 case tok!"&=": 259 case tok!"(": 260 case tok!")": 261 case tok!"*": 262 case tok!"*=": 263 case tok!"+": 264 case tok!"++": 265 case tok!"+=": 266 case tok!"-": 267 case tok!"--": 268 case tok!"-=": 269 case tok!":": 270 case tok!";": 271 case tok!"<": 272 case tok!"<<": 273 case tok!"<<=": 274 case tok!"<=": 275 case tok!"<>": 276 case tok!"<>=": 277 case tok!"=": 278 case tok!"==": 279 case tok!"=>": 280 case tok!">": 281 case tok!">=": 282 case tok!">>": 283 case tok!">>=": 284 case tok!">>>": 285 case tok!">>>=": 286 case tok!"?": 287 case tok!"@": 288 case tok!"[": 289 case tok!"]": 290 case tok!"^": 291 case tok!"^=": 292 case tok!"^^": 293 case tok!"^^=": 294 case tok!"{": 295 case tok!"|": 296 case tok!"|=": 297 case tok!"||": 298 case tok!"}": 299 case tok!"~": 300 case tok!"~=": 301 return true; 302 default: 303 return false; 304 } 305 } 306 307 /** 308 * Returns: true if the given ID type is for a keyword. 
309 */ 310 public bool isKeyword(IdType type) pure nothrow @safe @nogc 311 { 312 switch (type) 313 { 314 case tok!"abstract": 315 case tok!"alias": 316 case tok!"align": 317 case tok!"asm": 318 case tok!"assert": 319 case tok!"auto": 320 case tok!"body": 321 case tok!"break": 322 case tok!"case": 323 case tok!"cast": 324 case tok!"catch": 325 case tok!"class": 326 case tok!"const": 327 case tok!"continue": 328 case tok!"debug": 329 case tok!"default": 330 case tok!"delegate": 331 case tok!"delete": 332 case tok!"deprecated": 333 case tok!"do": 334 case tok!"else": 335 case tok!"enum": 336 case tok!"export": 337 case tok!"extern": 338 case tok!"false": 339 case tok!"final": 340 case tok!"finally": 341 case tok!"for": 342 case tok!"foreach": 343 case tok!"foreach_reverse": 344 case tok!"function": 345 case tok!"goto": 346 case tok!"if": 347 case tok!"immutable": 348 case tok!"import": 349 case tok!"in": 350 case tok!"inout": 351 case tok!"interface": 352 case tok!"invariant": 353 case tok!"is": 354 case tok!"lazy": 355 case tok!"macro": 356 case tok!"mixin": 357 case tok!"module": 358 case tok!"new": 359 case tok!"nothrow": 360 case tok!"null": 361 case tok!"out": 362 case tok!"override": 363 case tok!"package": 364 case tok!"pragma": 365 case tok!"private": 366 case tok!"protected": 367 case tok!"public": 368 case tok!"pure": 369 case tok!"ref": 370 case tok!"return": 371 case tok!"scope": 372 case tok!"shared": 373 case tok!"static": 374 case tok!"struct": 375 case tok!"super": 376 case tok!"switch": 377 case tok!"synchronized": 378 case tok!"template": 379 case tok!"this": 380 case tok!"throw": 381 case tok!"true": 382 case tok!"try": 383 case tok!"typedef": 384 case tok!"typeid": 385 case tok!"typeof": 386 case tok!"union": 387 case tok!"unittest": 388 case tok!"version": 389 case tok!"volatile": 390 case tok!"while": 391 case tok!"with": 392 case tok!"__DATE__": 393 case tok!"__EOF__": 394 case tok!"__FILE__": 395 case tok!"__FUNCTION__": 396 case 
tok!"__gshared": 397 case tok!"__LINE__": 398 case tok!"__MODULE__": 399 case tok!"__parameters": 400 case tok!"__PRETTY_FUNCTION__": 401 case tok!"__TIME__": 402 case tok!"__TIMESTAMP__": 403 case tok!"__traits": 404 case tok!"__vector": 405 case tok!"__VENDOR__": 406 case tok!"__VERSION__": 407 return true; 408 default: 409 return false; 410 } 411 } 412 413 /** 414 * Returns: true if the given ID type is for a string literal. 415 */ 416 public bool isStringLiteral(IdType type) pure nothrow @safe @nogc 417 { 418 switch (type) 419 { 420 case tok!"dstringLiteral": 421 case tok!"stringLiteral": 422 case tok!"wstringLiteral": 423 return true; 424 default: 425 return false; 426 } 427 } 428 429 /** 430 * Returns: true if the given ID type is for a protection attribute. 431 */ 432 public bool isProtection(IdType type) pure nothrow @safe @nogc 433 { 434 switch (type) 435 { 436 case tok!"export": 437 case tok!"package": 438 case tok!"private": 439 case tok!"public": 440 case tok!"protected": 441 return true; 442 default: 443 return false; 444 } 445 } 446 447 /** 448 * Returns: an array of tokens lexed from the given source code to the output range. All 449 * whitespace tokens are skipped and comments are attached to the token nearest 450 * to them. 
451 */ 452 const(Token)[] getTokensForParser(ubyte[] sourceCode, LexerConfig config, 453 StringCache* cache) 454 { 455 enum CommentType : ubyte 456 { 457 notDoc, 458 line, 459 block 460 } 461 462 static CommentType commentType(string comment) pure nothrow @safe 463 { 464 if (comment.length < 3) 465 return CommentType.notDoc; 466 if (comment[0 ..3] == "///") 467 return CommentType.line; 468 if (comment[0 ..3] == "/++" || comment[0 ..3] == "/**") 469 return CommentType.block; 470 return CommentType.notDoc; 471 } 472 473 config.whitespaceBehavior = WhitespaceBehavior.skip; 474 475 auto output = appender!(typeof(return))(); 476 auto lexer = DLexer(sourceCode, config, cache); 477 string blockComment; 478 size_t tokenCount; 479 loop: while (!lexer.empty) switch (lexer.front.type) 480 { 481 case tok!"specialTokenSequence": 482 case tok!"whitespace": 483 lexer.popFront(); 484 break; 485 case tok!"comment": 486 final switch (commentType(lexer.front.text)) 487 { 488 case CommentType.block: 489 if (tokenCount > 0 && lexer.front.line == output.data[tokenCount - 1].line) 490 { 491 (cast() output.data[tokenCount - 1]).trailingComment = lexer.front.text; 492 } 493 else 494 { 495 blockComment = lexer.front.text; 496 } 497 lexer.popFront(); 498 break; 499 case CommentType.line: 500 if (tokenCount > 0 && lexer.front.line == output.data[tokenCount - 1].line) 501 { 502 (cast() output.data[tokenCount - 1]).trailingComment = lexer.front.text; 503 } 504 else 505 { 506 string c = lexer.front.text[3 .. $]; // just take the /// off entirely 507 if(blockComment.length == 0) { 508 blockComment = "/++" ~ c ~ "+/"; // just rewrite to this 509 } else { 510 import std..string; 511 auto l = blockComment.lastIndexOf("\n"); 512 if(l != -1) { 513 blockComment = blockComment[0 .. l + 1]; 514 } else { 515 blockComment = blockComment[0 .. $-2]; // just cut off the */ or +/ 516 } 517 if(blockComment[0 .. 3] == "/**") 518 blockComment ~= c ~ "\n*/"; 519 else if(blockComment[0 .. 
3] == "/++") 520 blockComment ~= c ~ "\n+/"; 521 else assert(0); 522 523 } 524 } 525 lexer.popFront(); 526 break; 527 case CommentType.notDoc: 528 lexer.popFront(); 529 break; 530 } 531 break; 532 case tok!"__EOF__": 533 break loop; 534 default: 535 Token t = lexer.front; 536 lexer.popFront(); 537 tokenCount++; 538 t.comment = blockComment; 539 blockComment = null; 540 output.put(t); 541 break; 542 } 543 return output.data; 544 } 545 546 /** 547 * The D lexer struct. 548 */ 549 public struct DLexer 550 { 551 mixin Lexer!(Token, lexIdentifier, isSeparating, operators, dynamicTokens, 552 keywords, pseudoTokenHandlers); 553 554 /// 555 @disable this(); 556 557 /** 558 * Params: 559 * range = the bytes that compose the source code that will be lexed. 560 * config = the lexer configuration to use. 561 * cache = the string interning cache for de-duplicating identifiers and 562 * other token text. 563 */ 564 this(ubyte[] range, const LexerConfig config, StringCache* cache, 565 bool haveSSE42 = sse42()) pure nothrow @safe 566 { 567 this.haveSSE42 = haveSSE42; 568 auto r = (range.length >= 3 && range[0] == 0xef && range[1] == 0xbb && range[2] == 0xbf) 569 ? range[3 .. 
$] : range;
        this.range = LexerRange(r);
        this.config = config;
        this.cache = cache;
        popFront();
    }

    /// Advances to the next token. Whitespace tokens are skipped while
    /// `config.whitespaceBehavior` is `WhitespaceBehavior.skip`.
    public void popFront()() pure nothrow @safe
    {
        do
            _popFront();
        while (config.whitespaceBehavior == WhitespaceBehavior.skip
            && _front.type == tok!"whitespace");
    }

private pure nothrow @safe:

    // Returns true if the current byte is ASCII whitespace or begins the
    // byte sequence E2 80 A8 / E2 80 A9 (the "\u2028" / "\u2029" entries of
    // pseudoTokenHandlers).
    // NOTE(review): the length/index checks on `peek` depend on how many bytes
    // LexerRange.peek(n) returns and whether element 0 is the current byte;
    // that helper is not visible here - confirm the E2 branch can match.
    bool isWhitespace()
    {
        switch (range.bytes[range.index])
        {
        case ' ':
        case '\r':
        case '\n':
        case '\t':
        case '\v':
        case '\f':
            return true;
        case 0xe2:
            auto peek = range.peek(2);
            return peek.length == 2
                && peek[0] == 0x80
                && (peek[1] == 0xa8 || peek[1] == 0xa9);
        default:
            return false;
        }
    }

    // Consumes one character, calling range.incrementLine() when that
    // character is a line terminator ("\r", "\n", "\r\n", or E2 80 A8/A9).
    // NOTE(review): as in isWhitespace, the `lookahead.length == 3` test
    // depends on LexerRange.peek semantics - confirm.
    void popFrontWhitespaceAware()
    {
        switch (range.bytes[range.index])
        {
        case '\r':
            range.popFront();
            // "\r\n" counts as a single line terminator
            if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n')
            {
                range.popFront();
                range.incrementLine();
            }
            else
                range.incrementLine();
            return;
        case '\n':
            range.popFront();
            range.incrementLine();
            return;
        case 0xe2:
            auto lookahead = range.peek(3);
            if (lookahead.length == 3 && lookahead[1] == 0x80
                && (lookahead[2] == 0xa8 || lookahead[2] == 0xa9))
            {
                range.index+=3;
                range.column+=3;
                range.incrementLine();
                return;
            }
            else
            {
                range.popFront();
                return;
            }
        default:
            range.popFront();
            return;
        }
    }

    // Lexes a run of whitespace into a single `whitespace` token. The token
    // text is interned only when whitespace is configured to be included;
    // otherwise it is the empty string.
    void lexWhitespace(ref Token token) @trusted
    {
        mixin (tokenStart);
        loop: do
        {
            version (iasm64NotWindows)
            {
                // Fast path, only taken when more than 16 bytes remain.
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    skip!(true, '\t', ' ', '\v', '\f')(range.bytes.ptr + range.index,
                        &range.index, &range.column);
                }
            }
            switch (range.bytes[range.index])
            {
            case '\r':
                range.popFront();
                if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n')
                {
                    range.popFront();
                }
                range.column = 1;
                range.line += 1;
                break;
            case '\n':
                range.popFront();
                range.column = 1;
                range.line += 1;
                break;
            case ' ':
            case '\t':
            case '\v':
            case '\f':
                range.popFront();
                break;
            case 0xe2:
                // Only E2 80 A8 and E2 80 A9 are whitespace; anything else
                // starting with E2 ends the run.
                if (range.index + 2 >= range.bytes.length)
                    break loop;
                if (range.bytes[range.index + 1] != 0x80)
                    break loop;
                if (range.bytes[range.index + 2] == 0xa8 || range.bytes[range.index + 2] == 0xa9)
                {
                    range.index += 3;
                    range.column += 3;
                    range.column = 1;
                    range.line += 1;
                    break;
                }
                break loop;
            default:
                break loop;
            }
        } while (!(range.index >= range.bytes.length));
    end:
        string text = config.whitespaceBehavior == WhitespaceBehavior.include
            ? cache.intern(range.slice(mark)) : "";
        token = Token(tok!"whitespace", text, line, column, index);
    }

    // Entry point for literals starting with '0': dispatches on an 0x/0X or
    // 0b/0B prefix, otherwise lexes a decimal literal.
    void lexNumber(ref Token token)
    {
        mixin (tokenStart);
        if (range.bytes[range.index] == '0' && range.index + 1 < range.bytes.length)
        {
            auto ahead = range.bytes[range.index + 1];
            switch (ahead)
            {
            case 'x':
            case 'X':
                // skip the two prefix bytes before lexing the digits
                range.index += 2;
                range.column += 2;
                lexHex(token, mark, line, column, index);
                return;
            case 'b':
            case 'B':
                range.index += 2;
                range.column += 2;
                lexBinary(token, mark, line, column, index);
                return;
            default:
                lexDecimal(token, mark, line, column, index);
                return;
            }
        }
        else
            lexDecimal(token, mark, line, column, index);
    }

    // Convenience overload: records the token start, then lexes hex digits.
    void lexHex(ref Token token)
    {
        mixin (tokenStart);
        lexHex(token, mark, line, column, index);
    }

    // Lexes the digits, optional fraction, exponent and suffix of a
    // hexadecimal literal whose start position was recorded by the caller.
    void lexHex(ref Token token, size_t mark, size_t line, size_t column,
        size_t index) @trusted
    {
        IdType type = tok!"intLiteral";
        bool foundDot;
        hexLoop: while (!(range.index >= range.bytes.length))
        {
            switch (range.bytes[range.index])
            {
            case 'a': .. case 'f':
            case 'A': .. case 'F':
            case '0': ..
case '9':
            case '_':
                version (iasm64NotWindows)
                {
                    // Fast path, only taken when more than 16 bytes remain.
                    if (haveSSE42 && range.index + 16 < range.bytes.length)
                    {
                        immutable ulong i = rangeMatch!(false, '0', '9', 'a', 'f', 'A', 'F', '_', '_')
                            (range.bytes.ptr + range.index);
                        range.column += i;
                        range.index += i;
                    }
                    else
                        range.popFront();
                }
                else
                    range.popFront();
                break;
            case 'u':
            case 'U':
                lexIntSuffix(type);
                break hexLoop;
            case 'i':
                if (foundDot)
                    lexFloatSuffix(type);
                break hexLoop;
            case 'L':
                // 'L' is a floating point suffix after a '.', otherwise an
                // integer suffix
                if (foundDot)
                    lexFloatSuffix(type);
                else
                    lexIntSuffix(type);
                break hexLoop;
            case 'p':
            case 'P':
                lexExponent(type);
                break hexLoop;
            case '.':
                // A second '.', a '.' at the end of input, or ".." ends the literal.
                if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.')
                    break hexLoop;
                else
                {
                    // The following bit of silliness tries to tell the
                    // difference between "int dot identifier" and
                    // "double identifier".
                    if ((range.index + 1 < range.bytes.length))
                    {
                        switch (range.peekAt(1))
                        {
                        case '0': .. case '9':
                        case 'A': .. case 'F':
                        case 'a': .. case 'f':
                            goto doubleLiteral;
                        default:
                            break hexLoop;
                        }
                    }
                    else
                    {
                    // reached only through the goto above
                    doubleLiteral:
                        range.popFront();
                        foundDot = true;
                        type = tok!"doubleLiteral";
                    }
                }
                break;
            default:
                break hexLoop;
            }
        }
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    // Convenience overload: records the token start, then lexes binary digits.
    void lexBinary(ref Token token)
    {
        mixin (tokenStart);
        return lexBinary(token, mark, line, column, index);
    }

    // Lexes the digits and optional integer suffix of a binary literal whose
    // start position was recorded by the caller.
    void lexBinary(ref Token token, size_t mark, size_t line, size_t column,
        size_t index) @trusted
    {
        IdType type = tok!"intLiteral";
        binaryLoop: while (!(range.index >= range.bytes.length))
        {
            switch (range.bytes[range.index])
            {
            case '0':
            case '1':
            case '_':
                version (iasm64NotWindows)
                {
                    // Fast path, only taken when more than 16 bytes remain.
                    if (haveSSE42 && range.index + 16 < range.bytes.length)
                    {
                        immutable ulong i = rangeMatch!(false, '0', '1', '_', '_')(
                            range.bytes.ptr + range.index);
                        range.column += i;
                        range.index += i;
                    }
                    else
                        range.popFront();
                }
                else
                    range.popFront();
                break;
            case 'u':
            case 'U':
            case 'L':
                lexIntSuffix(type);
                break binaryLoop;
            default:
                break binaryLoop;
            }
        }
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    // Convenience overload: records the token start, then lexes a decimal literal.
    void lexDecimal(ref Token token)
    {
        mixin (tokenStart);
        lexDecimal(token, mark, line, column, index);
    }

    // Lexes a decimal integer or floating point literal whose start position
    // was recorded by the caller. A leading '.' makes it a double from the
    // start.
    void lexDecimal(ref Token token, size_t mark, size_t line, size_t column,
        size_t index) @trusted
    {
        bool foundDot = range.bytes[range.index] == '.';
        IdType type = tok!"intLiteral";
        if (foundDot)
        {
            range.popFront();
            type = tok!"doubleLiteral";
        }

        decimalLoop: while (!(range.index >= range.bytes.length))
        {
            switch (range.bytes[range.index])
            {
            case '0': .. case '9':
            case '_':
                version (iasm64NotWindows)
                {
                    // Fast path, only taken when more than 16 bytes remain.
                    if (haveSSE42 && range.index + 16 < range.bytes.length)
                    {
                        ulong i = rangeMatch!(false, '0', '9', '_', '_')(range.bytes.ptr + range.index);
                        range.column += i;
                        range.index += i;
                    }
                    else
                        range.popFront();
                }
                else
                    range.popFront();
                break;
            case 'u':
            case 'U':
                // an unsigned suffix is only consumed on integer literals
                if (!foundDot)
                    lexIntSuffix(type);
                break decimalLoop;
            case 'i':
                lexFloatSuffix(type);
                break decimalLoop;
            case 'L':
                if (foundDot)
                    lexFloatSuffix(type);
                else
                    lexIntSuffix(type);
                break decimalLoop;
            case 'f':
            case 'F':
                lexFloatSuffix(type);
                break decimalLoop;
            case 'e':
            case 'E':
                lexExponent(type);
                break decimalLoop;
            case '.':
                // A second '.', a '.' at the end of input, or ".." ends the literal.
                if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.')
                    break decimalLoop;
                else
                {
                    // The following bit of silliness tries to tell the
                    // difference between "int dot identifier" and
                    // "double identifier".
if ((range.index + 1 < range.bytes.length))
                    {
                        auto ch = range.peekAt(1);
                        if (ch <= 0x2f
                            || (ch >= '0' && ch <= '9')
                            || (ch >= ':' && ch <= '@')
                            || (ch >= '[' && ch <= '^')
                            || (ch >= '{' && ch <= '~')
                            || ch == '`' || ch == '_')
                        {
                            goto doubleLiteral;
                        }
                        else
                            break decimalLoop;
                    }
                    else
                    {
                    doubleLiteral:
                        range.popFront();
                        foundDot = true;
                        type = tok!"doubleLiteral";
                    }
                }
                break;
            default:
                break decimalLoop;
            }
        }
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /**
     * Consumes an integer suffix at the current position and updates the
     * literal type accordingly.
     *
     * Accepted forms are `u`/`U` optionally followed by `L`/`l`, or `L`/`l`
     * optionally followed by `u`/`U`. Nothing is consumed if the current byte
     * is neither. The caller must ensure the range is not empty.
     *
     * Params:
     *     type = the literal type determined so far; updated in place
     */
    void lexIntSuffix(ref IdType type)
    {
        bool secondPass;
        if (range.bytes[range.index] == 'u' || range.bytes[range.index] == 'U')
        {
    U:
            if (type == tok!"intLiteral")
                type = tok!"uintLiteral";
            else
                type = tok!"ulongLiteral";
            range.popFront();
            if (secondPass)
                return;
            if (range.index < range.bytes.length && (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l'))
                goto L;
            return;
        }
        if (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l')
        {
    L:
            if (type == tok!"uintLiteral")
                type = tok!"ulongLiteral";
            else
                type = tok!"longLiteral";
            range.popFront();
            if (!secondPass && range.index < range.bytes.length && (range.bytes[range.index] == 'U' || range.bytes[range.index] == 'u'))
            {
                secondPass = true;
                goto U;
            }
            return;
        }
    }

    /**
     * Consumes a floating point suffix at the current position and updates
     * the literal type accordingly.
     *
     * `f`/`F` selects `floatLiteral` and `L` selects `realLiteral`. A
     * following `i` turns the type into the matching imaginary literal type
     * (`ifloatLiteral`, `irealLiteral`, or `idoubleLiteral` otherwise). The
     * caller must ensure the range is not empty.
     *
     * Params:
     *     type = the literal type determined so far; updated in place
     */
    void lexFloatSuffix(ref IdType type) pure nothrow @safe
    {
        switch (range.bytes[range.index])
        {
        case 'L':
            range.popFront();
            // `L` on a floating point literal is the real suffix
            type = tok!"realLiteral";
            break;
        case 'f':
        case 'F':
            range.popFront();
            type = tok!"floatLiteral";
            break;
        default:
            break;
        }
        if (!(range.index >= range.bytes.length) && range.bytes[range.index] == 'i')
        {
            warning("Complex number literals are deprecated");
            range.popFront();
            if (type == tok!"floatLiteral")
                type = tok!"ifloatLiteral";
            else if (type == tok!"realLiteral")
                type = tok!"irealLiteral";
            else
                type = tok!"idoubleLiteral";
        }
    }

    /**
     * Consumes the exponent part of a floating point literal. The current
     * byte must be the exponent marker (`e`, `E`, `p` or `P`).
     *
     * A sign is accepted only directly after the marker, so `1e5-3` ends the
     * literal at `1e5`. An error is reported when no exponent digit follows.
     *
     * Params:
     *     type = the literal type determined so far; set to `doubleLiteral`
     *         because a literal with an exponent is never an integer, then
     *         refined by any floating point suffix that follows
     */
    void lexExponent(ref IdType type) pure nothrow @safe
    {
        type = tok!"doubleLiteral";
        range.popFront();
        bool foundSign = false;
        bool foundDigit = false;
        while (!(range.index >= range.bytes.length))
        {
            switch (range.bytes[range.index])
            {
            case '-':
            case '+':
                // After the first digit a sign belongs to the next token.
                if (foundDigit)
                    return;
                if (foundSign)
                {
                    error("Expected an exponent");
                    return;
                }
                foundSign = true;
                range.popFront();
                break;
            case '0': .. case '9':
            case '_':
                foundDigit = true;
                range.popFront();
                break;
            case 'L':
            case 'f':
            case 'F':
            case 'i':
                lexFloatSuffix(type);
                return;
            default:
                if (!foundDigit)
                    error("Expected an exponent");
                return;
            }
        }
        // Input ended right after the marker or the sign.
        if (!foundDigit)
            error("Expected an exponent");
    }

    /// Lexes a `#!` script line: everything up to, but not including, the
    /// next newline becomes one `scriptLine` token.
    void lexScriptLine(ref Token token)
    {
        mixin (tokenStart);
        while (!(range.index >= range.bytes.length) && !isNewline)
        {
            range.popFront();
        }
        token = Token(tok!"scriptLine", cache.intern(range.slice(mark)),
            line, column, index);
    }

    /// Lexes a `#line` special token sequence: everything up to, but not
    /// including, the next newline becomes one `specialTokenSequence` token.
    void lexSpecialTokenSequence(ref Token token)
    {
        mixin (tokenStart);
        while (!(range.index >= range.bytes.length) && !isNewline)
        {
            range.popFront();
        }
        token = Token(tok!"specialTokenSequence", cache.intern(range.slice(mark)),
            line, column, index);
    }

    void lexSlashStarComment(ref Token token) @trusted
    {
        mixin (tokenStart);
        IdType type = tok!"comment";
        range.popFrontN(2);
        while (range.index < range.bytes.length)
        {
            version (iasm64NotWindows)
            {
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                    skip!(false, '\r', '\n', '/', '*', 0xe2)(range.bytes.ptr + range.index,
                        &range.index, &range.column);
            }
            if (range.bytes[range.index] == '*')
            {
                range.popFront();
                if (!(range.index >= range.bytes.length)
&& range.bytes[range.index] == '/')
                {
                    range.popFront();
                    break;
                }
            }
            else
                popFrontWhitespaceAware();
        }
    end:
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    // Lexes a `//` comment. The token ends before the first '\r' or '\n', or
    // at the end of input; the line terminator is not part of the token.
    void lexSlashSlashComment(ref Token token) @trusted
    {
        mixin (tokenStart);
        IdType type = tok!"comment";
        range.popFrontN(2);
        while (range.index < range.bytes.length)
        {
            version (iasm64NotWindows)
            {
                // Fast path, only taken when more than 16 bytes remain.
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    skip!(false, '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
                        &range.index, &range.column);
                }
            }
            if (range.bytes[range.index] == '\r' || range.bytes[range.index] == '\n')
                break;
            range.popFront();
        }
    end:
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    // Lexes a nesting `/+ ... +/` comment. `depth` counts open `/+` markers;
    // the token ends when it returns to zero or the input ends (no error is
    // reported for an unterminated comment).
    void lexSlashPlusComment(ref Token token) @trusted
    {
        mixin (tokenStart);
        IdType type = tok!"comment";
        range.index += 2;
        range.column += 2;
        int depth = 1;
        while (depth > 0 && !(range.index >= range.bytes.length))
        {
            version (iasm64NotWindows)
            {
                // Fast path, only taken when more than 16 bytes remain.
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    skip!(false, '+', '/', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
                        &range.index, &range.column);
                }
            }
            if (range.bytes[range.index] == '+')
            {
                range.popFront();
                if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/')
                {
                    // "+/" closes one nesting level
                    range.popFront();
                    depth--;
                }
            }
            else if (range.bytes[range.index] == '/')
            {
                range.popFront();
                if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '+')
                {
                    // "/+" opens a nested comment
                    range.popFront();
                    depth++;
                }
            }
            else
                popFrontWhitespaceAware();
        }
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    // Lexes a double-quoted string literal including escape sequences and an
    // optional string suffix. Reports an error and yields an empty token if
    // the input ends before the closing quote.
    void lexStringLiteral(ref Token token) @trusted
    {
        mixin (tokenStart);
        range.popFront();
        while (true)
        {
            if ((range.index >= range.bytes.length))
            {
                error("Error: unterminated string literal");
                token = Token(tok!"");
                return;
            }
            version (iasm64NotWindows)
            {
                // Fast path, only taken when more than 16 bytes remain.
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    skip!(false, '"', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
                        &range.index, &range.column);
                }
            }
            if (range.bytes[range.index] == '"')
            {
                range.popFront();
                break;
            }
            else if (range.bytes[range.index] == '\\')
            {
                lexEscapeSequence();
            }
            else
                popFrontWhitespaceAware();
        }
        IdType type = tok!"stringLiteral";
        lexStringSuffix(type);
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    // Lexes a wysiwyg string: either a backtick-delimited string or an
    // r"..." string. No escape sequences are processed in either form.
    void lexWysiwygString(ref Token token) @trusted
    {
        mixin (tokenStart);
        IdType type = tok!"stringLiteral";
        bool backtick = range.bytes[range.index] == '`';
        if (backtick)
        {
            range.popFront();
            while (true)
            {
                if ((range.index >= range.bytes.length))
                {
                    error("Error: unterminated string literal");
                    token = Token(tok!"");
                    return;
                }
                version (iasm64NotWindows)
                {
                    // Fast path, only taken when more than 16 bytes remain.
                    if (haveSSE42 && range.index + 16 < range.bytes.length)
                    {
                        skip!(false, '\r', '\n', 0xe2, '`')(range.bytes.ptr + range.index,
                            &range.index, &range.column);
                    }
                }
                if (range.bytes[range.index] == '`')
                {
                    range.popFront();
                    break;
                }
                else
                    popFrontWhitespaceAware();
            }
        }
        else
        {
            // r"..." form: skip the 'r', then the opening quote
            range.popFront();
            if ((range.index >= range.bytes.length))
            {
                error("Error: unterminated string literal");
                token = Token(tok!"");
                return;
            }
            range.popFront();
            while (true)
            {
                if ((range.index >= range.bytes.length))
                {
                    error("Error: 
unterminated string literal");
                    token = Token(tok!"");
                    return;
                }
                else if (range.bytes[range.index] == '"')
                {
                    range.popFront();
                    break;
                }
                else
                    popFrontWhitespaceAware();
            }
        }
        lexStringSuffix(type);
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /**
     * Consumes an optional string suffix (`c`, `w` or `d`) at the current
     * position.
     *
     * Params:
     *     type = set to `wstringLiteral` for `w`, `dstringLiteral` for `d`,
     *         and `stringLiteral` in every other case
     * Returns: the suffix byte that was consumed, or 0 if there was none.
     */
    private ubyte lexStringSuffix(ref IdType type) pure nothrow @safe
    {
        type = tok!"stringLiteral";
        if (range.index >= range.bytes.length)
            return 0;
        immutable ubyte suffix = range.bytes[range.index];
        switch (suffix)
        {
        case 'w':
            type = tok!"wstringLiteral";
            break;
        case 'd':
            type = tok!"dstringLiteral";
            break;
        case 'c':
            break;
        default:
            return 0;
        }
        range.popFront();
        return suffix;
    }

    /**
     * Lexes a delimited string literal (`q"..."`). Bracket-style delimiters
     * (`<`, `{`, `[`, `(`) are handled by lexNormalDelimitedString, anything
     * else is treated as a heredoc identifier.
     *
     * Reports an error and yields an empty token if the input ends right
     * after `q"`.
     */
    void lexDelimitedString(ref Token token)
    {
        mixin (tokenStart);
        range.index += 2;
        range.column += 2;
        if (range.index >= range.bytes.length)
        {
            error("Error: unterminated delimited string literal");
            token = Token(tok!"");
            return;
        }
        immutable ubyte open = range.bytes[range.index];
        ubyte close;
        switch (open)
        {
        case '<':
            close = '>';
            break;
        case '{':
            close = '}';
            break;
        case '[':
            close = ']';
            break;
        case '(':
            close = ')';
            break;
        default:
            lexHeredocString(token, mark, line, column, index);
            return;
        }
        range.popFront();
        lexNormalDelimitedString(token, mark, line, column, index, open, close);
    }

    /**
     * Lexes the body of a bracket-delimited string after the opening
     * delimiter has been consumed. Nested delimiter pairs are balanced; the
     * outermost closing delimiter must be followed directly by `"`.
     *
     * Reports an error and yields an empty token if the closing quote is
     * missing or the input ends before the delimiters are balanced.
     *
     * Params:
     *     token = receives the resulting token
     *     mark = start mark of the literal, as recorded by the caller
     *     line = line of the literal's first character
     *     column = column of the literal's first character
     *     index = byte index of the literal's first character
     *     open = the opening delimiter byte
     *     close = the matching closing delimiter byte
     */
    void lexNormalDelimitedString(ref Token token, size_t mark, size_t line, size_t column,
        size_t index, ubyte open, ubyte close)
    {
        int depth = 1;
        while (!(range.index >= range.bytes.length) && depth > 0)
        {
            if (range.bytes[range.index] == open)
            {
                depth++;
                range.popFront();
            }
            else if (range.bytes[range.index] == close)
            {
                depth--;
                range.popFront();
                if (depth <= 0)
                {
                    // The input may end right after the closing delimiter,
                    // so check bounds before looking for the quote.
                    if (range.index < range.bytes.length && range.bytes[range.index] == '"')
                    {
                        range.popFront();
                    }
                    else
                    {
                        error("Error: \" expected to end delimited string literal");
                        token = Token(tok!"");
                        return;
                    }
                }
            }
            else
                popFrontWhitespaceAware();
        }
        if (depth > 0)
        {
            // Ran out of input with delimiters still open.
            error("Error: unterminated delimited string literal");
            token = Token(tok!"");
            return;
        }
        IdType type = tok!"stringLiteral";
        lexStringSuffix(type);
        token = Token(type, cache.intern(range.slice(mark)), line, column, index);
    }

    /**
     * Lexes a heredoc-style delimited string: an identifier, a newline, the
     * string body, then the same identifier at the start of a line followed
     * by `"`.
     *
     * NOTE(review): if lexIdentifier can yield an empty identifier,
     * `ident.text.length - 1` wraps around - confirm against lexIdentifier.
     *
     * Params:
     *     token = receives the resulting token
     *     mark = start mark of the literal, as recorded by the caller
     *     line = line of the literal's first character
     *     column = column of the literal's first character
     *     index = byte index of the literal's first character
     */
    void lexHeredocString(ref Token token, size_t mark, size_t line, size_t column, size_t index)
    {
        Token ident;
        lexIdentifier(ident);
        if (isNewline())
            popFrontWhitespaceAware();
        else
            error("Newline expected");
        while (!(range.index >= range.bytes.length))
        {
            if (isNewline())
            {
                popFrontWhitespaceAware();
                if (!range.canPeek(ident.text.length))
                {
                    error(ident.text ~ " expected");
                    break;
                }
                // The terminator is the identifier at the start of a line.
                if (range.peek(ident.text.length - 1) == ident.text)
                {
                    range.popFrontN(ident.text.length);
                    break;
                }
            }
            else
            {
                range.popFront();
            }
        }
        if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '"')
        {
            range.popFront();
        }
        else
            error(`" expected`);
        IdType type = tok!"stringLiteral";
        lexStringSuffix(type);
        token = Token(type, cache.intern(range.slice(mark)), line, column, index);
    }

    void lexTokenString(ref Token token)
    {
        mixin (tokenStart);
        assert (range.bytes[range.index] == 'q');
        range.popFront();
        assert (range.bytes[range.index] == '{');
        range.popFront();
        auto app = appender!string();
        app.put("q{");
int depth = 1; 1434 1435 immutable WhitespaceBehavior oldWhitespace = config.whitespaceBehavior; 1436 immutable StringBehavior oldString = config.stringBehavior; 1437 config.whitespaceBehavior = WhitespaceBehavior.include; 1438 config.stringBehavior = StringBehavior.source; 1439 scope (exit) 1440 { 1441 config.whitespaceBehavior = oldWhitespace; 1442 config.stringBehavior = oldString; 1443 } 1444 1445 advance(_front); 1446 while (depth > 0 && !empty) 1447 { 1448 auto t = front(); 1449 if (t.text is null) 1450 app.put(str(t.type)); 1451 else 1452 app.put(t.text); 1453 if (t.type == tok!"}") 1454 { 1455 depth--; 1456 if (depth > 0) 1457 popFront(); 1458 } 1459 else if (t.type == tok!"{") 1460 { 1461 depth++; 1462 popFront(); 1463 } 1464 else 1465 popFront(); 1466 } 1467 IdType type = tok!"stringLiteral"; 1468 auto b = lexStringSuffix(type); 1469 if (b != 0) 1470 app.put(b); 1471 token = Token(type, cache.intern(cast(const(ubyte)[]) app.data), line, 1472 column, index); 1473 } 1474 1475 void lexHexString(ref Token token) 1476 { 1477 mixin (tokenStart); 1478 range.index += 2; 1479 range.column += 2; 1480 1481 loop: while (true) 1482 { 1483 if ((range.index >= range.bytes.length)) 1484 { 1485 error("Error: unterminated hex string literal"); 1486 token = Token(tok!""); 1487 return; 1488 } 1489 else if (isWhitespace()) 1490 popFrontWhitespaceAware(); 1491 else switch (range.bytes[range.index]) 1492 { 1493 case '0': .. case '9': 1494 case 'A': .. case 'F': 1495 case 'a': .. 
case 'f': 1496 range.popFront(); 1497 break; 1498 case '"': 1499 range.popFront(); 1500 break loop; 1501 default: 1502 error("Error: invalid character in hex string"); 1503 token = Token(tok!""); 1504 return; 1505 } 1506 } 1507 1508 IdType type = tok!"stringLiteral"; 1509 lexStringSuffix(type); 1510 token = Token(type, cache.intern(range.slice(mark)), line, column, 1511 index); 1512 } 1513 1514 bool lexEscapeSequence() 1515 { 1516 range.popFront(); 1517 if ((range.index >= range.bytes.length)) 1518 { 1519 error("Error: non-terminated character escape sequence."); 1520 return false; 1521 } 1522 switch (range.bytes[range.index]) 1523 { 1524 case '\'': 1525 case '"': 1526 case '?': 1527 case '\\': 1528 case 'a': 1529 case 'b': 1530 case 'f': 1531 case 'n': 1532 case 'r': 1533 case 't': 1534 case 'v': 1535 range.popFront(); 1536 break; 1537 case 'x': 1538 range.popFront(); 1539 foreach (i; 0 .. 2) 1540 { 1541 if ((range.index >= range.bytes.length)) 1542 { 1543 error("Error: 2 hex digits expected."); 1544 return false; 1545 } 1546 switch (range.bytes[range.index]) 1547 { 1548 case '0': .. case '9': 1549 case 'a': .. case 'f': 1550 case 'A': .. case 'F': 1551 range.popFront(); 1552 break; 1553 default: 1554 error("Error: 2 hex digits expected."); 1555 return false; 1556 } 1557 } 1558 break; 1559 case '0': 1560 if (!(range.index + 1 < range.bytes.length) || ((range.index + 1 < range.bytes.length) && range.peekAt(1) == '\'')) 1561 { 1562 range.popFront(); 1563 break; 1564 } 1565 goto case; 1566 case '1': .. case '7': 1567 for (size_t i = 0; i < 3 && !(range.index >= range.bytes.length) && range.bytes[range.index] >= '0' && range.bytes[range.index] <= '7'; i++) 1568 range.popFront(); 1569 break; 1570 case 'u': 1571 range.popFront(); 1572 foreach (i; 0 .. 4) 1573 { 1574 if ((range.index >= range.bytes.length)) 1575 { 1576 error("Error: at least 4 hex digits expected."); 1577 return false; 1578 } 1579 switch (range.bytes[range.index]) 1580 { 1581 case '0': .. 
case '9': 1582 case 'a': .. case 'f': 1583 case 'A': .. case 'F': 1584 range.popFront(); 1585 break; 1586 default: 1587 error("Error: at least 4 hex digits expected."); 1588 return false; 1589 } 1590 } 1591 break; 1592 case 'U': 1593 range.popFront(); 1594 foreach (i; 0 .. 8) 1595 { 1596 if ((range.index >= range.bytes.length)) 1597 { 1598 error("Error: at least 8 hex digits expected."); 1599 return false; 1600 } 1601 switch (range.bytes[range.index]) 1602 { 1603 case '0': .. case '9': 1604 case 'a': .. case 'f': 1605 case 'A': .. case 'F': 1606 range.popFront(); 1607 break; 1608 default: 1609 error("Error: at least 8 hex digits expected."); 1610 return false; 1611 } 1612 } 1613 break; 1614 default: 1615 while (true) 1616 { 1617 if ((range.index >= range.bytes.length)) 1618 { 1619 error("Error: non-terminated character escape sequence."); 1620 return false; 1621 } 1622 if (range.bytes[range.index] == ';') 1623 { 1624 range.popFront(); 1625 break; 1626 } 1627 else 1628 { 1629 range.popFront(); 1630 } 1631 } 1632 } 1633 return true; 1634 } 1635 1636 void lexCharacterLiteral(ref Token token) 1637 { 1638 mixin (tokenStart); 1639 range.popFront(); 1640 if (range.bytes[range.index] == '\\') 1641 { 1642 lexEscapeSequence(); 1643 goto close; 1644 } 1645 else if (range.bytes[range.index] == '\'') 1646 { 1647 range.popFront(); 1648 token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)), 1649 line, column, index); 1650 } 1651 else if (range.bytes[range.index] & 0x80) 1652 { 1653 while (range.bytes[range.index] & 0x80) 1654 { 1655 range.popFront(); 1656 } 1657 goto close; 1658 } 1659 else 1660 { 1661 popFrontWhitespaceAware(); 1662 goto close; 1663 } 1664 close: 1665 if (range.bytes[range.index] == '\'') 1666 { 1667 range.popFront(); 1668 token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)), 1669 line, column, index); 1670 } 1671 else 1672 { 1673 error("Error: Expected ' to end character literal"); 1674 token = Token(tok!""); 1675 } 1676 } 1677 
/**
 * Lexes an identifier starting at the current position.
 *
 * If the first byte is already a separator an "Invalid identifier" error is
 * recorded and that byte is consumed anyway. Bytes are then consumed until
 * isSeparating(0) reports a separator.
 *
 * Params:
 *     token = receives the identifier token
 */
void lexIdentifier(ref Token token) @trusted
{
    mixin (tokenStart);
    if (isSeparating(0))
    {
        error("Invalid identifier");
        range.popFront();
    }
    while (true)
    {
        version (iasm64NotWindows)
        {
            // Fast path: with more than 16 bytes of input left, let the
            // SSE4.2 helper count how many of the next bytes fall in
            // a-z, A-Z or '_' and skip them in one step.
            if (haveSSE42 && range.index + 16 < range.bytes.length)
            {
                immutable ulong i = rangeMatch!(false, 'a', 'z', 'A', 'Z', '_', '_')
                    (range.bytes.ptr + range.index);
                range.column += i;
                range.index += i;
            }
        }
        if (isSeparating(0))
            break;
        else
            range.popFront();
    }
    token = Token(tok!"identifier", cache.intern(range.slice(mark)), line,
        column, index);
}

/**
 * Lexes a token that starts with '.': one of ".", ".." and "...", or a
 * number when the dot is directly followed by a decimal digit.
 *
 * Params:
 *     token = receives the resulting token
 */
void lexDot(ref Token token)
{
    mixin (tokenStart);
    // The dot is the last byte of the input.
    if (!(range.index + 1 < range.bytes.length))
    {
        range.popFront();
        token = Token(tok!".", null, line, column, index);
        return;
    }
    switch (range.peekAt(1))
    {
    case '0': .. case '9':
        lexNumber(token);
        return;
    case '.':
        range.popFront();
        range.popFront();
        if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '.')
        {
            range.popFront();
            token = Token(tok!"...", null, line, column, index);
        }
        else
            token = Token(tok!"..", null, line, column, index);
        return;
    default:
        range.popFront();
        token = Token(tok!".", null, line, column, index);
        return;
    }
}

/**
 * Lexes a three-byte newline sequence (registered for "\u2028" and
 * "\u2029") as a whitespace token and advances the line counter.
 *
 * Params:
 *     token = receives the whitespace token; its text is empty unless the
 *         configuration asks for whitespace to be included
 */
void lexLongNewline(ref Token token) @nogc
{
    mixin (tokenStart);
    range.popFront();
    range.popFront();
    range.popFront();
    range.incrementLine();
    string text = config.whitespaceBehavior == WhitespaceBehavior.include
        ? cache.intern(range.slice(mark)) : "";
    token = Token(tok!"whitespace", text, line,
        column, index);
}

/**
 * Returns: $(D true) if the current position holds '\n', '\r', or the
 *     three-byte encoding of U+2028 / U+2029.
 *
 * The current byte is read without a bounds check, so the caller must make
 * sure the input is not exhausted.
 */
bool isNewline() @nogc
{
    if (range.bytes[range.index] == '\n') return true;
    if (range.bytes[range.index] == '\r') return true;
    return (range.bytes[range.index] & 0x80) && (range.index + 2 < range.bytes.length)
        && (range.peek(2) == "\u2028" || range.peek(2) == "\u2029");
}

/**
 * Tells whether the byte $(D offset) bytes ahead of the current position
 * separates tokens, i.e. cannot be part of an identifier.
 *
 * Params:
 *     offset = distance from the current position of the byte to test
 *
 * Returns: $(D true) at or past the end of the input and for every byte
 *     marked $(D y) in the table; $(D false) for bytes marked $(D n)
 *     (digits, ASCII letters and '_'). Bytes with the high bit set
 *     ($(D m)) separate only if they start U+2028 or U+2029.
 */
bool isSeparating(size_t offset) @nogc
{
    enum : ubyte
    {
        n, y, m // no, yes, maybe
    }

    if (range.index + offset >= range.bytes.length)
        return true;
    auto c = range.bytes[range.index + offset];
    static immutable ubyte[256] LOOKUP_TABLE = [
        y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
        y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
        y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
        n, n, n, n, n, n, n, n, n, n, y, y, y, y, y, y,
        y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n,
        n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, n,
        y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n,
        n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, y,
        m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
        m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
        m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
        m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
        m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
        m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
        m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
        m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m
    ];
    immutable ubyte result = LOOKUP_TABLE[c];
    if (result == n)
        return false;
    if (result == y)
        return true;
    if (result == m)
    {
        // Look at a copy of the range moved to `offset`. The lexer's own
        // position must not change: this function is only a predicate.
        auto r = range;
        r.popFrontN(offset);
        return (r.canPeek(2) && (r.peek(2) == "\u2028"
            || r.peek(2) == "\u2029"));
    }
    assert (false);
}



/// Mixed into the lexing functions to record where the current token starts.
enum tokenStart = q{
    size_t index = range.index;
    size_t column = range.column;
    size_t line = range.line;
    auto mark = range.mark();
};

/// Records an error message at the current line and column.
void error(string message)
{
    messages ~= Message(range.line, range.column, message, true);
}

/// Records a warning message at the current line and column.
void warning(string message)
{
    messages ~= Message(range.line, range.column, message, false);
    assert (messages.length > 0);
}

/// A diagnostic produced while lexing.
static struct Message
{
    /// Line at which the message was recorded
    size_t line;
    /// Column at which the message was recorded
    size_t column;
    /// Text of the message
    string message;
    /// true for errors, false for warnings
    bool isError;
}

/// Diagnostics recorded so far, in the order they were produced
Message[] messages;
/// Cache used to intern token text
StringCache* cache;
/// Lexer configuration
LexerConfig config;
/// Gates the SSE4.2 fast path in lexIdentifier
bool haveSSE42;
} // closes the enclosing lexer struct (DLexer), whose header precedes this section

/**
 * Creates a token range from the given source code. Creates a default lexer
 * configuration and a GC-managed string cache.
 */
public auto byToken(ubyte[] range)
{
    LexerConfig config;
    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
    return DLexer(range, config, cache);
}

/**
 * Creates a token range from the given source code. Uses the given string
 * cache.
 */
public auto byToken(ubyte[] range, StringCache* cache)
{
    LexerConfig config;
    return DLexer(range, config, cache);
}

/**
 * Creates a token range from the given source code. Uses the provided lexer
 * configuration and string cache.
 */
public auto byToken(ubyte[] range, const LexerConfig config, StringCache* cache)
{
    return DLexer(range, config, cache);
}

/**
 * Removes "decoration" such as leading whitespace, leading + and * characters,
 * and places the result into the given output range.
 *
 * Comments that do not start with "///", "/++" or "/**" are copied to the
 * output unchanged.
 *
 * Params:
 *     comment = the comment text including its delimiters; must be at least
 *         three characters long
 *     outputRange = the output range that receives the undecorated text
 */
public void unDecorateComment(T)(string comment, auto ref T outputRange)
    if (isOutputRange!(T, string))
in
{
    assert (comment.length >= 3);
}
body
{
    switch (comment[0 .. 3])
    {
    case "///":
        size_t i = 3;
        if (i < comment.length)
        {
        again:
            // Skip the blanks that follow the slashes.
            while (i < comment.length && (comment[i] == ' ' || comment[i] == '\t'))
                i++;
            size_t j = i + 1;
            while (j < comment.length)
            {
                if (comment[j] == '\r')
                    j++;
                if (j >= comment.length)
                    break;
                if (comment[j] == '\n')
                {
                    // Emit the line, then skip the slashes that open the next one.
                    outputRange.put(comment[i .. j]);
                    j++;
                    while (j < comment.length && comment[j] == '/')
                        j++;
                    outputRange.put('\n');
                    i = j;
                    goto again;
                }
                j++;
            }
            if (i < comment.length && j <= comment.length)
                outputRange.put(comment[i .. j]);
        }
        break;
    case "/++":
    case "/**":
        size_t i = 3;
        // The decoration character of this comment kind: '*' or '+'.
        immutable char c = comment[1];
        // Skip leading * and + characters
        while (comment[i] == c) i++;
        // Skip trailing * and + characters
        size_t j = comment.length - 2;
        while (j > i && comment[j] == c)
            j--;
        while (j > i && (comment[j] == ' ' || comment[j] == '\t'))
            j--;
        j++;
        // Emit the remainder of the first line.
        size_t k = i;
        while (k < j)
        {
            if (comment[k] == '\n')
            {
                k++;
                break;
            }
            k++;
        }
        outputRange.put(comment[i .. k]);
        i = k;
        if (comment[i] == '\r') i++;
        if (comment[i] == '\n') i++;
        // The second line decides whether later lines start with the
        // decoration character and how much whitespace follows it.
        while (comment[i] == ' ' || comment[i] == '\t') i++;
        immutable bool skipBeginningChar = comment[i] == c;
        if (skipBeginningChar)
            i++;
        size_t whitespaceToSkip;
        while (comment[i] == ' ' || comment[i] == '\t')
        {
            whitespaceToSkip++;
            i++;
        }
        size_t l = i;
        while (i < j)
        {
            if (comment[i++] == '\n')
                break;
        }
        outputRange.put(comment[l .. i]);
        // Remaining lines: strip the same decoration, then emit each line.
        while (true)
        {
            if (skipBeginningChar)
            {
                while (i < j && (comment[i] == ' ' || comment[i] == '\t')) i++;
                if (i < j && comment[i] == c) i++;
            }
            for (size_t s = 0; (i < j) && (s < whitespaceToSkip)
                && (comment[i] == ' ' || comment[i] == '\t');)
            {
                s++;
                i++;
            }
            k = i;
            inner: while (k < j)
            {
                if (comment[k] == '\n')
                {
                    k++;
                    break inner;
                }
                k++;
            }
            outputRange.put(comment[i .. k]);
            i = k;
            if (i >= j)
                break;
        }
        break;
    default:
        outputRange.put(comment);
        break;
    }
}


/**
 * The string cache is used for string interning.
 *
 * It will only store a single copy of any string that it is asked to hold.
 * Interned strings can be compared for equality by comparing their $(B .ptr)
 * field.
 *
 * Default and postblit constructors are disabled. Note that the destructor
 * is currently wrapped in $(D version(none)) and therefore never compiled,
 * so the memory held by a cache is not released when it goes out of scope.
 *
 * See_also: $(LINK http://en.wikipedia.org/wiki/String_interning)
 */
struct StringCache
{
public pure nothrow @nogc:

    @disable this();
    @disable this(this);

    /**
     * Params: bucketCount = the initial number of buckets. Must be a
     * power of two
     */
    this(size_t bucketCount) nothrow @trusted @nogc
    in
    {
        import core.bitop : popcnt;
        // A power of two has exactly one bit set.
        static if (size_t.sizeof == 8)
        {
            immutable low = popcnt(cast(uint) bucketCount);
            immutable high = popcnt(cast(uint) (bucketCount >> 32));
            assert ((low == 0 && high == 1) || (low == 1 && high == 0));
        }
        else
        {
            static assert (size_t.sizeof == 4);
            assert (popcnt(cast(uint) bucketCount) == 1);
        }
    }
    body
    {
        buckets = (cast(Node**) calloc((Node*).sizeof, bucketCount))[0 .. bucketCount];
    }

    // Disabled: version(none) is never satisfied, so this destructor is not
    // compiled.
    version(none)
    ~this()
    {
        Block* current = rootBlock;
        while (current !is null)
        {
            Block* prev = current;
            current = current.next;
            free(cast(void*) prev);
        }
        foreach (nodePointer; buckets)
        {
            Node* currentNode = nodePointer;
            while (currentNode !is null)
            {
                if (currentNode.mallocated)
                    free(currentNode.str.ptr);
                Node* prev = currentNode;
                currentNode = currentNode.next;
                free(prev);
            }
        }
        rootBlock = null;
        free(buckets.ptr);
        buckets = null;
    }

    /**
     * Caches a string.
     *
     * Returns: the interned copy of $(D str), or the empty string if
     *     $(D str) is null or empty.
     */
    string intern(const(ubyte)[] str) @safe
    {
        if (str is null || str.length == 0)
            return "";
        return _intern(str);
    }

    /**
     * ditto
     */
    string intern(string str) @trusted
    {
        return intern(cast(ubyte[]) str);
    }

    /**
     * The default bucket count for the string cache.
     */
    static enum defaultBucketCount = 4096;

private:

    /**
     * Returns the cached copy of $(D bytes), copying it into cache-owned
     * memory and linking a new node into its bucket if it is not cached yet.
     */
    string _intern(const(ubyte)[] bytes) @trusted
    {
        immutable uint hash = hashBytes(bytes);
        // buckets.length is a power of two, so masking selects the bucket.
        immutable size_t index = hash & (buckets.length - 1);
        Node* s = find(bytes, hash);
        if (s !is null)
            return cast(string) s.str;
        ubyte[] mem = void;
        // Big strings get their own allocation instead of block space.
        bool mallocated = bytes.length > BIG_STRING;
        if (mallocated)
            mem = (cast(ubyte*) malloc(bytes.length))[0 .. bytes.length];
        else
            mem = allocate(bytes.length);
        mem[] = bytes[];
        Node* node = cast(Node*) malloc(Node.sizeof);
        node.str = mem;
        node.hash = hash;
        node.next = buckets[index];
        node.mallocated = mallocated;
        buckets[index] = node;
        return cast(string) mem;
    }

    /**
     * Searches the bucket selected by $(D hash) for a node holding
     * $(D bytes).
     *
     * Returns: the matching node, or $(D null) if there is none.
     */
    Node* find(const(ubyte)[] bytes, uint hash) @trusted
    {
        immutable size_t index = hash & (buckets.length - 1);
        Node* node = buckets[index];
        while (node !is null)
        {
            if (node.hash == hash && bytes == cast(ubyte[]) node.str)
                return node;
            node = node.next;
        }
        return node;
    }

    /**
     * Computes a 32-bit hash of $(D data), which must be non-null and
     * non-empty. Input is mixed four bytes at a time, followed by the
     * remaining one to three bytes and a final avalanche step.
     */
    static uint hashBytes(const(ubyte)[] data) pure nothrow @trusted @nogc
    in
    {
        assert (data !is null);
        assert (data.length > 0);
    }
    body
    {
        immutable uint m = 0x5bd1e995;
        immutable int r = 24;
        uint h = cast(uint) data.length;
        while (data.length >= 4)
        {
            uint k = (cast(ubyte) data[3]) << 24
                | (cast(ubyte) data[2]) << 16
                | (cast(ubyte) data[1]) << 8
                | (cast(ubyte) data[0]);
            k *= m;
            k ^= k >> r;
            k *= m;
            h *= m;
            h ^= k;
            data = data[4 .. $];
        }
        switch (data.length & 3)
        {
        case 3:
            h ^= data[2] << 16;
            goto case;
        case 2:
            h ^= data[1] << 8;
            goto case;
        case 1:
            h ^= data[0];
            h *= m;
            break;
        default:
            break;
        }
        h ^= h >> 13;
        h *= m;
        h ^= h >> 15;
        return h;
    }

    /**
     * Hands out $(D numBytes) bytes of block storage.
     *
     * At most the first four blocks of the list are checked for free space;
     * if none has room, a new block is allocated and becomes the list head.
     */
    ubyte[] allocate(size_t numBytes) pure nothrow @trusted @nogc
    in
    {
        assert (numBytes != 0);
    }
    out (result)
    {
        assert (result.length == numBytes);
    }
    body
    {
        Block* r = rootBlock;
        size_t i = 0;
        while (i <= 3 && r !is null)
        {
            immutable size_t available = r.bytes.length;
            immutable size_t oldUsed = r.used;
            immutable size_t newUsed = oldUsed + numBytes;
            if (newUsed <= available)
            {
                r.used = newUsed;
                return r.bytes[oldUsed .. newUsed];
            }
            i++;
            r = r.next;
        }
        Block* b = cast(Block*) calloc(Block.sizeof, 1);
        b.used = numBytes;
        b.next = rootBlock;
        rootBlock = b;
        return b.bytes[0 .. numBytes];
    }

    /// Hash-chain entry describing one interned string.
    static struct Node
    {
        ubyte[] str = void;
        Node* next = void;
        uint hash = void;
        /// true if str was allocated with malloc instead of from a Block
        bool mallocated = void;
    }

    /// Fixed-size chunk of storage that holds the smaller strings.
    static struct Block
    {
        Block* next;
        size_t used;
        enum BLOCK_CAPACITY = BLOCK_SIZE - size_t.sizeof - (void*).sizeof;
        ubyte[BLOCK_CAPACITY] bytes;
    }

    static assert (BLOCK_SIZE == Block.sizeof);

    enum BLOCK_SIZE = 1024 * 16;

    // If a string would take up more than 1/4 of a block, allocate it outside
    // of the block.
    enum BIG_STRING = BLOCK_SIZE / 4;

    Node*[] buckets;
    Block* rootBlock;
}

private extern(C) void* calloc(size_t, size_t) nothrow pure @nogc @trusted;
private extern(C) void* malloc(size_t) nothrow pure @nogc @trusted;
private extern(C) void free(void*) nothrow pure @nogc @trusted;

unittest
{
    auto source = cast(ubyte[]) q{ import std.stdio;};
    auto tokens = getTokensForParser(source, LexerConfig(),
        new StringCache(StringCache.defaultBucketCount));
    assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".",
        tok!"identifier", tok!";"]));
}

/// Test \x char sequence
unittest
{
    auto toks = (string s) => byToken(cast(ubyte[])s);

    // valid
    enum hex = ['0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f','A','B','C','D','E','F'];
    auto source = "";
    foreach (h1; hex)
        foreach (h2; hex)
            source ~= "'\\x" ~ h1 ~ h2 ~ "'";
    assert (toks(source).filter!(t => t.type != tok!"characterLiteral").empty);

    // invalid
    assert (toks(`'\x'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true));
    assert (toks(`'\x_'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true));
    assert (toks(`'\xA'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true));
    assert (toks(`'\xAY'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true));
    assert (toks(`'\xXX'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true));
}

version (iasm64NotWindows)
{
    /**
     * Returns: a bit mask computed from the 16 bytes at the given address.
     *     NOTE(review): presumably one bit per byte position that belongs to
     *     a newline sequence (the constants compared against are 0x0d, 0x0a
     *     and the byte patterns e2 80 a8 / e2 80 a9) - confirm before
     *     relying on the exact bit layout.
     */
    ushort newlineMask(const ubyte*) pure nothrow @trusted @nogc
    {
        asm pure nothrow @nogc
        {
            naked;
            movdqu XMM1, [RDI];
            mov RAX, 3;
            mov RDX, 16;
            mov R8, 0x0d0d0d0d0d0d0d0dL;
            movq XMM2, R8;
            shufpd XMM2, XMM2, 0;
            pcmpeqb XMM2, XMM1;
            mov R9, 0x0a0a0a0a0a0a0a0aL;
            movq XMM3, R9;
            shufpd XMM3, XMM3, 0;
            pcmpeqb XMM3, XMM1;
            mov R10, 0xe280a8L;
            movq XMM4, R10;
            pcmpestrm XMM4, XMM1, 0b01001100;
            movdqa XMM4, XMM0;
            mov R11, 0xe280a9L;
            movq XMM5, R11;
            pcmpestrm XMM5, XMM1, 0b01001100;
            movdqa XMM5, XMM0;
            mov RCX, 0x0a0d;
            dec RAX;
            movq XMM6, RCX;
            pcmpestrm XMM6, XMM1, 0b01001100;
            movdqa XMM6, XMM0;
            movdqa XMM7, XMM6;
            pslldq XMM7, 1;
            movdqa XMM0, XMM4;
            por XMM0, XMM5;
            por XMM7, XMM6;
            movdqa XMM1, XMM2;
            por XMM1, XMM3;
            pxor XMM7, XMM1;
            por XMM7, XMM0;
            por XMM7, XMM6;
            pmovmskb RAX, XMM7;
            and RAX, 0b0011_1111_1111_1111;
            ret;
        }
    }

    /**
     * Skips between 0 and 16 bytes that match (or do not match) one of the
     * given $(B chars).
     *
     * The number of bytes skipped is added to both values that the two
     * $(D ulong*) arguments point to.
     */
    void skip(bool matching, chars...)(const ubyte*, ulong*, ulong*) pure nothrow
        @trusted @nogc if (chars.length <= 8)
    {
        enum constant = ByteCombine!chars;
        enum charsLength = chars.length;
        static if (matching)
            enum flags = 0b0001_0000;
        else
            enum flags = 0b0000_0000;
        asm pure nothrow @nogc
        {
            naked;
            movdqu XMM1, [RDX];
            mov R10, constant;
            movq XMM2, R10;
            mov RAX, charsLength;
            mov RDX, 16;
            pcmpestri XMM2, XMM1, flags;
            add [RSI], RCX;
            add [RDI], RCX;
            ret;
        }
    }

    /**
     * Returns: the number of bytes starting at the given location that match
     * (or do not match if $(B invert) is true) the byte ranges in $(B chars).
     *
     * $(B chars) is a list of inclusive range bounds given as pairs, so its
     * length must be even.
     */
    ulong rangeMatch(bool invert, chars...)(const ubyte*) pure nothrow @trusted @nogc
    {
        static assert (chars.length % 2 == 0);
        enum constant = ByteCombine!chars;
        static if (invert)
            enum rangeMatchFlags = 0b0000_0100;
        else
            enum rangeMatchFlags = 0b0001_0100;
        enum charsLength = chars.length;
        asm pure nothrow @nogc
        {
            naked;
            movdqu XMM1, [RDI];
            mov R10, constant;
            movq XMM2, R10;
            mov RAX, charsLength;
            mov RDX, 16;
            pcmpestri XMM2, XMM1, rangeMatchFlags;
            mov RAX, RCX;
            ret;
        }
    }

    /**
     * Packs up to eight byte values into one $(D ulong), with the first
     * value in the lowest byte.
     */
    template ByteCombine(c...)
    {
        static assert (c.length <= 8);
        static if (c.length > 1)
            enum ulong ByteCombine = c[0] | (ByteCombine!(c[1..$]) << 8);
        else
            enum ulong ByteCombine = c[0];
    }
}