// Written in the D programming language

/**
 * $(H2 Summary)
 * This module contains a range-based compile-time _lexer generator.
 *
 * $(H2 Overview)
 * The _lexer generator consists of a template mixin, $(LREF Lexer), along with
 * several helper templates for generating such things as token identifiers.
 *
 * To write a _lexer using this API:
 * $(OL
 *     $(LI Create the string array constants for your language.
 *         $(UL
 *             $(LI $(LINK2 #.staticTokens, staticTokens))
 *             $(LI $(LINK2 #.dynamicTokens, dynamicTokens))
 *             $(LI $(LINK2 #.possibleDefaultTokens, possibleDefaultTokens))
 *             $(LI $(LINK2 #.tokenHandlers, tokenHandlers))
 *         ))
 *     $(LI Create aliases for the various token and token identifier types
 *         specific to your language.
 *         $(UL
 *             $(LI $(LREF TokenIdType))
 *             $(LI $(LREF tokenStringRepresentation))
 *             $(LI $(LREF TokenStructure))
 *             $(LI $(LREF TokenId))
 *         ))
 *     $(LI Create a struct that mixes in the Lexer template mixin and
 *         implements the necessary functions.
 *         $(UL
 *             $(LI $(LREF Lexer))
 *         ))
 * )
 * Examples:
 * $(UL
 * $(LI A _lexer for D is available $(LINK2 https://github.com/Hackerpilot/Dscanner/blob/master/std/d/lexer.d, here).)
 * $(LI A _lexer for Lua is available $(LINK2 https://github.com/Hackerpilot/lexer-demo/blob/master/lualexer.d, here).)
 * $(LI A _lexer for JSON is available $(LINK2 https://github.com/Hackerpilot/lexer-demo/blob/master/jsonlexer.d, here).)
 * )
 * $(DDOC_ANCHOR TemplateParameters) $(H2 Template Parameter Definitions)
 * $(DL
 * $(DT $(DDOC_ANCHOR defaultTokenFunction) $(B defaultTokenFunction))
 * $(DD A function that serves as the default token lexing function. For most
 *     languages this will be the identifier lexing function.)
 * $(DT $(DDOC_ANCHOR tokenSeparatingFunction) $(B tokenSeparatingFunction))
 * $(DD A function that is able to determine if an identifier/keyword has come
 *     to an end. This function must return bool and take a single size_t
 *     argument representing the number of bytes to skip over before looking for
 *     a separating character.)
 * $(DT $(DDOC_ANCHOR staticTokens) $(B staticTokens))
 * $(DD A listing of the tokens whose exact value never changes and which cannot
 *     possibly be a token handled by the default token lexing function. The
 *     most common example of this kind of token is an operator such as
 *     $(D_STRING "*"), or $(D_STRING "-") in a programming language.)
 * $(DT $(DDOC_ANCHOR dynamicTokens) $(B dynamicTokens))
 * $(DD A listing of tokens whose value is variable, such as whitespace,
 *     identifiers, number literals, and string literals.)
 * $(DT $(DDOC_ANCHOR possibleDefaultTokens) $(B possibleDefaultTokens))
 * $(DD A listing of tokens that could possibly be one of the tokens handled by
 *     the default token handling function. A common example of this is
 *     a keyword such as $(D_STRING "for"), which looks like the beginning of
 *     the identifier $(D_STRING "fortunate"). $(B tokenSeparatingFunction) is
 *     called to determine if the character after the $(D_STRING 'r') separates
 *     the identifier, indicating that the token is $(D_STRING "for"), or if
 *     lexing should be turned over to the $(B defaultTokenFunction).)
 * $(DT $(DDOC_ANCHOR tokenHandlers) $(B tokenHandlers))
 * $(DD A mapping of prefixes to custom token handling function names. The
 *     generated _lexer will search for the even-index elements of this array,
 *     and then call the function whose name is the element immediately after the
 *     even-indexed element. This is used for lexing complex tokens whose prefix
 *     is fixed.)
 * )
 *
 * Here are some example constants for a simple calculator _lexer:
 * ---
 * // There are a near infinite number of valid number literals, so numbers are
 * // dynamic tokens.
 * enum string[] dynamicTokens = ["numberLiteral", "whitespace"];
 *
 * // The operators are always the same, and cannot start a numberLiteral, so
 * // they are staticTokens
 * enum string[] staticTokens = ["-", "+", "*", "/"];
 *
 * // In this simple example there are no keywords or other tokens that could
 * // look like dynamic tokens, so this is blank.
 * enum string[] possibleDefaultTokens = [];
 *
 * // If any whitespace character or digit is encountered, pass lexing over to
 * // our custom handler functions. These will be demonstrated in an example
 * // later on.
 * enum string[] tokenHandlers = [
 *     "0", "lexNumber",
 *     "1", "lexNumber",
 *     "2", "lexNumber",
 *     "3", "lexNumber",
 *     "4", "lexNumber",
 *     "5", "lexNumber",
 *     "6", "lexNumber",
 *     "7", "lexNumber",
 *     "8", "lexNumber",
 *     "9", "lexNumber",
 *     " ", "lexWhitespace",
 *     "\n", "lexWhitespace",
 *     "\t", "lexWhitespace",
 *     "\r", "lexWhitespace"
 * ];
 * ---
 *
 * Copyright: Brian Schott 2013
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Authors: Brian Schott, with ideas shamelessly stolen from Andrei Alexandrescu
 * Source: $(PHOBOSSRC std/experimental/_lexer.d)
 */

module std.experimental.lexer;

/**
 * Template for determining the type used for a token type.
 *
 * Selects the smallest unsigned integral type that is able to hold the value
 * staticTokens.length + dynamicTokens.length + possibleDefaultTokens.length.
 * For example if there are 20 static tokens, 30 dynamic tokens,
 * and 10 possible default tokens, this template will alias itself to ubyte,
 * as 20 + 30 + 10 < $(D_KEYWORD ubyte).max.
 * Examples:
 * ---
 * // In our calculator example this means that IdType is an alias for ubyte.
128 * alias IdType = TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens); 129 * --- 130 */ 131 template TokenIdType(alias staticTokens, alias dynamicTokens, 132 alias possibleDefaultTokens) 133 { 134 immutable tokenCount = staticTokens.length + dynamicTokens.length 135 + possibleDefaultTokens.length + 1; 136 static if (tokenCount <= ubyte.max) 137 alias TokenIdType = ubyte; 138 else static if (tokenCount <= ushort.max) 139 alias TokenIdType = ushort; 140 else static if (tokenCount <= uint.max) 141 alias TokenIdType = uint; 142 else 143 static assert (false, "The number of tokens must be less than uint.max"); 144 } 145 146 /** 147 * Looks up the string representation of the given token type. 148 * 149 * This is the opposite of the function of the TokenId template. 150 * Params: type = the token type identifier 151 * Examples: 152 * --- 153 * alias str = tokenStringRepresentation(IdType, staticTokens, dynamicTokens, possibleDefaultTokens); 154 * assert (str(tok!"*") == "*"); 155 * --- 156 * See_also: $(LREF TokenId) 157 */ 158 string tokenStringRepresentation(IdType, alias staticTokens, alias dynamicTokens, 159 alias possibleDefaultTokens)(IdType type) @property 160 { 161 enum tokens = staticTokens ~ dynamicTokens ~ possibleDefaultTokens; 162 163 if (type == 0) 164 return "!ERROR!"; 165 else if (type < tokens.length + 1) 166 return tokens[type - 1]; 167 else 168 return null; 169 } 170 171 unittest 172 { 173 alias IdType = TokenIdType!(["foo"], ["bar"], ["doo"]); 174 enum tok(string token) = TokenId!(IdType, ["foo"], ["bar"], ["doo"], token); 175 alias str = tokenStringRepresentation!(IdType, ["foo"], ["bar"], ["doo"]); 176 177 static assert (str(tok!"foo") == "foo"); 178 static assert (str(tok!"bar") == "bar"); 179 static assert (str(tok!"doo") == "doo"); 180 } 181 182 /** 183 * Generates the token type identifier for the given symbol. 
/**
 * Generates the token type identifier for the given symbol.
 *
 * There are two special cases:
 * $(UL
 * $(LI If symbol is $(D_STRING ""), then the token identifier will be 0)
 * $(LI If symbol is $(D_STRING "\0"), then the token identifier will be the maximum
 * valid token type identifier)
 * )
 * In all cases this template will alias itself to a constant of type IdType.
 * This template will fail at compile time if $(D_PARAM symbol) is not one of
 * the staticTokens, dynamicTokens, or possibleDefaultTokens.
 * Examples:
 * ---
 * template tok(string symbol)
 * {
 *     alias tok = TokenId!(IdType, staticTokens, dynamicTokens,
 *         possibleDefaultTokens, symbol);
 * }
 * // num and plus are of type ubyte.
 * IdType plus = tok!"+";
 * IdType num = tok!"numberLiteral";
 * ---
 */
template TokenId(IdType, alias staticTokens, alias dynamicTokens,
    alias possibleDefaultTokens, string symbol)
{
    import std.algorithm : countUntil;

    enum tokens = staticTokens ~ dynamicTokens ~ possibleDefaultTokens;

    static if (symbol == "")
    {
        // The error token.
        enum id = 0;
        alias TokenId = id;
    }
    else static if (symbol == "\0")
    {
        // The end-of-input token: one past the last real token id.
        enum id = 1 + tokens.length;
        alias TokenId = id;
    }
    else
    {
        enum i = tokens.countUntil(symbol);
        static if (i != -1)
        {
            // Ids are offset by one because 0 is reserved for the error token.
            enum id = i + 1;
            static assert (id >= 0 && id < IdType.max, "Invalid token: " ~ symbol);
            alias TokenId = id;
        }
        else
            static assert (0, "Invalid token: " ~ symbol);
    }
}
/**
 * The token that is returned by the lexer.
 * Params:
 *     IdType = The D type of the "type" token type field.
 *     extraFields = A string containing D code for any extra fields that should
 *     be included in the token structure body. This string is passed
 *     directly to a mixin statement.
 * Examples:
 * ---
 * // No extra struct fields are desired in this example, so leave it blank.
 * alias Token = TokenStructure!(IdType, "");
 * Token minusToken = Token(tok!"-");
 * ---
 */
struct TokenStructure(IdType, string extraFields = "")
{
public pure nothrow @safe @nogc:

    /**
     * Returns: true if the two tokens have the same type and text. The
     * position fields (line, column, index) are deliberately not compared.
     */
    bool opEquals(ref const typeof(this) other) const
    {
        return this.type == other.type && this.text == other.text;
    }

    /**
     * Returns: true if the token has the given type, false otherwise.
     */
    bool opEquals(IdType type) const
    {
        return this.type == type;
    }

    /**
     * Constructs a token from a token type.
     * Params: type = the token type
     */
    this(IdType type)
    {
        this.type = type;
    }

    /**
     * Constructs a token.
     * Params:
     *     type = the token type
     *     text = the text of the token, which may be null
     *     line = the line number at which this token occurs
     *     column = the column number at which this token occurs
     *     index = the byte offset from the beginning of the input at which this
     *     token occurs
     */
    this(IdType type, string text, size_t line, size_t column, size_t index)
    {
        this.type = type;
        this.text = text;
        this.line = line;
        this.column = column;
        this.index = index;
    }

    /**
     * The _text of the token.
     */
    string text;

    /**
     * The _line number at which this token occurs.
     */
    size_t line;

    /**
     * The _column number at which this token occurs. This is measured in bytes
     * and may not be correct when tab characters are involved.
     */
    size_t column;

    /**
     * The byte offset from the beginning of the input at which this token
     * occurs.
     */
    size_t index;

    /**
     * The token type.
     */
    IdType type;

    mixin (extraFields);
}
/**
 * The implementation of the _lexer is contained within this mixin template.
 *
 * To use it, this template should be mixed in to a struct that represents the
 * _lexer for your language. This struct should implement the following methods:
 * $(UL
 *     $(LI popFront, which should call this mixin's _popFront() and
 *         additionally perform any token filtering or shuffling you deem
 *         necessary. For example, you can implement popFront to skip comment
 *         tokens.)
 *     $(LI A function that serves as the default token lexing function. For
 *         most languages this will be the identifier lexing function. This
 *         should then be passed to the $(LREF Lexer) template mixin as the
 *         $(LINK2 #.defaultTokenFunction defaultTokenFunction) template
 *         parameter.)
 *     $(LI A function that is able to determine if an identifier/keyword has
 *         come to an end. This function must return $(D_KEYWORD bool) and take
 *         a single $(D_KEYWORD size_t) argument representing the number of
 *         bytes to skip over before looking for a separating character.)
 *     $(LI Any functions referred to in the tokenHandlers template parameter.
 *         These functions must be marked $(D_KEYWORD pure nothrow), take a
 *         single $(D_KEYWORD ref) Token argument, and fill in that token.)
 *     $(LI A constructor that initializes the range field as well as calls
 *         popFront() exactly once (to initialize the _front field).)
 * )
 * Params:
 *     Token = $(LREF TokenStructure)
 *     defaultTokenFunction = $(LINK2 #.defaultTokenFunction, defaultTokenFunction)
 *     tokenSeparatingFunction = $(LINK2 #.tokenSeparatingFunction, tokenSeparatingFunction)
 *     staticTokens = $(LINK2 #.staticTokens, staticTokens)
 *     dynamicTokens = $(LINK2 #.dynamicTokens, dynamicTokens)
 *     possibleDefaultTokens = $(LINK2 #.possibleDefaultTokens, possibleDefaultTokens)
 *     tokenHandlers = $(LINK2 #.tokenHandlers, tokenHandlers)
 * Examples:
 * ---
 * struct CalculatorLexer
 * {
 *     mixin Lexer!(Token, defaultTokenFunction, isSeparating,
 *         staticTokens, dynamicTokens, possibleDefaultTokens, tokenHandlers);
 *
 *     this (ubyte[] bytes)
 *     {
 *         this.range = LexerRange(bytes);
 *         popFront();
 *     }
 *
 *     void popFront() pure
 *     {
 *         _popFront();
 *     }
 *
 *     void lexNumber(ref Token token) pure nothrow @safe
 *     {
 *         // implementation goes here
 *     }
 *
 *     void lexWhitespace(ref Token token) pure nothrow @safe
 *     {
 *         // implementation goes here
 *     }
 *
 *     void defaultTokenFunction(ref Token token) pure nothrow @safe
 *     {
 *         // There is no default token in the example calculator language, so
 *         // this is always an error.
 *         range.popFront();
 *         token = Token(tok!"");
 *     }
 *
 *     bool isSeparating(size_t offset) pure nothrow @safe
 *     {
 *         // For this example language, always return true.
397 * return true; 398 * } 399 * } 400 * --- 401 */ 402 mixin template Lexer(Token, alias defaultTokenFunction, 403 alias tokenSeparatingFunction, alias staticTokens, alias dynamicTokens, 404 alias possibleDefaultTokens, alias tokenHandlers) 405 { 406 private alias _IDType = typeof(Token.type); 407 private enum _tok(string symbol) = TokenId!(_IDType, staticTokens, dynamicTokens, possibleDefaultTokens, symbol); 408 409 static assert (tokenHandlers.length % 2 == 0, "Each pseudo-token must" 410 ~ " have a corresponding handler function name."); 411 412 static string generateMask(const ubyte[] arr) 413 { 414 ulong u; 415 for (size_t i = 0; i < arr.length && i < 8; i++) 416 { 417 u |= (cast(ulong) arr[i]) << (i * 8); 418 } 419 420 return toHex(u); 421 } 422 423 private static string generateByteMask(size_t l) 424 { 425 return toHex(ulong.max >> ((8 - l) * 8)); 426 } 427 428 private static size_t calcSplitCount(size_t a, size_t b) pure nothrow 429 { 430 int i; 431 while (true) 432 { 433 i++; 434 a /= 2; 435 if (a < b) 436 break; 437 } 438 return i; 439 } 440 441 private static char[] getBeginningChars(string[] allTokens) 442 { 443 char[] beginningChars; 444 for (size_t i = 0; i < allTokens.length; i++) 445 { 446 if (allTokens[i].length == 0) 447 continue; 448 beginningChars ~= allTokens[i][0]; 449 size_t j = i + 1; 450 while (j < allTokens.length && allTokens[i][0] == allTokens[j][0]) 451 j++; 452 i = j - 1; 453 } 454 return beginningChars; 455 } 456 457 private static string generateStatements() 458 { 459 import std.algorithm : sort; 460 import std.range : stride; 461 462 string[] pseudoTokens = array(tokenHandlers.stride(2)); 463 string[] allTokens = array(sort(staticTokens ~ possibleDefaultTokens ~ pseudoTokens).uniq()); 464 // Array consisting of a sorted list of the first characters of the 465 // tokens. 
466 char[] beginningChars = getBeginningChars(allTokens); 467 size_t i = calcSplitCount(beginningChars.length, 8); 468 return generateStatementsStep(allTokens, pseudoTokens, beginningChars, i); 469 } 470 471 private static string generateStatementsStep(string[] allTokens, 472 string[] pseudoTokens, char[] chars, size_t i, string indent = "") 473 { 474 string code; 475 if (i > 0) 476 { 477 size_t p = chars.length / 2; 478 code ~= indent ~ "if (f < "~toHex(chars[p])~") {\n"; 479 code ~= generateStatementsStep(allTokens, pseudoTokens, chars[0 .. p], i - 1, indent ~ " "); 480 code ~= indent ~ "}\n" ~ indent ~ "else\n" ~ indent ~ "{\n"; 481 code ~= generateStatementsStep(allTokens, pseudoTokens, chars[p .. $], i - 1, indent ~ " "); 482 code ~= indent ~ "}\n"; 483 } 484 else 485 { 486 code ~= indent ~ "switch (f)\n" ~ indent ~ "{\n"; 487 foreach (char c; chars) 488 { 489 size_t begin; 490 size_t end; 491 for (size_t j = 0; j < allTokens.length; j++) 492 { 493 if (allTokens[j].length == 0 || allTokens[j][0] != c) 494 continue; 495 begin = j; 496 end = j + 1; 497 while (end < allTokens.length && allTokens[begin][0] == allTokens[end][0]) 498 end++; 499 break; 500 } 501 code ~= indent ~ "case "~toHex(c)~":\n"; 502 code ~= printCase(allTokens[begin .. 
end], pseudoTokens, indent ~ " "); 503 } 504 code ~= indent ~ "default: goto _defaultTokenFunction;\n"; 505 code ~= indent ~ "}\n"; 506 } 507 508 return code; 509 } 510 511 private static string printCase(string[] tokens, string[] pseudoTokens, string indent) 512 { 513 import std.array : array; 514 import std.algorithm : countUntil; 515 import std.conv : text; 516 string[] sortedTokens = array(sort!"a.length > b.length"(tokens)); 517 518 519 if (tokens.length == 1 && tokens[0].length == 1) 520 { 521 if (pseudoTokens.countUntil(tokens[0]) >= 0) 522 { 523 return indent ~ tokenHandlers[tokenHandlers.countUntil(tokens[0]) + 1] 524 ~ "(token);\n" ~ indent ~ "return;\n"; 525 } 526 else if (staticTokens.countUntil(tokens[0]) >= 0) 527 { 528 return indent ~ "range.index++; range.column++;\n" 529 ~ indent ~ "token= Token(_tok!\"" ~ escape(tokens[0]) ~ "\", null, line, column, index);\n" 530 ~ indent ~ "return;"; 531 } 532 else if (pseudoTokens.countUntil(tokens[0]) >= 0) 533 { 534 return indent ~ tokenHandlers[tokenHandlers.countUntil(tokens[0]) + 1] 535 ~ "(token);\n" ~ indent ~ "return;\n"; 536 537 } 538 } 539 540 string code; 541 542 bool insertTrailingGoto = true; 543 foreach (i, token; sortedTokens) 544 { 545 immutable mask = generateMask(cast (const ubyte[]) token); 546 if (token.length >= 8) 547 code ~= indent ~ "if (frontBytes == " ~ mask ~ ")\n"; 548 else if (token.length != 1) 549 code ~= indent ~ "if ((frontBytes & " ~ generateByteMask(token.length) ~ ") == " ~ mask ~ ")\n"; 550 if (token.length != 1) 551 code ~= indent ~ "{\n"; 552 if (pseudoTokens.countUntil(token) >= 0) 553 { 554 if (token.length <= 8) 555 { 556 code ~= indent ~ " " 557 ~ tokenHandlers[tokenHandlers.countUntil(token) + 1] 558 ~ "(token);\n"; 559 code ~= indent ~ "return;\n"; 560 } 561 else 562 { 563 code ~= indent ~ " if (range.startsWith(cast (ubyte[]) \"" ~ escape(token) ~ "\")\n"; 564 code ~= indent ~ " " 565 ~ tokenHandlers[tokenHandlers.countUntil(token) + 1] 566 ~ "();\n"; 567 code ~= 
indent ~ "return;\n"; 568 } 569 } 570 else if (staticTokens.countUntil(token) >= 0) 571 { 572 if (token.length <= 8) 573 { 574 insertTrailingGoto = false; 575 code ~= indent ~ (token.length != 1 ? " " : "") ~ "range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n"; 576 code ~= indent ~ (token.length != 1 ? " " : "") ~ "token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n"; 577 code ~= indent ~ (token.length != 1 ? " " : "") ~ "return;\n"; 578 } 579 else 580 { 581 code ~= indent ~ " pragma(msg, \"long static tokens not supported\"); // " ~ escape(token) ~ "\n"; 582 } 583 } 584 else 585 { 586 // possible default 587 if (token.length <= 8) 588 { 589 code ~= indent ~ " if (tokenSeparatingFunction(" ~ text(token.length) ~ "))\n"; 590 code ~= indent ~ " {\n"; 591 code ~= indent ~ " range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n"; 592 code ~= indent ~ " token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n"; 593 code ~= indent ~ " return;\n"; 594 code ~= indent ~ " }\n"; 595 code ~= indent ~ " else\n"; 596 code ~= indent ~ " goto _defaultTokenFunction;\n"; 597 } 598 else 599 { 600 code ~= indent ~ " if (range.startsWith(cast (ubyte[]) \"" ~ escape(token) ~"\") && isSeparating(" ~ text(token.length) ~ "))\n"; 601 code ~= indent ~ " {\n"; 602 code ~= indent ~ " range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n"; 603 code ~= indent ~ " token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n"; 604 code ~= indent ~ " return;\n"; 605 code ~= indent ~ " }\n"; 606 code ~= indent ~ " else\n"; 607 code ~= indent ~ " goto _defaultTokenFunction;\n"; 608 } 609 } 610 if (token.length != 1) 611 { 612 code ~= indent ~ "}\n"; 613 } 614 } 615 if (insertTrailingGoto) 616 code ~= indent ~ "goto _defaultTokenFunction;\n"; 617 return code; 618 } 619 620 /** 621 * Implements the range primitive _front. 
622 */ 623 ref const(Token) front()() pure nothrow const @property @safe 624 { 625 return _front; 626 } 627 628 /** 629 * Advances the lexer to the next token and stores the new current token in 630 * the _front variable. 631 */ 632 void _popFront()() pure nothrow @safe 633 { 634 advance(_front); 635 } 636 637 /** 638 * Implements the range primitive _empty. 639 */ 640 bool empty()() pure const nothrow @property @safe @nogc 641 { 642 return _front.type == _tok!"\0"; 643 } 644 645 static string escape(string input) pure @trusted 646 { 647 string retVal; 648 foreach (ubyte c; cast(ubyte[]) input) 649 { 650 switch (c) 651 { 652 case '\\': retVal ~= `\\`; break; 653 case '"': retVal ~= `\"`; break; 654 case '\'': retVal ~= `\'`; break; 655 case '\t': retVal ~= `\t`; break; 656 case '\n': retVal ~= `\n`; break; 657 case '\r': retVal ~= `\r`; break; 658 default: retVal ~= c; break; 659 } 660 } 661 return retVal; 662 } 663 664 enum tokenSearch = generateStatements(); 665 666 static ulong getFront(const ubyte[] arr) pure nothrow @trusted 667 { 668 static union ByteArr { ulong l; ubyte[8] arr; } 669 static assert(ByteArr.sizeof == ulong.sizeof); 670 ByteArr b; 671 b.l = ulong.max; 672 b.arr[0 .. arr.length] = arr[]; 673 return b.l; 674 } 675 676 void advance(ref Token token) pure nothrow @trusted 677 { 678 if (range.index >= range.bytes.length) 679 { 680 token.type = _tok!"\0"; 681 return; 682 } 683 immutable size_t index = range.index; 684 immutable size_t column = range.column; 685 immutable size_t line = range.line; 686 immutable ulong frontBytes = range.index + 8 <= range.bytes.length 687 ? getFront(range.bytes[range.index .. range.index + 8]) 688 : getFront(range.bytes[range.index .. $]); 689 ubyte f = cast(ubyte) frontBytes; 690 // pragma(msg, tokenSearch); 691 mixin(tokenSearch); 692 _defaultTokenFunction: 693 defaultTokenFunction(token); 694 } 695 696 /** 697 * The lexer input. 
698 */ 699 LexerRange range; 700 701 /** 702 * The token that is currently at the front of the range. 703 */ 704 Token _front; 705 } 706 707 /** 708 * Range structure that wraps the _lexer's input. 709 */ 710 struct LexerRange 711 { 712 // TODO: When D gets @forceinline the template inline hack (i.e 713 // `void front()() { ... }` )should be removed. 714 715 public nothrow pure @safe @nogc: 716 /** 717 * Params: 718 * bytes = the _lexer input 719 * index = the initial offset from the beginning of $(D_PARAM bytes) 720 * column = the initial _column number 721 * line = the initial _line number 722 */ 723 this(const(ubyte)[] bytes, size_t index = 0, size_t column = 1, size_t line = 1) 724 { 725 this.bytes = bytes; 726 this.index = index; 727 this.column = column; 728 this.line = line; 729 } 730 731 /** 732 * Returns: a mark at the current position that can then be used with slice. 733 */ 734 size_t mark()() const 735 { 736 return index; 737 } 738 739 /** 740 * Sets the range to the given position. 741 * Params: m = the position to seek to 742 */ 743 void seek()(size_t m) 744 { 745 index = m; 746 } 747 748 /** 749 * Returs a slice of the input byte array between the given mark and the 750 * current position. 751 * Params m = the beginning index of the slice to return 752 */ 753 const(ubyte)[] slice()(size_t m) const 754 { 755 return bytes[m .. index]; 756 } 757 758 /** 759 * Implements the range primitive _empty. 760 */ 761 bool empty()() const 762 { 763 return index >= bytes.length; 764 } 765 766 /** 767 * Implements the range primitive _front. 768 */ 769 ubyte front()() const 770 { 771 return bytes[index]; 772 } 773 774 /** 775 * Returns: the current item as well as the items $(D_PARAM p) items ahead. 776 */ 777 const(ubyte)[] peek(size_t p) const 778 { 779 return index + p + 1 > bytes.length 780 ? bytes[index .. $] 781 : bytes[index .. 
index + p + 1]; 782 } 783 784 /** 785 * Returns: true if the range starts with the given byte sequence 786 */ 787 bool startsWith(const(ubyte[]) needle) const 788 { 789 if (needle.length + index >= bytes.length) 790 return false; 791 foreach (i; 0 .. needle.length) 792 if (needle[i] != bytes[index + i]) 793 return false; 794 return true; 795 } 796 797 /** 798 * 799 */ 800 ubyte peekAt()(size_t offset) const 801 { 802 return bytes[index + offset]; 803 } 804 805 /** 806 * Returns: true if it is possible to peek $(D_PARAM p) bytes ahead. 807 */ 808 bool canPeek()(size_t p) const 809 { 810 return index + p < bytes.length; 811 } 812 813 /** 814 * Implements the range primitive _popFront. 815 */ 816 void popFront()() 817 { 818 index++; 819 column++; 820 } 821 822 /** 823 * Implements the algorithm _popFrontN more efficiently. This function does 824 * not detect or handle newlines. 825 */ 826 void popFrontN()(size_t n) 827 { 828 index += n; 829 column += n; 830 } 831 832 /** 833 * Increments the range's line number and resets the column counter. 834 */ 835 void incrementLine()(size_t i = 1) 836 { 837 column = 1; 838 line += i; 839 } 840 841 /** 842 * The input _bytes. 843 */ 844 const(ubyte)[] bytes; 845 846 /** 847 * The range's current position. 848 */ 849 size_t index; 850 851 /** 852 * The current _column number. 853 */ 854 size_t column; 855 856 /** 857 * The current _line number. 858 */ 859 size_t line; 860 } 861 862 nothrow @safe 863 private void toHexInternal(char[] where, ubyte b) { 864 if(b < 16) 865 where[0] = '0'; 866 else { 867 ubyte t = (b & 0xf0) >> 4; 868 if(t >= 10) 869 where[0] = cast(char) ('A' + t - 10); 870 else 871 where[0] = cast(char) ('0' + t); 872 b &= 0x0f; 873 } 874 if(b >= 10) 875 where[1] = cast(char) ('A' + b - 10); 876 else 877 where[1] = cast(char) ('0' + b); 878 } 879 880 private void toHexInternal(char[] where, ulong b) { 881 foreach(i; 0 .. 8) 882 toHexInternal(where[i * 2 .. 
i * 2 + 2], cast(ubyte) (b >> ((7-i) * 8))); 883 } 884 885 string toHex(ulong w) { 886 char[18] buffer; 887 buffer[0] = '0'; 888 buffer[1] = 'x'; 889 toHexInternal(buffer[2 .. $], w); 890 return buffer.idup; 891 }