// Written in the D programming language

/**
 * $(H2 Summary)
 * This module contains a range-based compile-time _lexer generator.
 *
 * $(H2 Overview)
 * The _lexer generator consists of a template mixin, $(LREF Lexer), along with
 * several helper templates for generating such things as token identifiers.
 *
 * To write a _lexer using this API:
 * $(OL
 *     $(LI Create the string array constants for your language.
 *         $(UL
 *             $(LI $(LINK2 #.staticTokens, staticTokens))
 *             $(LI $(LINK2 #.dynamicTokens, dynamicTokens))
 *             $(LI $(LINK2 #.possibleDefaultTokens, possibleDefaultTokens))
 *             $(LI $(LINK2 #.tokenHandlers, tokenHandlers))
 *         ))
 *     $(LI Create aliases for the various token and token identifier types
 *         specific to your language.
 *         $(UL
 *             $(LI $(LREF TokenIdType))
 *             $(LI $(LREF tokenStringRepresentation))
 *             $(LI $(LREF TokenStructure))
 *             $(LI $(LREF TokenId))
 *         ))
 *     $(LI Create a struct that mixes in the Lexer template mixin and
 *         implements the necessary functions.
 *         $(UL
 *             $(LI $(LREF Lexer))
 *         ))
 * )
 * Examples:
 * $(UL
 * $(LI A _lexer for D is available $(LINK2 https://github.com/Hackerpilot/Dscanner/blob/master/std/d/lexer.d, here).)
 * $(LI A _lexer for Lua is available $(LINK2 https://github.com/Hackerpilot/lexer-demo/blob/master/lualexer.d, here).)
 * $(LI A _lexer for JSON is available $(LINK2 https://github.com/Hackerpilot/lexer-demo/blob/master/jsonlexer.d, here).)
 * )
 * $(DDOC_ANCHOR TemplateParameters) $(H2 Template Parameter Definitions)
 * $(DL
 * $(DT $(DDOC_ANCHOR defaultTokenFunction) $(B defaultTokenFunction))
 * $(DD A function that serves as the default token lexing function. For most
 *     languages this will be the identifier lexing function.)
 * $(DT $(DDOC_ANCHOR tokenSeparatingFunction) $(B tokenSeparatingFunction))
 * $(DD A function that is able to determine if an identifier/keyword has come
 *     to an end. This function must return bool and take a single size_t
 *     argument representing the number of bytes to skip over before looking for
 *     a separating character.)
 * $(DT $(DDOC_ANCHOR staticTokens) $(B staticTokens))
 * $(DD A listing of the tokens whose exact value never changes and which cannot
 *     possibly be a token handled by the default token lexing function. The
 *     most common example of this kind of token is an operator such as
 *     $(D_STRING "*"), or $(D_STRING "-") in a programming language.)
 * $(DT $(DDOC_ANCHOR dynamicTokens) $(B dynamicTokens))
 * $(DD A listing of tokens whose value is variable, such as whitespace,
 *     identifiers, number literals, and string literals.)
 * $(DT $(DDOC_ANCHOR possibleDefaultTokens) $(B possibleDefaultTokens))
 * $(DD A listing of tokens that could possibly be one of the tokens handled by
 *     the default token handling function. A common example of this is
 *     a keyword such as $(D_STRING "for"), which looks like the beginning of
 *     the identifier $(D_STRING "fortunate"). $(B tokenSeparatingFunction) is
 *     called to determine if the character after the $(D_STRING 'r') separates
 *     the identifier, indicating that the token is $(D_STRING "for"), or if
 *     lexing should be turned over to the $(B defaultTokenFunction).)
 * $(DT $(DDOC_ANCHOR tokenHandlers) $(B tokenHandlers))
 * $(DD A mapping of prefixes to custom token handling function names. The
 *     generated _lexer will search for the even-index elements of this array,
 *     and then call the function whose name is the element immediately after the
 *     even-indexed element. This is used for lexing complex tokens whose prefix
 *     is fixed.)
 * )
 *
 * Here are some example constants for a simple calculator _lexer:
 * ---
 * // There are a near infinite number of valid number literals, so numbers are
 * // dynamic tokens.
 * enum string[] dynamicTokens = ["numberLiteral", "whitespace"];
 *
 * // The operators are always the same, and cannot start a numberLiteral, so
 * // they are staticTokens
 * enum string[] staticTokens = ["-", "+", "*", "/"];
 *
 * // In this simple example there are no keywords or other tokens that could
 * // look like dynamic tokens, so this is blank.
 * enum string[] possibleDefaultTokens = [];
 *
 * // If any whitespace character or digit is encountered, pass lexing over to
 * // our custom handler functions. These will be demonstrated in an example
 * // later on.
 * enum string[] tokenHandlers = [
 *     "0", "lexNumber",
 *     "1", "lexNumber",
 *     "2", "lexNumber",
 *     "3", "lexNumber",
 *     "4", "lexNumber",
 *     "5", "lexNumber",
 *     "6", "lexNumber",
 *     "7", "lexNumber",
 *     "8", "lexNumber",
 *     "9", "lexNumber",
 *     " ", "lexWhitespace",
 *     "\n", "lexWhitespace",
 *     "\t", "lexWhitespace",
 *     "\r", "lexWhitespace"
 * ];
 * ---
 *
 * Copyright: Brian Schott 2013
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Authors: Brian Schott, with ideas shamelessly stolen from Andrei Alexandrescu
 * Source: $(PHOBOSSRC std/experimental/_lexer.d)
 */

module std.experimental.lexer;

/**
 * Template for determining the type used for a token type.
 *
 * Selects the smallest unsigned integral type that is able to hold the value
 * staticTokens.length + dynamicTokens.length + possibleDefaultTokens.length.
 * For example if there are 20 static tokens, 30 dynamic tokens,
 * and 10 possible default tokens, this template will alias itself to ubyte,
 * as 20 + 30 + 10 < $(D_KEYWORD ubyte).max.
 * Examples:
 * ---
 * // In our calculator example this means that IdType is an alias for ubyte.
128 * alias IdType = TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens); 129 * --- 130 */ 131 template TokenIdType(alias staticTokens, alias dynamicTokens, 132 alias possibleDefaultTokens) 133 { 134 immutable tokenCount = staticTokens.length + dynamicTokens.length 135 + possibleDefaultTokens.length + 1; 136 static if (tokenCount <= ubyte.max) 137 alias TokenIdType = ubyte; 138 else static if (tokenCount <= ushort.max) 139 alias TokenIdType = ushort; 140 else static if (tokenCount <= uint.max) 141 alias TokenIdType = uint; 142 else 143 static assert (false, "The number of tokens must be less than uint.max"); 144 } 145 146 /** 147 * Looks up the string representation of the given token type. 148 * 149 * This is the opposite of the function of the TokenId template. 150 * Params: type = the token type identifier 151 * Examples: 152 * --- 153 * alias str = tokenStringRepresentation(IdType, staticTokens, dynamicTokens, possibleDefaultTokens); 154 * assert (str(tok!"*") == "*"); 155 * --- 156 * See_also: $(LREF TokenId) 157 */ 158 string tokenStringRepresentation(IdType, alias staticTokens, alias dynamicTokens, 159 alias possibleDefaultTokens)(IdType type) @property 160 { 161 enum tokens = staticTokens ~ dynamicTokens ~ possibleDefaultTokens; 162 163 if (type == 0) 164 return "!ERROR!"; 165 else if (type < tokens.length + 1) 166 return tokens[type - 1]; 167 else 168 return null; 169 } 170 171 unittest 172 { 173 alias IdType = TokenIdType!(["foo"], ["bar"], ["doo"]); 174 enum tok(string token) = TokenId!(IdType, ["foo"], ["bar"], ["doo"], token); 175 alias str = tokenStringRepresentation!(IdType, ["foo"], ["bar"], ["doo"]); 176 177 static assert (str(tok!"foo") == "foo"); 178 static assert (str(tok!"bar") == "bar"); 179 static assert (str(tok!"doo") == "doo"); 180 } 181 182 /** 183 * Generates the token type identifier for the given symbol. 
/**
 * Generates the token type identifier for the given symbol.
 *
 * There are two special cases:
 * $(UL
 * $(LI If symbol is $(D_STRING ""), then the token identifier will be 0)
 * $(LI If symbol is $(D_STRING "\0"), then the token identifier will be the maximum
 * valid token type identifier)
 * )
 * In all cases this template will alias itself to a constant of type IdType.
 * This template will fail at compile time if $(D_PARAM symbol) is not one of
 * the staticTokens, dynamicTokens, or possibleDefaultTokens.
 * Examples:
 * ---
 * template tok(string symbol)
 * {
 *     alias tok = TokenId!(IdType, staticTokens, dynamicTokens,
 *         possibleDefaultTokens, symbol);
 * }
 * // num and plus are of type ubyte.
 * IdType plus = tok!"+";
 * IdType num = tok!"numberLiteral";
 * ---
 */
template TokenId(IdType, alias staticTokens, alias dynamicTokens,
    alias possibleDefaultTokens, string symbol)
{
    import std.algorithm : countUntil;

    enum tokens = staticTokens ~ dynamicTokens ~ possibleDefaultTokens;

    static if (symbol == "")
    {
        // The error token.
        enum id = 0;
        alias TokenId = id;
    }
    else static if (symbol == "\0")
    {
        // The end-of-input token: one past the last real token id.
        enum id = 1 + tokens.length;
        alias TokenId = id;
    }
    else
    {
        enum i = tokens.countUntil(symbol);
        static if (i != -1)
        {
            // Ids are offset by one because 0 is reserved for the error token.
            enum id = i + 1;
            static assert (id >= 0 && id < IdType.max, "Invalid token: " ~ symbol);
            alias TokenId = id;
        }
        else
            static assert (0, "Invalid token: " ~ symbol);
    }
}
/**
 * The token that is returned by the lexer.
 * Params:
 *     IdType = The D type of the "type" token type field.
 *     extraFields = A string containing D code for any extra fields that should
 *     be included in the token structure body. This string is passed
 *     directly to a mixin statement.
 * Examples:
 * ---
 * // No extra struct fields are desired in this example, so leave it blank.
 * alias Token = TokenStructure!(IdType, "");
 * Token minusToken = Token(tok!"-");
 * ---
 */
struct TokenStructure(IdType, string extraFields = "")
{
public pure nothrow @safe @nogc:

    /**
     * Returns: true if the two tokens have the same type and text. The
     * position fields (line, column, index) are deliberately not compared.
     */
    bool opEquals(ref const typeof(this) other) const
    {
        return this.type == other.type && this.text == other.text;
    }

    /**
     * Returns: true if the token has the given type, false otherwise.
     */
    bool opEquals(IdType type) const
    {
        return this.type == type;
    }

    /**
     * Constructs a token from a token type.
     * Params: type = the token type
     */
    this(IdType type)
    {
        this.type = type;
    }

    /**
     * Constructs a token.
     * Params:
     *     type = the token type
     *     text = the text of the token, which may be null
     *     line = the line number at which this token occurs
     *     column = the column number at which this token occurs
     *     index = the byte offset from the beginning of the input at which this
     *     token occurs
     */
    this(IdType type, string text, size_t line, size_t column, size_t index)
    {
        this.type = type;
        this.text = text;
        this.line = line;
        this.column = column;
        this.index = index;
    }

    /**
     * The _text of the token.
     */
    string text;

    /**
     * The _line number at which this token occurs.
     */
    size_t line;

    /**
     * The _column number at which this token occurs. This is measured in bytes
     * and may not be correct when tab characters are involved.
     */
    size_t column;

    /**
     * The byte offset from the beginning of the input at which this token
     * occurs.
     */
    size_t index;

    /**
     * The token type.
     */
    IdType type;

    mixin (extraFields);
}
/**
 * The implementation of the _lexer is contained within this mixin template.
 *
 * To use it, this template should be mixed in to a struct that represents the
 * _lexer for your language. This struct should implement the following methods:
 * $(UL
 *     $(LI popFront, which should call this mixin's _popFront() and
 *         additionally perform any token filtering or shuffling you deem
 *         necessary. For example, you can implement popFront to skip comment
 *         tokens.)
 *     $(LI A function that serves as the default token lexing function. For
 *         most languages this will be the identifier lexing function. This
 *         should then be passed to the $(LREF Lexer) template mixin as the
 *         $(LINK2 #.defaultTokenFunction defaultTokenFunction) template
 *         parameter.)
 *     $(LI A function that is able to determine if an identifier/keyword has
 *         come to an end. This function must return $(D_KEYWORD bool) and take
 *         a single $(D_KEYWORD size_t) argument representing the number of
 *         bytes to skip over before looking for a separating character.)
 *     $(LI Any functions referred to in the tokenHandlers template parameter.
 *         These functions must be marked $(D_KEYWORD pure nothrow), take a
 *         single $(D_KEYWORD ref) Token argument, and fill in that token.)
 *     $(LI A constructor that initializes the range field as well as calls
 *         popFront() exactly once (to initialize the _front field).)
 * )
 * Params:
 *     Token = $(LREF TokenStructure)
 *     defaultTokenFunction = $(LINK2 #.defaultTokenFunction, defaultTokenFunction)
 *     tokenSeparatingFunction = $(LINK2 #.tokenSeparatingFunction, tokenSeparatingFunction)
 *     staticTokens = $(LINK2 #.staticTokens, staticTokens)
 *     dynamicTokens = $(LINK2 #.dynamicTokens, dynamicTokens)
 *     possibleDefaultTokens = $(LINK2 #.possibleDefaultTokens, possibleDefaultTokens)
 *     tokenHandlers = $(LINK2 #.tokenHandlers, tokenHandlers)
 * Examples:
 * ---
 * struct CalculatorLexer
 * {
 *     mixin Lexer!(Token, defaultTokenFunction, isSeparating,
 *         staticTokens, dynamicTokens, possibleDefaultTokens, tokenHandlers);
 *
 *     this (ubyte[] bytes)
 *     {
 *         this.range = LexerRange(bytes);
 *         popFront();
 *     }
 *
 *     void popFront() pure
 *     {
 *         _popFront();
 *     }
 *
 *     void lexNumber(ref Token token) pure nothrow @safe
 *     {
 *         // implementation goes here
 *     }
 *
 *     void lexWhitespace(ref Token token) pure nothrow @safe
 *     {
 *         // implementation goes here
 *     }
 *
 *     void defaultTokenFunction(ref Token token) pure nothrow @safe
 *     {
 *         // There is no default token in the example calculator language, so
 *         // this is always an error.
 *         range.popFront();
 *         token = Token(tok!"");
 *     }
 *
 *     bool isSeparating(size_t offset) pure nothrow @safe
 *     {
 *         // For this example language, always return true.
397 * return true; 398 * } 399 * } 400 * --- 401 */ 402 mixin template Lexer(Token, alias defaultTokenFunction, 403 alias tokenSeparatingFunction, alias staticTokens, alias dynamicTokens, 404 alias possibleDefaultTokens, alias tokenHandlers) 405 { 406 private alias _IDType = typeof(Token.type); 407 private enum _tok(string symbol) = TokenId!(_IDType, staticTokens, dynamicTokens, possibleDefaultTokens, symbol); 408 409 static assert (tokenHandlers.length % 2 == 0, "Each pseudo-token must" 410 ~ " have a corresponding handler function name."); 411 412 static string generateMask(const ubyte[] arr) 413 { 414 ulong u; 415 for (size_t i = 0; i < arr.length && i < 8; i++) 416 { 417 u |= (cast(ulong) arr[i]) << (i * 8); 418 } 419 420 return toHex(u); 421 } 422 423 private static string generateByteMask(size_t l) 424 { 425 return toHex(ulong.max >> ((8 - l) * 8)); 426 } 427 428 private static size_t calcSplitCount(size_t a, size_t b) pure nothrow 429 { 430 int i; 431 while (true) 432 { 433 i++; 434 a /= 2; 435 if (a < b) 436 break; 437 } 438 return i; 439 } 440 441 private static char[] getBeginningChars(string[] allTokens) 442 { 443 char[] beginningChars; 444 for (size_t i = 0; i < allTokens.length; i++) 445 { 446 if (allTokens[i].length == 0) 447 continue; 448 beginningChars ~= allTokens[i][0]; 449 size_t j = i + 1; 450 while (j < allTokens.length && allTokens[i][0] == allTokens[j][0]) 451 j++; 452 i = j - 1; 453 } 454 return beginningChars; 455 } 456 457 private static string generateStatements() 458 { 459 import std.algorithm : sort; 460 import std.range : stride; 461 462 string[] pseudoTokens = array(tokenHandlers.stride(2)); 463 string[] allTokens = array(sort(staticTokens ~ possibleDefaultTokens ~ pseudoTokens).uniq()); 464 // Array consisting of a sorted list of the first characters of the 465 // tokens. 
466 char[] beginningChars = getBeginningChars(allTokens); 467 size_t i = calcSplitCount(beginningChars.length, 8); 468 return generateStatementsStep(allTokens, pseudoTokens, beginningChars, i); 469 } 470 471 private static string generateStatementsStep(string[] allTokens, 472 string[] pseudoTokens, char[] chars, size_t i, string indent = "") 473 { 474 string code; 475 if (i > 0) 476 { 477 size_t p = chars.length / 2; 478 code ~= indent ~ "if (f < "~toHex(chars[p])~") {\n"; 479 code ~= generateStatementsStep(allTokens, pseudoTokens, chars[0 .. p], i - 1, indent ~ " "); 480 code ~= indent ~ "}\n" ~ indent ~ "else\n" ~ indent ~ "{\n"; 481 code ~= generateStatementsStep(allTokens, pseudoTokens, chars[p .. $], i - 1, indent ~ " "); 482 code ~= indent ~ "}\n"; 483 } 484 else 485 { 486 code ~= indent ~ "switch (f)\n" ~ indent ~ "{\n"; 487 foreach (char c; chars) 488 { 489 size_t begin; 490 size_t end; 491 for (size_t j = 0; j < allTokens.length; j++) 492 { 493 if (allTokens[j].length == 0 || allTokens[j][0] != c) 494 continue; 495 begin = j; 496 end = j + 1; 497 while (end < allTokens.length && allTokens[begin][0] == allTokens[end][0]) 498 end++; 499 break; 500 } 501 code ~= indent ~ "case "~toHex(c)~":\n"; 502 code ~= printCase(allTokens[begin .. 
end], pseudoTokens, indent ~ " "); 503 } 504 code ~= indent ~ "default: goto _defaultTokenFunction;\n"; 505 code ~= indent ~ "}\n"; 506 } 507 508 return code; 509 } 510 511 private static string printCase(string[] tokens, string[] pseudoTokens, string indent) 512 { 513 import std.array : array; 514 import std.algorithm : countUntil; 515 import std.conv : text; 516 string[] sortedTokens = array(sort!"a.length > b.length"(tokens)); 517 518 519 if (tokens.length == 1 && tokens[0].length == 1) 520 { 521 if (pseudoTokens.countUntil(tokens[0]) >= 0) 522 { 523 return indent ~ tokenHandlers[tokenHandlers.countUntil(tokens[0]) + 1] 524 ~ "(token);\n" ~ indent ~ "return;\n"; 525 } 526 else if (staticTokens.countUntil(tokens[0]) >= 0) 527 { 528 return indent ~ "range.index++; range.column++;\n" 529 ~ indent ~ "token= Token(_tok!\"" ~ escape(tokens[0]) ~ "\", null, line, column, index);\n" 530 ~ indent ~ "return;"; 531 } 532 else if (pseudoTokens.countUntil(tokens[0]) >= 0) 533 { 534 return indent ~ tokenHandlers[tokenHandlers.countUntil(tokens[0]) + 1] 535 ~ "(token);\n" ~ indent ~ "return;\n"; 536 537 } 538 } 539 540 string code; 541 542 bool insertTrailingGoto = true; 543 foreach (i, token; sortedTokens) 544 { 545 immutable mask = generateMask(cast (const ubyte[]) token); 546 if (token.length >= 8) 547 code ~= indent ~ "if (frontBytes == " ~ mask ~ ")\n"; 548 else if (token.length != 1) 549 code ~= indent ~ "if ((frontBytes & " ~ generateByteMask(token.length) ~ ") == " ~ mask ~ ")\n"; 550 if (token.length != 1) 551 code ~= indent ~ "{\n"; 552 if (pseudoTokens.countUntil(token) >= 0) 553 { 554 if (token.length <= 8) 555 { 556 code ~= indent ~ " " 557 ~ tokenHandlers[tokenHandlers.countUntil(token) + 1] 558 ~ "(token);\n"; 559 code ~= indent ~ "return;\n"; 560 } 561 else 562 { 563 code ~= indent ~ " if (range.startsWith(cast (ubyte[]) \"" ~ escape(token) ~ "\")\n"; 564 code ~= indent ~ " " 565 ~ tokenHandlers[tokenHandlers.countUntil(token) + 1] 566 ~ "();\n"; 567 code ~= 
indent ~ "return;\n"; 568 } 569 } 570 else if (staticTokens.countUntil(token) >= 0) 571 { 572 if (token.length <= 8) 573 { 574 insertTrailingGoto = false; 575 code ~= indent ~ (token.length != 1 ? " " : "") ~ "range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n"; 576 code ~= indent ~ (token.length != 1 ? " " : "") ~ "token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n"; 577 code ~= indent ~ (token.length != 1 ? " " : "") ~ "return;\n"; 578 } 579 else 580 { 581 code ~= indent ~ " pragma(msg, \"long static tokens not supported\"); // " ~ escape(token) ~ "\n"; 582 } 583 } 584 else 585 { 586 // possible default 587 if (token.length <= 8) 588 { 589 code ~= indent ~ " if (tokenSeparatingFunction(" ~ text(token.length) ~ "))\n"; 590 code ~= indent ~ " {\n"; 591 code ~= indent ~ " range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n"; 592 code ~= indent ~ " token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n"; 593 code ~= indent ~ " return;\n"; 594 code ~= indent ~ " }\n"; 595 code ~= indent ~ " else\n"; 596 code ~= indent ~ " goto _defaultTokenFunction;\n"; 597 } 598 else 599 { 600 code ~= indent ~ " if (range.startsWith(cast (ubyte[]) \"" ~ escape(token) ~"\") && isSeparating(" ~ text(token.length) ~ "))\n"; 601 code ~= indent ~ " {\n"; 602 code ~= indent ~ " range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n"; 603 code ~= indent ~ " token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n"; 604 code ~= indent ~ " return;\n"; 605 code ~= indent ~ " }\n"; 606 code ~= indent ~ " else\n"; 607 code ~= indent ~ " goto _defaultTokenFunction;\n"; 608 } 609 } 610 if (token.length != 1) 611 { 612 code ~= indent ~ "}\n"; 613 } 614 } 615 if (insertTrailingGoto) 616 code ~= indent ~ "goto _defaultTokenFunction;\n"; 617 return code; 618 } 619 620 /** 621 * Implements the range primitive _front. 
622 */ 623 ref const(Token) front()() pure nothrow const @property @safe 624 { 625 return _front; 626 } 627 628 /** 629 * Advances the lexer to the next token and stores the new current token in 630 * the _front variable. 631 */ 632 void _popFront()() pure nothrow @safe 633 { 634 advance(_front); 635 } 636 637 /** 638 * Implements the range primitive _empty. 639 */ 640 bool empty()() pure const nothrow @property @safe @nogc 641 { 642 return _front.type == _tok!"\0"; 643 } 644 645 static string escape(string input) pure @trusted 646 { 647 string retVal; 648 foreach (ubyte c; cast(ubyte[]) input) 649 { 650 switch (c) 651 { 652 case '\\': retVal ~= `\\`; break; 653 case '"': retVal ~= `\"`; break; 654 case '\'': retVal ~= `\'`; break; 655 case '\t': retVal ~= `\t`; break; 656 case '\n': retVal ~= `\n`; break; 657 case '\r': retVal ~= `\r`; break; 658 default: retVal ~= c; break; 659 } 660 } 661 return retVal; 662 } 663 664 enum tokenSearch = generateStatements(); 665 666 static ulong getFront(const ubyte[] arr) pure nothrow @trusted 667 { 668 static union ByteArr { ulong l; ubyte[8] arr; } 669 static assert(ByteArr.sizeof == ulong.sizeof); 670 ByteArr b; 671 b.l = ulong.max; 672 b.arr[0 .. arr.length] = arr[]; 673 return b.l; 674 } 675 676 void advance(ref Token token) pure nothrow @trusted 677 { 678 if (range.index >= range.bytes.length) 679 { 680 token.type = _tok!"\0"; 681 return; 682 } 683 immutable size_t index = range.index; 684 immutable size_t column = range.column; 685 immutable size_t line = range.line; 686 immutable ulong frontBytes = range.index + 8 <= range.bytes.length 687 ? getFront(range.bytes[range.index .. range.index + 8]) 688 : getFront(range.bytes[range.index .. $]); 689 ubyte f = cast(ubyte) frontBytes; 690 // pragma(msg, tokenSearch); 691 mixin(tokenSearch); 692 _defaultTokenFunction: 693 defaultTokenFunction(token); 694 } 695 696 /** 697 * The lexer input. 
698 */ 699 LexerRange range; 700 701 /** 702 * The token that is currently at the front of the range. 703 */ 704 Token _front; 705 } 706 707 /** 708 * Range structure that wraps the _lexer's input. 709 */ 710 struct LexerRange 711 { 712 // TODO: When D gets @forceinline the template inline hack (i.e 713 // `void front()() { ... }` )should be removed. 714 715 public nothrow pure @safe @nogc: 716 /** 717 * Params: 718 * bytes = the _lexer input 719 * index = the initial offset from the beginning of $(D_PARAM bytes) 720 * column = the initial _column number 721 * line = the initial _line number 722 */ 723 this(const(ubyte)[] bytes, size_t index = 0, size_t column = 1, size_t line = 1) 724 { 725 this.bytes = bytes; 726 this.index = index; 727 this.column = column; 728 this.line = line; 729 } 730 731 /** 732 * Returns: a mark at the current position that can then be used with slice. 733 */ 734 size_t mark()() const 735 { 736 return index; 737 } 738 739 /** 740 * Sets the range to the given position. 741 * Params: m = the position to seek to 742 */ 743 void seek()(size_t m) 744 { 745 index = m; 746 } 747 748 /** 749 * Returs a slice of the input byte array between the given mark and the 750 * current position. 751 * Params m = the beginning index of the slice to return 752 */ 753 const(ubyte)[] slice()(size_t m) const 754 { 755 return bytes[m .. index]; 756 } 757 758 /** 759 * Implements the range primitive _empty. 760 */ 761 bool empty()() const 762 { 763 return index >= bytes.length; 764 } 765 766 /** 767 * Implements the range primitive _front. 768 */ 769 ubyte front()() const 770 { 771 return bytes[index]; 772 } 773 774 /** 775 * Returns: the current item as well as the items $(D_PARAM p) items ahead. 776 */ 777 const(ubyte)[] peek(size_t p) const 778 { 779 return index + p + 1 > bytes.length 780 ? bytes[index .. $] 781 : bytes[index .. 
index + p + 1]; 782 } 783 784 /** 785 * Returns: true if the range starts with the given byte sequence 786 */ 787 bool startsWith(const(ubyte[]) needle) const 788 { 789 if (needle.length + index >= bytes.length) 790 return false; 791 foreach (i; 0 .. needle.length) 792 if (needle[i] != bytes[index + i]) 793 return false; 794 return true; 795 } 796 797 /** 798 * 799 */ 800 ubyte peekAt()(size_t offset) const 801 { 802 return bytes[index + offset]; 803 } 804 805 /** 806 * Returns: true if it is possible to peek $(D_PARAM p) bytes ahead. 807 */ 808 bool canPeek()(size_t p) const 809 { 810 return index + p < bytes.length; 811 } 812 813 /** 814 * Implements the range primitive _popFront. 815 */ 816 void popFront()() 817 { 818 index++; 819 column++; 820 } 821 822 /** 823 * Implements the algorithm _popFrontN more efficiently. This function does 824 * not detect or handle newlines. 825 */ 826 void popFrontN()(size_t n) 827 { 828 index += n; 829 column += n; 830 } 831 832 /** 833 * Increments the range's line number and resets the column counter. 834 */ 835 void incrementLine()(size_t i = 1) 836 { 837 column = 1; 838 line += i; 839 } 840 841 /** 842 * The input _bytes. 843 */ 844 const(ubyte)[] bytes; 845 846 /** 847 * The range's current position. 848 */ 849 size_t index; 850 851 /** 852 * The current _column number. 853 */ 854 size_t column; 855 856 /** 857 * The current _line number. 858 */ 859 size_t line; 860 } 861 862 nothrow @safe 863 private void toHexInternal(char[] where, ubyte b) { 864 if(b < 16) 865 where[0] = '0'; 866 else { 867 ubyte t = (b & 0xf0) >> 4; 868 if(t >= 10) 869 where[0] = cast(char) ('A' + t - 10); 870 else 871 where[0] = cast(char) ('0' + t); 872 b &= 0x0f; 873 } 874 if(b >= 10) 875 where[1] = cast(char) ('A' + b - 10); 876 else 877 where[1] = cast(char) ('0' + b); 878 } 879 880 private void toHexInternal(char[] where, ulong b) { 881 foreach(i; 0 .. 8) 882 toHexInternal(where[i * 2 .. 
i * 2 + 2], cast(ubyte) (b >> ((7-i) * 8))); 883 } 884 885 string toHex(ulong w) { 886 char[18] buffer; 887 buffer[0] = '0'; 888 buffer[1] = 'x'; 889 toHexInternal(buffer[2 .. $], w); 890 return buffer.idup; 891 }