1 module dparse.lexer;
2 
3 import std.typecons;
4 import std.typetuple;
5 import std.array;
6 import std.algorithm;
7 import std.range;
8 import std.experimental.lexer;
9 import core.cpuid : sse42;
// Defines the `iasm64NotWindows` version identifier when the compiler sets
// `D_InlineAsm_X86_64` and the target is not Windows. The number and
// whitespace lexing routines in this module test this identifier (together
// with the `haveSSE42` flag) before taking their SSE4.2 scanning paths.
version (D_InlineAsm_X86_64)
{
    version (Windows) {}
    else version = iasm64NotWindows;
}
15 
16 /// Operators
17 private enum operators = [
18     ",", ".", "..", "...", "/", "/=", "!", "!<", "!<=", "!<>", "!<>=", "!=",
19     "!>", "!>=", "$", "%", "%=", "&", "&&", "&=", "(", ")", "*", "*=", "+", "++",
20     "+=", "-", "--", "-=", ":", ";", "<", "<<", "<<=", "<=", "<>", "<>=", "=",
21     "==", "=>", ">", ">=", ">>", ">>=", ">>>", ">>>=", "?", "@", "[", "]", "^",
22     "^=", "^^", "^^=", "{", "|", "|=", "||", "}", "~", "~="
23 ];
24 
/// Keywords
26 private enum keywords = [
27     "abstract", "alias", "align", "asm", "assert", "auto", "bool",
28     "break", "byte", "case", "cast", "catch", "cdouble", "cent", "cfloat",
29     "char", "class", "const", "continue", "creal", "dchar", "debug", "default",
30     "delegate", "delete", "deprecated", "do", "double", "else", "enum",
31     "export", "extern", "false", "final", "finally", "float", "for", "foreach",
32     "foreach_reverse", "function", "goto", "idouble", "if", "ifloat",
33     "immutable", "import", "in", "inout", "int", "interface", "invariant",
34     "ireal", "is", "lazy", "long", "macro", "mixin", "module", "new", "nothrow",
35     "null", "out", "override", "package", "pragma", "private", "protected",
36     "public", "pure", "real", "ref", "return", "scope", "shared", "short",
37     "static", "struct", "super", "switch", "synchronized", "template", "this",
38     "throw", "true", "try", "typedef", "typeid", "typeof", "ubyte", "ucent",
39     "uint", "ulong", "union", "unittest", "ushort", "version", "void",
40     "volatile", "wchar", "while", "with", "__DATE__", "__EOF__", "__FILE__",
41     "__FUNCTION__", "__gshared", "__LINE__", "__MODULE__", "__parameters",
42     "__PRETTY_FUNCTION__", "__TIME__", "__TIMESTAMP__", "__traits", "__vector",
43     "__VENDOR__", "__VERSION__"
44 ];
45 
46 /// Other tokens
47 private enum dynamicTokens = [
48     "specialTokenSequence", "comment", "identifier", "scriptLine",
49     "whitespace", "doubleLiteral", "floatLiteral", "idoubleLiteral",
50     "ifloatLiteral", "intLiteral", "longLiteral", "realLiteral",
51     "irealLiteral", "uintLiteral", "ulongLiteral", "characterLiteral",
52     "dstringLiteral", "stringLiteral", "wstringLiteral"
53 ];
54 
/**
 * Flat list of (prefix, handler name) pairs passed to the `Lexer` mixin in
 * `DLexer`. Even-indexed entries are input prefixes; each odd-indexed entry
 * names the lexing function paired with the prefix just before it.
 *
 * NOTE(review): how the mixin dispatches on these pairs is defined in
 * std.experimental.lexer, not here - confirm there before reordering.
 */
private enum pseudoTokenHandlers = [
    "\"", "lexStringLiteral",
    "`", "lexWysiwygString",
    "//", "lexSlashSlashComment",
    "/*", "lexSlashStarComment",
    "/+", "lexSlashPlusComment",
    ".", "lexDot",
    "'", "lexCharacterLiteral",
    "0", "lexNumber",
    "1", "lexDecimal",
    "2", "lexDecimal",
    "3", "lexDecimal",
    "4", "lexDecimal",
    "5", "lexDecimal",
    "6", "lexDecimal",
    "7", "lexDecimal",
    "8", "lexDecimal",
    "9", "lexDecimal",
    "q\"", "lexDelimitedString",
    "q{", "lexTokenString",
    "r\"", "lexWysiwygString",
    "x\"", "lexHexString",
    " ", "lexWhitespace",
    "\t", "lexWhitespace",
    "\r", "lexWhitespace",
    "\n", "lexWhitespace",
    "\v", "lexWhitespace",
    "\f", "lexWhitespace",
    "\u2028", "lexLongNewline",
    "\u2029", "lexLongNewline",
    "#!", "lexScriptLine",
    "#line", "lexSpecialTokenSequence"
];
88 
89 /// Token ID type for the D lexer.
90 public alias IdType = TokenIdType!(operators, dynamicTokens, keywords);
91 
92 /**
93  * Function used for converting an IdType to a string.
94  *
95  * Examples:
96  * ---
97  * IdType c = tok!"case";
98  * assert (str(c) == "case");
99  * ---
100  */
101 public alias str = tokenStringRepresentation!(IdType, operators, dynamicTokens, keywords);
102 
103 /**
104  * Template used to refer to D token types.
105  *
106  * See the $(B operators), $(B keywords), and $(B dynamicTokens) enums for
107  * values that can be passed to this template.
108  * Example:
109  * ---
110  * import dparse.lexer;
111  * IdType t = tok!"floatLiteral";
112  * ---
113  */
114 public template tok(string token)
115 {
116     alias tok = TokenId!(IdType, operators, dynamicTokens, keywords, token);
117 }
118 
// Source text mixed into the token struct (see the `Token` alias): two
// comment fields plus ordering by the token's `index` field.
private enum extraFields = q{
    /// Documentation comment attached to this token.
    string comment;
    /// Documentation comment that follows this token on the same line.
    string trailingComment;

    /// Three-way comparison of this token's index against `i`.
    int opCmp(size_t i) const pure nothrow @safe {
        return index < i ? -1 : (index > i ? 1 : 0);
    }

    /// Three-way comparison of two tokens by their index.
    int opCmp(ref const typeof(this) other) const pure nothrow @safe {
        return opCmp(other.index);
    }
};
133 
134 /// The token type in the D lexer
135 public alias Token = std.experimental.lexer.TokenStructure!(IdType, extraFields);
136 
/**
 * Selects what the lexer does with whitespace tokens.
 */
public enum WhitespaceBehavior : ubyte
{
    /// Whitespace tokens are produced and carry their text.
    include = 0,
    /// Whitespace tokens are skipped by `DLexer.popFront`.
    skip = 1,
}
145 
/**
 * Selects how string literal tokens are produced. The members are bit flags.
 */
public enum StringBehavior : ubyte
{
    /// Do not include quote characters, process escape sequences
    compiler = 0,
    /// Opening quotes, closing quotes, and string suffixes are included in the
    /// string token
    includeQuoteChars = 1 << 0,
    /// String escape sequences are not replaced
    notEscaped = 1 << 1,
    /// Not modified at all. Useful for formatters or highlighters
    source = includeQuoteChars | notEscaped
}
161 
/**
 * Lexer configuration struct
 */
public struct LexerConfig
{
    /// Name of the file being lexed.
    /// NOTE(review): not read by the code in this part of the module - confirm
    /// how it is consumed.
    string fileName;
    /// How string literal tokens are produced; see `StringBehavior`.
    StringBehavior stringBehavior;
    /// Whether whitespace tokens are produced or skipped; see
    /// `WhitespaceBehavior`.
    WhitespaceBehavior whitespaceBehavior;
}
171 
/**
 * Tests whether a token ID is the keyword of a built-in basic type.
 *
 * Params:
 *     type = the token ID to classify
 * Returns: true if the given ID is for a basic type.
 */
public bool isBasicType(IdType type) nothrow pure @safe @nogc
{
    switch (type)
    {
    // integral types
    case tok!"byte", tok!"ubyte", tok!"short", tok!"ushort",
        tok!"int", tok!"uint", tok!"long", tok!"ulong",
        tok!"cent", tok!"ucent":
    // character types, bool and void
    case tok!"char", tok!"wchar", tok!"dchar", tok!"bool", tok!"void":
    // floating point, imaginary and complex types
    case tok!"float", tok!"double", tok!"real",
        tok!"ifloat", tok!"idouble", tok!"ireal",
        tok!"cfloat", tok!"cdouble", tok!"creal":
        return true;
    default:
        return false;
    }
}
208 
/**
 * Tests whether a token ID is one of the numeric literal token types.
 *
 * Params:
 *     type = the token ID to classify
 * Returns: true if the given ID type is for a number literal.
 */
public bool isNumberLiteral(IdType type) nothrow pure @safe @nogc
{
    switch (type)
    {
    // integer literals
    case tok!"intLiteral", tok!"uintLiteral",
        tok!"longLiteral", tok!"ulongLiteral":
    // floating point and imaginary literals
    case tok!"floatLiteral", tok!"doubleLiteral", tok!"realLiteral",
        tok!"ifloatLiteral", tok!"idoubleLiteral", tok!"irealLiteral":
        return true;
    default:
        return false;
    }
}
231 
/**
 * Tests whether a token ID is one of the entries of the `operators` table.
 *
 * Params:
 *     type = the token ID to classify
 * Returns: true if the given ID type is for an operator.
 */
public bool isOperator(IdType type) nothrow pure @safe @nogc
{
    switch (type)
    {
    case tok!",", tok!".", tok!"..", tok!"...", tok!"/", tok!"/=":
    case tok!"!", tok!"!<", tok!"!<=", tok!"!<>", tok!"!<>=", tok!"!=",
        tok!"!>", tok!"!>=":
    case tok!"$", tok!"%", tok!"%=", tok!"&", tok!"&&", tok!"&=":
    case tok!"(", tok!")", tok!"*", tok!"*=":
    case tok!"+", tok!"++", tok!"+=", tok!"-", tok!"--", tok!"-=":
    case tok!":", tok!";":
    case tok!"<", tok!"<<", tok!"<<=", tok!"<=", tok!"<>", tok!"<>=":
    case tok!"=", tok!"==", tok!"=>":
    case tok!">", tok!">=", tok!">>", tok!">>=", tok!">>>", tok!">>>=":
    case tok!"?", tok!"@", tok!"[", tok!"]":
    case tok!"^", tok!"^=", tok!"^^", tok!"^^=":
    case tok!"{", tok!"|", tok!"|=", tok!"||", tok!"}":
    case tok!"~", tok!"~=":
        return true;
    default:
        return false;
    }
}
306 
/**
 * Tests whether a token ID is a keyword. The basic type keywords (`int`,
 * `char`, ...) are not part of this set; `isBasicType` covers those.
 *
 * Params:
 *     type = the token ID to classify
 * Returns: true if the given ID type is for a keyword.
 */
public bool isKeyword(IdType type) pure nothrow @safe @nogc
{
    switch (type)
    {
    case tok!"abstract", tok!"alias", tok!"align", tok!"asm", tok!"assert",
        tok!"auto":
    case tok!"break":
    case tok!"case", tok!"cast", tok!"catch", tok!"class", tok!"const",
        tok!"continue":
    case tok!"debug", tok!"default", tok!"delegate", tok!"delete",
        tok!"deprecated", tok!"do":
    case tok!"else", tok!"enum", tok!"export", tok!"extern":
    case tok!"false", tok!"final", tok!"finally", tok!"for", tok!"foreach",
        tok!"foreach_reverse", tok!"function":
    case tok!"goto":
    case tok!"if", tok!"immutable", tok!"import", tok!"in", tok!"inout",
        tok!"interface", tok!"invariant", tok!"is":
    case tok!"lazy":
    case tok!"macro", tok!"mixin", tok!"module":
    case tok!"new", tok!"nothrow", tok!"null":
    case tok!"out", tok!"override":
    case tok!"package", tok!"pragma", tok!"private", tok!"protected",
        tok!"public", tok!"pure":
    case tok!"ref", tok!"return":
    case tok!"scope", tok!"shared", tok!"static", tok!"struct", tok!"super",
        tok!"switch", tok!"synchronized":
    case tok!"template", tok!"this", tok!"throw", tok!"true", tok!"try",
        tok!"typedef", tok!"typeid", tok!"typeof":
    case tok!"union", tok!"unittest":
    case tok!"version", tok!"volatile":
    case tok!"while", tok!"with":
    case tok!"__DATE__", tok!"__EOF__", tok!"__FILE__", tok!"__FUNCTION__",
        tok!"__gshared", tok!"__LINE__", tok!"__MODULE__",
        tok!"__parameters", tok!"__PRETTY_FUNCTION__", tok!"__TIME__",
        tok!"__TIMESTAMP__", tok!"__traits", tok!"__vector",
        tok!"__VENDOR__", tok!"__VERSION__":
        return true;
    default:
        return false;
    }
}
411 
/**
 * Tests whether a token ID is one of the three string literal token types.
 *
 * Params:
 *     type = the token ID to classify
 * Returns: true if the given ID type is for a string literal.
 */
public bool isStringLiteral(IdType type) pure nothrow @safe @nogc
{
    switch (type)
    {
    case tok!"stringLiteral", tok!"wstringLiteral", tok!"dstringLiteral":
        return true;
    default:
        return false;
    }
}
427 
/**
 * Tests whether a token ID is one of the protection attribute keywords
 * (`export`, `package`, `private`, `protected`, `public`).
 *
 * Params:
 *     type = the token ID to classify
 * Returns: true if the given ID type is for a protection attribute.
 */
public bool isProtection(IdType type) pure nothrow @safe @nogc
{
    switch (type)
    {
    case tok!"export", tok!"package", tok!"private", tok!"protected",
        tok!"public":
        return true;
    default:
        return false;
    }
}
445 
/**
 * Lexes `sourceCode` into the token array consumed by the parser.
 *
 * Whitespace, special token sequences and ordinary comments are dropped.
 * Documentation comments are not emitted as tokens; they are attached to
 * neighbouring tokens instead:
 * $(UL
 *   $(LI a doc comment that starts on the same line as the previously emitted
 *        token is stored in that token's `trailingComment`;)
 *   $(LI any other doc comment is stored in the `comment` field of the next
 *        emitted token. `///` lines are appended to the pending doc comment,
 *        or rewritten to a `/++ ... +/` block when nothing is pending.)
 * )
 * Lexing stops at the end of input or at the first `__EOF__` token.
 *
 * Params:
 *     sourceCode = the bytes of the source code to lex
 *     config = the lexer configuration; its `whitespaceBehavior` is always
 *         overridden with `WhitespaceBehavior.skip`
 *     cache = the string cache handed to the `DLexer`
 * Returns: the tokens lexed from the given source code.
 */
const(Token)[] getTokensForParser(ubyte[] sourceCode, LexerConfig config,
    StringCache* cache)
{
    enum CommentType : ubyte
    {
        notDoc,
        line,
        block
    }

    // Classifies a comment token by its opening characters.
    static CommentType commentType(string comment) pure nothrow @safe
    {
        if (comment.length < 3)
            return CommentType.notDoc;
        immutable string opener = comment[0 .. 3];
        if (opener == "///")
            return CommentType.line;
        // "/**/" and "/++/" are complete but empty ordinary comments, not doc
        // comments. Treating them as doc blocks would leave fewer than three
        // characters once the terminator is cut off while merging.
        if (comment.length > 4 && (opener == "/++" || opener == "/**"))
            return CommentType.block;
        return CommentType.notDoc;
    }

    // Appends the text of a `///` comment (marker already removed) to the
    // pending doc comment and returns the merged comment.
    static string appendDocLine(string pending, string lineText)
    {
        import std.string : lastIndexOf;

        // Nothing pending: rewrite the line comment as a block comment.
        if (pending.length == 0)
            return "/++" ~ lineText ~ "\n+/";

        // `pending` only ever holds a block doc comment or an earlier result
        // of this function.
        assert(pending.length > 4
            && (pending[0 .. 3] == "/**" || pending[0 .. 3] == "/++"));

        immutable ptrdiff_t lastNewline = pending.lastIndexOf("\n");
        if (lastNewline != -1)
        {
            // Insert the new text in front of the comment's last line.
            return pending[0 .. lastNewline + 1] ~ lineText
                ~ pending[lastNewline .. $];
        }
        // Single-line block: insert in front of the two-character terminator.
        return pending[0 .. $ - 2] ~ lineText ~ pending[$ - 2 .. $];
    }

    config.whitespaceBehavior = WhitespaceBehavior.skip;

    auto output = appender!(typeof(return))();
    auto lexer = DLexer(sourceCode, config, cache);
    string pendingDoc;
    loop: while (!lexer.empty) switch (lexer.front.type)
    {
    case tok!"specialTokenSequence":
    case tok!"whitespace":
        lexer.popFront();
        break;
    case tok!"comment":
        immutable string text = lexer.front.text;
        immutable CommentType kind = commentType(text);
        if (kind != CommentType.notDoc)
        {
            if (output.data.length > 0
                && lexer.front.line == output.data[$ - 1].line)
            {
                // Same line as the previous token: it documents that token.
                (cast() output.data[$ - 1]).trailingComment = text;
            }
            else if (kind == CommentType.block)
            {
                pendingDoc = text;
            }
            else
            {
                // Take the "///" off entirely before merging.
                pendingDoc = appendDocLine(pendingDoc, text[3 .. $]);
            }
        }
        lexer.popFront();
        break;
    case tok!"__EOF__":
        break loop;
    default:
        Token t = lexer.front;
        lexer.popFront();
        t.comment = pendingDoc;
        pendingDoc = null;
        output.put(t);
        break;
    }
    return output.data;
}
547 
548 /**
549  * The D lexer struct.
550  */
551 public struct DLexer
552 {
553     mixin Lexer!(Token, lexIdentifier, isSeparating, operators, dynamicTokens,
554         keywords, pseudoTokenHandlers);
555 
556     ///
557     @disable this();
558 
559     /**
560      * Params:
561      *     range = the bytes that compose the source code that will be lexed.
562      *     config = the lexer configuration to use.
563      *     cache = the string interning cache for de-duplicating identifiers and
564      *         other token text.
565      */
566     this(ubyte[] range, const LexerConfig config, StringCache* cache,
567         bool haveSSE42 = sse42()) pure nothrow @safe
568     {
569         this.haveSSE42 = haveSSE42;
570         auto r = (range.length >= 3 && range[0] == 0xef && range[1] == 0xbb && range[2] == 0xbf)
571             ? range[3 .. $] : range;
572         this.range = LexerRange(r);
573         this.config = config;
574         this.cache = cache;
575         popFront();
576     }
577 
578     ///
579     public void popFront()() pure nothrow @safe
580     {
581         do
582             _popFront();
583         while (config.whitespaceBehavior == WhitespaceBehavior.skip
584             && _front.type == tok!"whitespace");
585     }
586 
587 private pure nothrow @safe:
588 
589     bool isWhitespace()
590     {
591         switch (range.bytes[range.index])
592         {
593         case ' ':
594         case '\r':
595         case '\n':
596         case '\t':
597         case '\v':
598         case '\f':
599             return true;
600         case 0xe2:
601             auto peek = range.peek(2);
602             return peek.length == 2
603                 && peek[0] == 0x80
604                 && (peek[1] == 0xa8 || peek[1] == 0xa9);
605         default:
606             return false;
607         }
608     }
609 
610     void popFrontWhitespaceAware()
611     {
612         switch (range.bytes[range.index])
613         {
614         case '\r':
615             range.popFront();
616             if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n')
617             {
618                 range.popFront();
619                 range.incrementLine();
620             }
621             else
622                 range.incrementLine();
623             return;
624         case '\n':
625             range.popFront();
626             range.incrementLine();
627             return;
628         case 0xe2:
629             auto lookahead = range.peek(3);
630             if (lookahead.length == 3 && lookahead[1] == 0x80
631                 && (lookahead[2] == 0xa8 || lookahead[2] == 0xa9))
632             {
633                 range.index+=3;
634                 range.column+=3;
635                 range.incrementLine();
636                 return;
637             }
638             else
639             {
640                 range.popFront();
641                 return;
642             }
643         default:
644             range.popFront();
645             return;
646         }
647     }
648 
649     void lexWhitespace(ref Token token) @trusted
650     {
651         mixin (tokenStart);
652         loop: do
653         {
654             version (iasm64NotWindows)
655             {
656                 if (haveSSE42 && range.index + 16 < range.bytes.length)
657                 {
658                     skip!(true, '\t', ' ', '\v', '\f')(range.bytes.ptr + range.index,
659                         &range.index, &range.column);
660                 }
661             }
662             switch (range.bytes[range.index])
663             {
664             case '\r':
665                 range.popFront();
666                 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n')
667                 {
668                     range.popFront();
669                 }
670                 range.column = 1;
671                 range.line += 1;
672                 break;
673             case '\n':
674                 range.popFront();
675                 range.column = 1;
676                 range.line += 1;
677                 break;
678             case ' ':
679             case '\t':
680             case '\v':
681             case '\f':
682                 range.popFront();
683                 break;
684             case 0xe2:
685                 if (range.index + 2 >= range.bytes.length)
686                     break loop;
687                 if (range.bytes[range.index + 1] != 0x80)
688                     break loop;
689                 if (range.bytes[range.index + 2] == 0xa8 || range.bytes[range.index + 2] == 0xa9)
690                 {
691                     range.index += 3;
692                     range.column += 3;
693                     range.column = 1;
694                     range.line += 1;
695                     break;
696                 }
697                 break loop;
698             default:
699                 break loop;
700             }
701         } while (!(range.index >= range.bytes.length));
702     end:
703         string text = config.whitespaceBehavior == WhitespaceBehavior.include
704             ? cache.intern(range.slice(mark)) : "";
705         token = Token(tok!"whitespace", text, line, column, index);
706     }
707 
708     void lexNumber(ref Token token)
709     {
710         mixin (tokenStart);
711         if (range.bytes[range.index] == '0' && range.index + 1 < range.bytes.length)
712         {
713             auto ahead = range.bytes[range.index + 1];
714             switch (ahead)
715             {
716             case 'x':
717             case 'X':
718                 range.index += 2;
719                 range.column += 2;
720                 lexHex(token, mark, line, column, index);
721                 return;
722             case 'b':
723             case 'B':
724                 range.index += 2;
725                 range.column += 2;
726                 lexBinary(token, mark, line, column, index);
727                 return;
728             default:
729                 lexDecimal(token, mark, line, column, index);
730                 return;
731             }
732         }
733         else
734             lexDecimal(token, mark, line, column, index);
735     }
736 
737     void lexHex(ref Token token)
738     {
739         mixin (tokenStart);
740         lexHex(token, mark, line, column, index);
741     }
742 
743     void lexHex(ref Token token, size_t mark, size_t line, size_t column,
744         size_t index) @trusted
745     {
746         IdType type = tok!"intLiteral";
747         bool foundDot;
748         hexLoop: while (!(range.index >= range.bytes.length))
749         {
750             switch (range.bytes[range.index])
751             {
752             case 'a': .. case 'f':
753             case 'A': .. case 'F':
754             case '0': .. case '9':
755             case '_':
756                 version (iasm64NotWindows)
757                 {
758                     if (haveSSE42 && range.index + 16 < range.bytes.length)
759                     {
760                         immutable ulong i = rangeMatch!(false, '0', '9', 'a', 'f', 'A', 'F', '_', '_')
761                             (range.bytes.ptr + range.index);
762                         range.column += i;
763                         range.index += i;
764                     }
765                     else
766                         range.popFront();
767                 }
768                 else
769                     range.popFront();
770                 break;
771             case 'u':
772             case 'U':
773                 lexIntSuffix(type);
774                 break hexLoop;
775             case 'i':
776                 if (foundDot)
777                     lexFloatSuffix(type);
778                 break hexLoop;
779             case 'L':
780                 if (foundDot)
781                     lexFloatSuffix(type);
782                 else
783                     lexIntSuffix(type);
784                 break hexLoop;
785             case 'p':
786             case 'P':
787                 lexExponent(type);
788                 break hexLoop;
789             case '.':
790                 if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.')
791                     break hexLoop;
792                 else
793                 {
794                     // The following bit of silliness tries to tell the
795                     // difference between "int dot identifier" and
796                     // "double identifier".
797                     if ((range.index + 1 < range.bytes.length))
798                     {
799                         switch (range.peekAt(1))
800                         {
801                         case '0': .. case '9':
802                         case 'A': .. case 'F':
803                         case 'a': .. case 'f':
804                             goto doubleLiteral;
805                         default:
806                             break hexLoop;
807                         }
808                     }
809                     else
810                     {
811                     doubleLiteral:
812                         range.popFront();
813                         foundDot = true;
814                         type = tok!"doubleLiteral";
815                     }
816                 }
817                 break;
818             default:
819                 break hexLoop;
820             }
821         }
822         token = Token(type, cache.intern(range.slice(mark)), line, column,
823             index);
824     }
825 
826     void lexBinary(ref Token token)
827     {
828         mixin (tokenStart);
829         return lexBinary(token, mark, line, column, index);
830     }
831 
832     void lexBinary(ref Token token, size_t mark, size_t line, size_t column,
833         size_t index) @trusted
834     {
835         IdType type = tok!"intLiteral";
836         binaryLoop: while (!(range.index >= range.bytes.length))
837         {
838             switch (range.bytes[range.index])
839             {
840             case '0':
841             case '1':
842             case '_':
843                 version (iasm64NotWindows)
844                 {
845                     if (haveSSE42 && range.index + 16 < range.bytes.length)
846                     {
847                         immutable ulong i = rangeMatch!(false, '0', '1', '_', '_')(
848                             range.bytes.ptr + range.index);
849                         range.column += i;
850                         range.index += i;
851                     }
852                     else
853                         range.popFront();
854                 }
855                 else
856                     range.popFront();
857                 break;
858             case 'u':
859             case 'U':
860             case 'L':
861                 lexIntSuffix(type);
862                 break binaryLoop;
863             default:
864                 break binaryLoop;
865             }
866         }
867         token = Token(type, cache.intern(range.slice(mark)), line, column,
868             index);
869     }
870 
871     void lexDecimal(ref Token token)
872     {
873         mixin (tokenStart);
874         lexDecimal(token, mark, line, column, index);
875     }
876 
877     void lexDecimal(ref Token token, size_t mark, size_t line, size_t column,
878         size_t index) @trusted
879     {
880         bool foundDot = range.bytes[range.index] == '.';
881         IdType type = tok!"intLiteral";
882         if (foundDot)
883         {
884             range.popFront();
885             type = tok!"doubleLiteral";
886         }
887 
888         decimalLoop: while (!(range.index >= range.bytes.length))
889         {
890             switch (range.bytes[range.index])
891             {
892             case '0': .. case '9':
893             case '_':
894                 version (iasm64NotWindows)
895                 {
896                     if (haveSSE42 && range.index + 16 < range.bytes.length)
897                     {
898                         ulong i = rangeMatch!(false, '0', '9', '_', '_')(range.bytes.ptr + range.index);
899                         range.column += i;
900                         range.index += i;
901                     }
902                     else
903                         range.popFront();
904                 }
905                 else
906                     range.popFront();
907                 break;
908             case 'u':
909             case 'U':
910                 if (!foundDot)
911                     lexIntSuffix(type);
912                 break decimalLoop;
913             case 'i':
914                 lexFloatSuffix(type);
915                 break decimalLoop;
916             case 'L':
917                 if (foundDot)
918                     lexFloatSuffix(type);
919                 else
920                     lexIntSuffix(type);
921                 break decimalLoop;
922             case 'f':
923             case 'F':
924                 lexFloatSuffix(type);
925                 break decimalLoop;
926             case 'e':
927             case 'E':
928                 lexExponent(type);
929                 break decimalLoop;
930             case '.':
931                 if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.')
932                     break decimalLoop;
933                 else
934                 {
935                     // The following bit of silliness tries to tell the
936                     // difference between "int dot identifier" and
937                     // "double identifier".
938                     if ((range.index + 1 < range.bytes.length))
939                     {
940                         auto ch = range.peekAt(1);
941                         if (ch <= 0x2f
942                             || (ch >= '0' && ch <= '9')
943                             || (ch >= ':' && ch <= '@')
944                             || (ch >= '[' && ch <= '^')
945                             || (ch >= '{' && ch <= '~')
946                             || ch == '`' || ch == '_')
947                         {
948                             goto doubleLiteral;
949                         }
950                         else
951                             break decimalLoop;
952                     }
953                     else
954                     {
955                     doubleLiteral:
956                         range.popFront();
957                         foundDot = true;
958                         type = tok!"doubleLiteral";
959                     }
960                 }
961                 break;
962             default:
963                 break decimalLoop;
964             }
965         }
966         token = Token(type, cache.intern(range.slice(mark)), line, column,
967             index);
968     }
969 
970     void lexIntSuffix(ref IdType type)
971     {
972         bool secondPass;
973         if (range.bytes[range.index] == 'u' || range.bytes[range.index] == 'U')
974         {
975     U:
976             if (type == tok!"intLiteral")
977                 type = tok!"uintLiteral";
978             else
979                 type = tok!"ulongLiteral";
980             range.popFront();
981             if (secondPass)
982                 return;
983             if (range.index < range.bytes.length && (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l'))
984                 goto L;
985             return;
986         }
987         if (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l')
988         {
989     L:
990             if (type == tok!"uintLiteral")
991                 type = tok!"ulongLiteral";
992             else
993                 type = tok!"longLiteral";
994             range.popFront();
995             if (!secondPass && range.index < range.bytes.length && (range.bytes[range.index] == 'U' || range.bytes[range.index] == 'u'))
996             {
997                 secondPass = true;
998                 goto U;
999             }
1000             return;
1001         }
1002     }
1003 
1004     void lexFloatSuffix(ref IdType type) pure nothrow @safe
1005     {
1006         switch (range.bytes[range.index])
1007         {
1008         case 'L':
1009             range.popFront();
1010             type = tok!"doubleLiteral";
1011             break;
1012         case 'f':
1013         case 'F':
1014             range.popFront();
1015             type = tok!"floatLiteral";
1016             break;
1017         default:
1018             break;
1019         }
1020         if (!(range.index >= range.bytes.length) && range.bytes[range.index] == 'i')
1021         {
1022             warning("Complex number literals are deprecated");
1023             range.popFront();
1024             if (type == tok!"floatLiteral")
1025                 type = tok!"ifloatLiteral";
1026             else
1027                 type = tok!"idoubleLiteral";
1028         }
1029     }
1030 
1031     void lexExponent(ref IdType type) pure nothrow @safe
1032     {
1033         range.popFront();
1034         bool foundSign = false;
1035         bool foundDigit = false;
1036         while (!(range.index >= range.bytes.length))
1037         {
1038             switch (range.bytes[range.index])
1039             {
1040             case '-':
1041             case '+':
1042                 if (foundSign)
1043                 {
1044                     if (!foundDigit)
1045                     error("Expected an exponent");
1046                     return;
1047                 }
1048                 foundSign = true;
1049                 range.popFront();
1050                 break;
1051             case '0': .. case '9':
1052             case '_':
1053                 foundDigit = true;
1054                 range.popFront();
1055                 break;
1056             case 'L':
1057             case 'f':
1058             case 'F':
1059             case 'i':
1060                 lexFloatSuffix(type);
1061                 return;
1062             default:
1063                 if (!foundDigit)
1064                     error("Expected an exponent");
1065                 return;
1066             }
1067         }
1068     }
1069 
1070     void lexScriptLine(ref Token token)
1071     {
1072         mixin (tokenStart);
1073         while (!(range.index >= range.bytes.length) && !isNewline)
1074         {
1075             range.popFront();
1076         }
1077         token = Token(tok!"scriptLine", cache.intern(range.slice(mark)),
1078             line, column, index);
1079     }
1080 
1081     void lexSpecialTokenSequence(ref Token token)
1082     {
1083         mixin (tokenStart);
1084         while (!(range.index >= range.bytes.length) && !isNewline)
1085         {
1086             range.popFront();
1087         }
1088         token = Token(tok!"specialTokenSequence", cache.intern(range.slice(mark)),
1089             line, column, index);
1090     }
1091 
1092     void lexSlashStarComment(ref Token token) @trusted
1093     {
1094         mixin (tokenStart);
1095         IdType type = tok!"comment";
1096         range.popFrontN(2);
1097         while (range.index < range.bytes.length)
1098         {
1099             version (iasm64NotWindows)
1100             {
1101                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1102                     skip!(false, '\r', '\n', '/', '*', 0xe2)(range.bytes.ptr + range.index,
1103                         &range.index, &range.column);
1104             }
1105             if (range.bytes[range.index] == '*')
1106             {
1107                 range.popFront();
1108                 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/')
1109                 {
1110                     range.popFront();
1111                     break;
1112                 }
1113             }
1114             else
1115                 popFrontWhitespaceAware();
1116         }
1117     end:
1118         token = Token(type, cache.intern(range.slice(mark)), line, column,
1119             index);
1120     }
1121 
1122     void lexSlashSlashComment(ref Token token) @trusted
1123     {
1124         mixin (tokenStart);
1125         IdType type = tok!"comment";
1126         range.popFrontN(2);
1127         while (range.index < range.bytes.length)
1128         {
1129             version (iasm64NotWindows)
1130             {
1131                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1132                 {
1133                     skip!(false, '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
1134                         &range.index, &range.column);
1135                 }
1136             }
1137             if (range.bytes[range.index] == '\r' || range.bytes[range.index] == '\n')
1138                 break;
1139             range.popFront();
1140         }
1141     end:
1142         token =  Token(type, cache.intern(range.slice(mark)), line, column,
1143             index);
1144     }
1145 
1146     void lexSlashPlusComment(ref Token token) @trusted
1147     {
1148         mixin (tokenStart);
1149         IdType type = tok!"comment";
1150         range.index += 2;
1151         range.column += 2;
1152         int depth = 1;
1153         while (depth > 0 && !(range.index >= range.bytes.length))
1154         {
1155             version (iasm64NotWindows)
1156             {
1157                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1158                 {
1159                     skip!(false, '+', '/', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
1160                         &range.index, &range.column);
1161                 }
1162             }
1163             if (range.bytes[range.index] == '+')
1164             {
1165                 range.popFront();
1166                 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/')
1167                 {
1168                     range.popFront();
1169                     depth--;
1170                 }
1171             }
1172             else if (range.bytes[range.index] == '/')
1173             {
1174                 range.popFront();
1175                 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '+')
1176                 {
1177                     range.popFront();
1178                     depth++;
1179                 }
1180             }
1181             else
1182                 popFrontWhitespaceAware();
1183         }
1184         token = Token(type, cache.intern(range.slice(mark)), line, column,
1185             index);
1186     }
1187 
1188     void lexStringLiteral(ref Token token) @trusted
1189     {
1190         mixin (tokenStart);
1191         range.popFront();
1192         while (true)
1193         {
1194             if ((range.index >= range.bytes.length))
1195             {
1196                 error("Error: unterminated string literal");
1197                 token = Token(tok!"");
1198                 return;
1199             }
1200             version (iasm64NotWindows)
1201             {
1202                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1203                 {
1204                     skip!(false, '"', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
1205                         &range.index, &range.column);
1206                 }
1207             }
1208             if (range.bytes[range.index] == '"')
1209             {
1210                 range.popFront();
1211                 break;
1212             }
1213             else if (range.bytes[range.index] == '\\')
1214             {
1215                 lexEscapeSequence();
1216             }
1217             else
1218                 popFrontWhitespaceAware();
1219         }
1220         IdType type = tok!"stringLiteral";
1221         lexStringSuffix(type);
1222         token = Token(type, cache.intern(range.slice(mark)), line, column,
1223             index);
1224     }
1225 
1226     void lexWysiwygString(ref Token token) @trusted
1227     {
1228         mixin (tokenStart);
1229         IdType type = tok!"stringLiteral";
1230         bool backtick = range.bytes[range.index] == '`';
1231         if (backtick)
1232         {
1233             range.popFront();
1234             while (true)
1235             {
1236                 if ((range.index >= range.bytes.length))
1237                 {
1238                     error("Error: unterminated string literal");
1239                     token = Token(tok!"");
1240                     return;
1241                 }
1242                 version (iasm64NotWindows)
1243                 {
1244                     if (haveSSE42 && range.index + 16 < range.bytes.length)
1245                     {
1246                         skip!(false, '\r', '\n', 0xe2, '`')(range.bytes.ptr + range.index,
1247                             &range.index, &range.column);
1248                     }
1249                 }
1250                 if (range.bytes[range.index] == '`')
1251                 {
1252                     range.popFront();
1253                     break;
1254                 }
1255                 else
1256                     popFrontWhitespaceAware();
1257             }
1258         }
1259         else
1260         {
1261             range.popFront();
1262             if ((range.index >= range.bytes.length))
1263             {
1264                 error("Error: unterminated string literal");
1265                 token = Token(tok!"");
1266                 return;
1267             }
1268             range.popFront();
1269             while (true)
1270             {
1271                 if ((range.index >= range.bytes.length))
1272                 {
1273                     error("Error: unterminated string literal");
1274                     token = Token(tok!"");
1275                     return;
1276                 }
1277                 else if (range.bytes[range.index] == '"')
1278                 {
1279                     range.popFront();
1280                     break;
1281                 }
1282                 else
1283                     popFrontWhitespaceAware();
1284             }
1285         }
1286         lexStringSuffix(type);
1287         token = Token(type, cache.intern(range.slice(mark)), line, column,
1288             index);
1289     }
1290 
1291     private ubyte lexStringSuffix(ref IdType type) pure nothrow @safe
1292     {
1293         if ((range.index >= range.bytes.length))
1294         {
1295             type = tok!"stringLiteral";
1296             return 0;
1297         }
1298         else
1299         {
1300             switch (range.bytes[range.index])
1301             {
1302             case 'w': range.popFront(); type = tok!"wstringLiteral"; return 'w';
1303             case 'd': range.popFront(); type = tok!"dstringLiteral"; return 'd';
1304             case 'c': range.popFront(); type = tok!"stringLiteral"; return 'c';
1305             default: type = tok!"stringLiteral"; return 0;
1306             }
1307         }
1308     }
1309 
1310     void lexDelimitedString(ref Token token)
1311     {
1312         mixin (tokenStart);
1313         range.index += 2;
1314         range.column += 2;
1315         ubyte open;
1316         ubyte close;
1317         switch (range.bytes[range.index])
1318         {
1319         case '<':
1320             open = '<';
1321             close = '>';
1322             range.popFront();
1323             lexNormalDelimitedString(token, mark, line, column, index, open, close);
1324             break;
1325         case '{':
1326             open = '{';
1327             close = '}';
1328             range.popFront();
1329             lexNormalDelimitedString(token, mark, line, column, index, open, close);
1330             break;
1331         case '[':
1332             open = '[';
1333             close = ']';
1334             range.popFront();
1335             lexNormalDelimitedString(token, mark, line, column, index, open, close);
1336             break;
1337         case '(':
1338             open = '(';
1339             close = ')';
1340             range.popFront();
1341             lexNormalDelimitedString(token, mark, line, column, index, open, close);
1342             break;
1343         default:
1344             lexHeredocString(token, mark, line, column, index);
1345             break;
1346         }
1347     }
1348 
1349     void lexNormalDelimitedString(ref Token token, size_t mark, size_t line, size_t column,
1350         size_t index, ubyte open, ubyte close)
1351     {
1352         int depth = 1;
1353         while (!(range.index >= range.bytes.length) && depth > 0)
1354         {
1355             if (range.bytes[range.index] == open)
1356             {
1357                 depth++;
1358                 range.popFront();
1359             }
1360             else if (range.bytes[range.index] == close)
1361             {
1362                 depth--;
1363                 range.popFront();
1364                 if (depth <= 0)
1365                 {
1366                     if (range.bytes[range.index] == '"')
1367                     {
1368                         range.popFront();
1369                     }
1370                     else
1371                     {
1372                         error("Error: \" expected to end delimited string literal");
1373                         token = Token(tok!"");
1374                         return;
1375                     }
1376                 }
1377             }
1378             else
1379                 popFrontWhitespaceAware();
1380         }
1381         IdType type = tok!"stringLiteral";
1382         lexStringSuffix(type);
1383         token = Token(type, cache.intern(range.slice(mark)), line, column, index);
1384     }
1385 
1386     void lexHeredocString(ref Token token, size_t mark, size_t line, size_t column, size_t index)
1387     {
1388         Token ident;
1389         lexIdentifier(ident);
1390         if (isNewline())
1391             popFrontWhitespaceAware();
1392         else
1393             error("Newline expected");
1394         while (!(range.index >= range.bytes.length))
1395         {
1396             if (isNewline())
1397             {
1398                 popFrontWhitespaceAware();
1399                 if (!range.canPeek(ident.text.length))
1400                 {
1401                     error(ident.text ~ " expected");
1402                     break;
1403                 }
1404                 if (range.peek(ident.text.length - 1) == ident.text)
1405                 {
1406                     range.popFrontN(ident.text.length);
1407                     break;
1408                 }
1409             }
1410             else
1411             {
1412                 range.popFront();
1413             }
1414         }
1415         if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '"')
1416         {
1417             range.popFront();
1418         }
1419         else
1420             error(`" expected`);
1421         IdType type = tok!"stringLiteral";
1422         lexStringSuffix(type);
1423         token = Token(type, cache.intern(range.slice(mark)), line, column, index);
1424     }
1425 
    /**
     * Lexes a token string (`q{ ... }`).
     *
     * The body is not scanned byte by byte. Instead the lexer is re-entered:
     * tokens are pulled through `front`/`popFront` and their text is
     * concatenated into a buffer until the braces balance. The resulting
     * token text is built from that buffer rather than sliced from the
     * input.
     *
     * Params:
     *     token = receives the resulting string literal token.
     */
    void lexTokenString(ref Token token)
    {
        mixin (tokenStart);
        assert (range.bytes[range.index] == 'q');
        range.popFront();
        assert (range.bytes[range.index] == '{');
        range.popFront();
        auto app = appender!string();
        app.put("q{");
        // Brace nesting level; the opening brace consumed above counts as 1.
        int depth = 1;

        // While the body is lexed, whitespace tokens are included and string
        // literals keep their source form; the previous settings are restored
        // when this function exits.
        immutable WhitespaceBehavior oldWhitespace = config.whitespaceBehavior;
        immutable StringBehavior oldString = config.stringBehavior;
        config.whitespaceBehavior = WhitespaceBehavior.include;
        config.stringBehavior = StringBehavior.source;
        scope (exit)
        {
            config.whitespaceBehavior = oldWhitespace;
            config.stringBehavior = oldString;
        }

        advance(_front);
        while (depth > 0 && !empty)
        {
            auto t = front();
            // Tokens without text contribute the string form of their type.
            if (t.text is null)
                app.put(str(t.type));
            else
                app.put(t.text);
            if (t.type == tok!"}")
            {
                depth--;
                // The brace that closes the token string is not popped.
                // NOTE(review): presumably so that nothing beyond the end of
                // the literal is lexed here; confirm against advance().
                if (depth > 0)
                popFront();
            }
            else if (t.type == tok!"{")
            {
                depth++;
                popFront();
            }
            else
                popFront();
        }
        IdType type = tok!"stringLiteral";
        // A non-zero return value is the suffix byte that was consumed; it is
        // appended so that the token text includes the suffix.
        auto b = lexStringSuffix(type);
        if (b != 0)
            app.put(b);
        token = Token(type, cache.intern(cast(const(ubyte)[]) app.data), line,
            column, index);
    }
1476 
1477     void lexHexString(ref Token token)
1478     {
1479         mixin (tokenStart);
1480         range.index += 2;
1481         range.column += 2;
1482 
1483         loop: while (true)
1484         {
1485             if ((range.index >= range.bytes.length))
1486             {
1487                 error("Error: unterminated hex string literal");
1488                 token = Token(tok!"");
1489                 return;
1490             }
1491             else if (isWhitespace())
1492                 popFrontWhitespaceAware();
1493             else switch (range.bytes[range.index])
1494             {
1495             case '0': .. case '9':
1496             case 'A': .. case 'F':
1497             case 'a': .. case 'f':
1498                 range.popFront();
1499                 break;
1500             case '"':
1501                 range.popFront();
1502                 break loop;
1503             default:
1504                 error("Error: invalid character in hex string");
1505                 token = Token(tok!"");
1506                 return;
1507             }
1508         }
1509 
1510         IdType type = tok!"stringLiteral";
1511         lexStringSuffix(type);
1512         token = Token(type, cache.intern(range.slice(mark)), line, column,
1513             index);
1514     }
1515 
1516     bool lexEscapeSequence()
1517     {
1518         range.popFront();
1519         if ((range.index >= range.bytes.length))
1520         {
1521             error("Error: non-terminated character escape sequence.");
1522             return false;
1523         }
1524         switch (range.bytes[range.index])
1525         {
1526         case '\'':
1527         case '"':
1528         case '?':
1529         case '\\':
1530         case 'a':
1531         case 'b':
1532         case 'f':
1533         case 'n':
1534         case 'r':
1535         case 't':
1536         case 'v':
1537             range.popFront();
1538             break;
1539         case 'x':
1540             range.popFront();
1541             foreach (i; 0 .. 2)
1542             {
1543                 if ((range.index >= range.bytes.length))
1544                 {
1545                     error("Error: 2 hex digits expected.");
1546                     return false;
1547                 }
1548                 switch (range.bytes[range.index])
1549                 {
1550                 case '0': .. case '9':
1551                 case 'a': .. case 'f':
1552                 case 'A': .. case 'F':
1553                     range.popFront();
1554                     break;
1555                 default:
1556                     error("Error: 2 hex digits expected.");
1557                     return false;
1558                 }
1559             }
1560             break;
1561         case '0':
1562             if (!(range.index + 1 < range.bytes.length) || ((range.index + 1 < range.bytes.length) && range.peekAt(1) == '\''))
1563             {
1564                 range.popFront();
1565                 break;
1566             }
1567             goto case;
1568         case '1': .. case '7':
1569             for (size_t i = 0; i < 3 && !(range.index >= range.bytes.length) && range.bytes[range.index] >= '0' && range.bytes[range.index] <= '7'; i++)
1570                 range.popFront();
1571             break;
1572         case 'u':
1573             range.popFront();
1574             foreach (i; 0 .. 4)
1575             {
1576                 if ((range.index >= range.bytes.length))
1577                 {
1578                     error("Error: at least 4 hex digits expected.");
1579                     return false;
1580                 }
1581                 switch (range.bytes[range.index])
1582                 {
1583                 case '0': .. case '9':
1584                 case 'a': .. case 'f':
1585                 case 'A': .. case 'F':
1586                     range.popFront();
1587                     break;
1588                 default:
1589                     error("Error: at least 4 hex digits expected.");
1590                     return false;
1591                 }
1592             }
1593             break;
1594         case 'U':
1595             range.popFront();
1596             foreach (i; 0 .. 8)
1597             {
1598                 if ((range.index >= range.bytes.length))
1599                 {
1600                     error("Error: at least 8 hex digits expected.");
1601                     return false;
1602                 }
1603                 switch (range.bytes[range.index])
1604                 {
1605                 case '0': .. case '9':
1606                 case 'a': .. case 'f':
1607                 case 'A': .. case 'F':
1608                     range.popFront();
1609                     break;
1610                 default:
1611                     error("Error: at least 8 hex digits expected.");
1612                     return false;
1613                 }
1614             }
1615             break;
1616         default:
1617             while (true)
1618             {
1619                 if ((range.index >= range.bytes.length))
1620                 {
1621                     error("Error: non-terminated character escape sequence.");
1622                     return false;
1623                 }
1624                 if (range.bytes[range.index] == ';')
1625                 {
1626                     range.popFront();
1627                     break;
1628                 }
1629                 else
1630                 {
1631                     range.popFront();
1632                 }
1633             }
1634         }
1635         return true;
1636     }
1637 
1638     void lexCharacterLiteral(ref Token token)
1639     {
1640         mixin (tokenStart);
1641         range.popFront();
1642         if (range.bytes[range.index] == '\\')
1643         {
1644             lexEscapeSequence();
1645             goto close;
1646         }
1647         else if (range.bytes[range.index] == '\'')
1648         {
1649             range.popFront();
1650             token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
1651                 line, column, index);
1652         }
1653         else if (range.bytes[range.index] & 0x80)
1654         {
1655             while (range.bytes[range.index] & 0x80)
1656             {
1657                 range.popFront();
1658             }
1659             goto close;
1660         }
1661         else
1662         {
1663             popFrontWhitespaceAware();
1664             goto close;
1665         }
1666     close:
1667         if (range.index < range.bytes.length && range.bytes[range.index] == '\'')
1668         {
1669             range.popFront();
1670             token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
1671                 line, column, index);
1672         }
1673         else
1674         {
1675             error("Error: Expected ' to end character literal");
1676             token = Token(tok!"");
1677         }
1678     }
1679 
1680     void lexIdentifier(ref Token token) @trusted
1681     {
1682         mixin (tokenStart);
1683         if (isSeparating(0))
1684         {
1685             error("Invalid identifier");
1686             range.popFront();
1687         }
1688         while (true)
1689         {
1690             version (iasm64NotWindows)
1691             {
1692                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1693                 {
1694                     immutable ulong i = rangeMatch!(false, 'a', 'z', 'A', 'Z', '_', '_')
1695                         (range.bytes.ptr + range.index);
1696                     range.column += i;
1697                     range.index += i;
1698                 }
1699             }
1700             if (isSeparating(0))
1701                 break;
1702             else
1703                 range.popFront();
1704         }
1705         token = Token(tok!"identifier", cache.intern(range.slice(mark)), line,
1706             column, index);
1707     }
1708 
1709     void lexDot(ref Token token)
1710     {
1711         mixin (tokenStart);
1712         if (!(range.index + 1 < range.bytes.length))
1713         {
1714             range.popFront();
1715             token = Token(tok!".", null, line, column, index);
1716             return;
1717         }
1718         switch (range.peekAt(1))
1719         {
1720         case '0': .. case '9':
1721             lexNumber(token);
1722             return;
1723         case '.':
1724             range.popFront();
1725             range.popFront();
1726             if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '.')
1727             {
1728                 range.popFront();
1729                 token = Token(tok!"...", null, line, column, index);
1730             }
1731             else
1732                 token = Token(tok!"..", null, line, column, index);
1733             return;
1734         default:
1735             range.popFront();
1736             token = Token(tok!".", null, line, column, index);
1737             return;
1738         }
1739     }
1740 
1741     void lexLongNewline(ref Token token) @nogc
1742     {
1743         mixin (tokenStart);
1744         range.popFront();
1745         range.popFront();
1746         range.popFront();
1747         range.incrementLine();
1748         string text = config.whitespaceBehavior == WhitespaceBehavior.include
1749             ? cache.intern(range.slice(mark)) : "";
1750         token = Token(tok!"whitespace", text, line,
1751             column, index);
1752     }
1753 
1754     bool isNewline() @nogc
1755     {
1756         if (range.bytes[range.index] == '\n') return true;
1757         if (range.bytes[range.index] == '\r') return true;
1758         return (range.bytes[range.index] & 0x80) && (range.index + 2 < range.bytes.length)
1759             && (range.peek(2) == "\u2028" || range.peek(2) == "\u2029");
1760     }
1761 
1762     bool isSeparating(size_t offset) @nogc
1763     {
1764         enum : ubyte
1765         {
1766             n, y, m // no, yes, maybe
1767         }
1768 
1769         if (range.index + offset >= range.bytes.length)
1770             return true;
1771         auto c = range.bytes[range.index + offset];
1772         static immutable ubyte[256] LOOKUP_TABLE = [
1773             y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
1774             y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
1775             y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
1776             n, n, n, n, n, n, n, n, n, n, y, y, y, y, y, y,
1777             y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n,
1778             n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, n,
1779             y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n,
1780             n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, y,
1781             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1782             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1783             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1784             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1785             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1786             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1787             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1788             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m
1789         ];
1790         immutable ubyte result = LOOKUP_TABLE[c];
1791         if (result == n)
1792             return false;
1793         if (result == y)
1794             return true;
1795         if (result == m)
1796         {
1797             auto r = range;
1798             range.popFrontN(offset);
1799             return (r.canPeek(2) && (r.peek(2) == "\u2028"
1800                 || r.peek(2) == "\u2029"));
1801         }
1802         assert (false);
1803     }
1804 
1805 
1806 
    /// Code mixed in (`mixin (tokenStart);`) at the top of the lex functions.
    /// It snapshots the position at which the token begins into the locals
    /// `index`, `column` and `line`, and takes a range mark in `mark` that
    /// is later passed to `range.slice` to obtain the token text.
    enum tokenStart = q{
        size_t index = range.index;
        size_t column = range.column;
        size_t line = range.line;
        auto mark = range.mark();
    };
1813 
1814     void error(string message)
1815     {
1816         messages ~= Message(range.line, range.column, message, true);
1817     }
1818 
1819     void warning(string message)
1820     {
1821         messages ~= Message(range.line, range.column, message, false);
1822         assert (messages.length > 0);
1823     }
1824 
    /// A diagnostic recorded by the lexer through `error` or `warning`.
    static struct Message
    {
        /// Line of the range at the time the message was recorded.
        size_t line;
        /// Column of the range at the time the message was recorded.
        size_t column;
        /// Text of the diagnostic.
        string message;
        /// `true` when recorded by `error`, `false` when recorded by `warning`.
        bool isError;
    }
1832 
    /// Diagnostics collected so far; appended to by `error` and `warning`.
    Message[] messages;
    /// String cache through which token text is interned.
    StringCache* cache;
    /// Lexer configuration (whitespace and string behavior are read from it).
    LexerConfig config;
    /// Gates the `skip`/`rangeMatch` fast paths compiled in under
    /// `version (iasm64NotWindows)`.
    bool haveSSE42;
1837 }
1838 
1839 /**
1840  * Creates a token range from the given source code. Creates a default lexer
1841  * configuration and a GC-managed string cache.
1842  */
1843 public auto byToken(ubyte[] range)
1844 {
1845     LexerConfig config;
1846     StringCache* cache = new StringCache(StringCache.defaultBucketCount);
1847     return DLexer(range, config, cache);
1848 }
1849 
1850 /**
1851  * Creates a token range from the given source code. Uses the given string
1852  * cache.
1853  */
1854 public auto byToken(ubyte[] range, StringCache* cache)
1855 {
1856     LexerConfig config;
1857     return DLexer(range, config, cache);
1858 }
1859 
1860 /**
1861  * Creates a token range from the given source code. Uses the provided lexer
1862  * configuration and string cache.
1863  */
1864 public auto byToken(ubyte[] range, const LexerConfig config, StringCache* cache)
1865 {
1866     return DLexer(range, config, cache);
1867 }
1868 
1869 /**
1870  * Removes "decoration" such as leading whitespace, leading + and * characters,
1871  * and places the result into the given output range
1872  */
1873 public void unDecorateComment(T)(string comment, auto ref T outputRange)
1874     if (isOutputRange!(T, string))
1875 in
1876 {
1877     assert (comment.length >= 3);
1878 }
1879 do
1880 {
1881     switch (comment[0 .. 3])
1882     {
1883     case "///":
1884         size_t i = 3;
1885         if (i < comment.length)
1886         {
1887         again:
1888             while (i < comment.length && (comment[i] == ' ' || comment[i] == '\t'))
1889                 i++;
1890             size_t j = i + 1;
1891             while (j < comment.length)
1892             {
1893                 if (comment[j] == '\r')
1894                     j++;
1895                 if (j >= comment.length)
1896                     break;
1897                 if (comment[j] == '\n')
1898                 {
1899                     outputRange.put(comment[i .. j]);
1900                     j++;
1901                     while (j < comment.length && comment[j] == '/')
1902                         j++;
1903                     outputRange.put('\n');
1904                     i = j;
1905                     goto again;
1906                 }
1907                 j++;
1908             }
1909             if (i < comment.length && j <= comment.length)
1910                 outputRange.put(comment[i .. j]);
1911         }
1912         break;
1913     case "/++":
1914     case "/**":
1915         size_t i = 3;
1916         immutable char c = comment[1];
1917         // Skip leading * and + characters
1918         while (comment[i] == c) i++;
1919         // Skip trailing * and + characters
1920         size_t j = comment.length - 2;
1921         while (j > i && comment[j] == c)
1922             j--;
1923         while (j > i && (comment[j] == ' ' || comment[j] == '\t'))
1924             j--;
1925         j++;
1926         size_t k = i;
1927         while (k < j)
1928         {
1929             if (comment[k] == '\n')
1930             {
1931                 k++;
1932                 break;
1933             }
1934             k++;
1935         }
1936         outputRange.put(comment[i .. k]);
1937         i = k;
1938         if (comment[i] == '\r') i++;
1939         if (comment[i] == '\n') i++;
1940         while (comment[i] == ' ' || comment[i] == '\t') i++;
1941         immutable bool skipBeginningChar = comment[i] == c;
1942         if (skipBeginningChar)
1943             i++;
1944         size_t whitespaceToSkip;
1945         while (comment[i] == ' ' || comment[i] == '\t')
1946         {
1947             whitespaceToSkip++;
1948             i++;
1949         }
1950         size_t l = i;
1951         while (i < j)
1952         {
1953             if (comment[i++] == '\n')
1954                 break;
1955         }
1956         outputRange.put(comment[l .. i]);
1957         while (true)
1958         {
1959             if (skipBeginningChar)
1960             {
1961                 while (i < j && (comment[i] == ' ' || comment[i] == '\t')) i++;
1962                 if (i < j && comment[i] == c) i++;
1963             }
1964             for (size_t s = 0; (i < j) && (s < whitespaceToSkip)
1965                 && (comment[i] == ' ' || comment[i] == '\t');)
1966             {
1967                 s++;
1968                 i++;
1969             }
1970             k = i;
1971             inner: while (k < j)
1972             {
1973                 if (comment[k] == '\n')
1974                 {
1975                     k++;
1976                     break inner;
1977                 }
1978                 k++;
1979             }
1980             outputRange.put(comment[i .. k]);
1981             i = k;
1982             if (i >= j)
1983                 break;
1984         }
1985         break;
1986     default:
1987         outputRange.put(comment);
1988         break;
1989     }
1990 }
1991 
1992 
1993 /**
1994  * The string cache is used for string interning.
1995  *
1996  * It will only store a single copy of any string that it is asked to hold.
1997  * Interned strings can be compared for equality by comparing their $(B .ptr)
1998  * field.
1999  *
2000  * Default and postbilt constructors are disabled. When a StringCache goes out
2001  * of scope, the memory held by it is freed.
2002  *
2003  * See_also: $(LINK http://en.wikipedia.org/wiki/String_interning)
2004  */
2005 struct StringCache
2006 {
2007 public pure nothrow @nogc:
2008 
2009     @disable this();
2010     @disable this(this);
2011 
2012     /**
2013      * Params: bucketCount = the initial number of buckets. Must be a
2014      * power of two
2015      */
2016     this(size_t bucketCount) nothrow @trusted @nogc
2017     in
2018     {
2019         import core.bitop : popcnt;
2020         static if (size_t.sizeof == 8)
2021         {
2022             immutable low = popcnt(cast(uint) bucketCount);
2023             immutable high = popcnt(cast(uint) (bucketCount >> 32));
2024             assert ((low == 0 && high == 1) || (low == 1 && high == 0));
2025         }
2026         else
2027         {
2028             static assert (size_t.sizeof == 4);
2029             assert (popcnt(cast(uint) bucketCount) == 1);
2030         }
2031     }
2032     do
2033     {
2034         buckets = (cast(Node**) calloc((Node*).sizeof, bucketCount))[0 .. bucketCount];
2035     }
2036 
2037     void freeItAll()
2038     {
2039         Block* current = rootBlock;
2040         while (current !is null)
2041         {
2042             Block* prev = current;
2043             current = current.next;
2044             free(cast(void*) prev);
2045         }
2046         foreach (nodePointer; buckets)
2047         {
2048             Node* currentNode = nodePointer;
2049             while (currentNode !is null)
2050             {
2051                 if (currentNode.mallocated)
2052                     free(currentNode.str.ptr);
2053                 Node* prev = currentNode;
2054                 currentNode = currentNode.next;
2055                 free(prev);
2056             }
2057         }
2058         rootBlock = null;
2059         free(buckets.ptr);
2060         buckets = null;
2061     }
2062 
2063     /**
2064      * Caches a string.
2065      */
2066     string intern(const(ubyte)[] str) @safe
2067     {
2068         if (str is null || str.length == 0)
2069             return "";
2070         return _intern(str);
2071     }
2072 
2073     /**
2074      * ditto
2075      */
2076     string intern(string str) @trusted
2077     {
2078         return intern(cast(ubyte[]) str);
2079     }
2080 
2081     /**
2082      * The default bucket count for the string cache.
2083      */
2084     static enum defaultBucketCount = 4096;
2085 
2086 private:
2087 
2088     string _intern(const(ubyte)[] bytes) @trusted
2089     {
2090         immutable uint hash = hashBytes(bytes);
2091         immutable size_t index = hash & (buckets.length - 1);
2092         Node* s = find(bytes, hash);
2093         if (s !is null)
2094             return cast(string) s.str;
2095         ubyte[] mem = void;
2096         bool mallocated = bytes.length > BIG_STRING;
2097         if (mallocated)
2098             mem = (cast(ubyte*) malloc(bytes.length))[0 .. bytes.length];
2099         else
2100             mem = allocate(bytes.length);
2101         mem[] = bytes[];
2102         Node* node = cast(Node*) malloc(Node.sizeof);
2103         node.str = mem;
2104         node.hash = hash;
2105         node.next = buckets[index];
2106         node.mallocated = mallocated;
2107         buckets[index] = node;
2108         return cast(string) mem;
2109     }
2110 
2111     Node* find(const(ubyte)[] bytes, uint hash) @trusted
2112     {
2113         import std.algorithm : equal;
2114         immutable size_t index = hash & (buckets.length - 1);
2115         Node* node = buckets[index];
2116         while (node !is null)
2117         {
2118             if (node.hash == hash && bytes == cast(ubyte[]) node.str)
2119                 return node;
2120             node = node.next;
2121         }
2122         return node;
2123     }
2124 
2125     static uint hashBytes(const(ubyte)[] data) pure nothrow @trusted @nogc
2126     in
2127     {
2128         assert (data !is null);
2129         assert (data.length > 0);
2130     }
2131     do
2132     {
2133         immutable uint m = 0x5bd1e995;
2134         immutable int r = 24;
2135         uint h = cast(uint) data.length;
2136         while (data.length >= 4)
2137         {
2138             uint k = (cast(ubyte) data[3]) << 24
2139                 | (cast(ubyte) data[2]) << 16
2140                 | (cast(ubyte) data[1]) << 8
2141                 | (cast(ubyte) data[0]);
2142             k *= m;
2143             k ^= k >> r;
2144             k *= m;
2145             h *= m;
2146             h ^= k;
2147             data = data[4 .. $];
2148         }
2149         switch (data.length & 3)
2150         {
2151         case 3:
2152             h ^= data[2] << 16;
2153             goto case;
2154         case 2:
2155             h ^= data[1] << 8;
2156             goto case;
2157         case 1:
2158             h ^= data[0];
2159             h *= m;
2160             break;
2161         default:
2162             break;
2163         }
2164         h ^= h >> 13;
2165         h *= m;
2166         h ^= h >> 15;
2167         return h;
2168     }
2169 
2170     ubyte[] allocate(size_t numBytes) pure nothrow @trusted @nogc
2171     in
2172     {
2173         assert (numBytes != 0);
2174     }
2175     out (result)
2176     {
2177         assert (result.length == numBytes);
2178     }
2179     do
2180     {
2181         Block* r = rootBlock;
2182         size_t i = 0;
2183         while  (i <= 3 && r !is null)
2184         {
2185             immutable size_t available = r.bytes.length;
2186             immutable size_t oldUsed = r.used;
2187             immutable size_t newUsed = oldUsed + numBytes;
2188             if (newUsed <= available)
2189             {
2190                 r.used = newUsed;
2191                 return r.bytes[oldUsed .. newUsed];
2192             }
2193             i++;
2194             r = r.next;
2195         }
2196         Block* b = cast(Block*) calloc(Block.sizeof, 1);
2197         b.used = numBytes;
2198         b.next = rootBlock;
2199         rootBlock = b;
2200         return b.bytes[0 .. numBytes];
2201     }
2202 
2203     static struct Node
2204     {
2205         ubyte[] str = void;
2206         Node* next = void;
2207         uint hash = void;
2208         bool mallocated = void;
2209     }
2210 
2211     static struct Block
2212     {
2213         Block* next;
2214         size_t used;
2215         enum BLOCK_CAPACITY = BLOCK_SIZE - size_t.sizeof - (void*).sizeof;
2216         ubyte[BLOCK_CAPACITY] bytes;
2217     }
2218 
2219     static assert (BLOCK_SIZE == Block.sizeof);
2220 
2221     enum BLOCK_SIZE = 1024 * 16;
2222 
2223     // If a string would take up more than 1/4 of a block, allocate it outside
2224     // of the block.
2225     enum BIG_STRING = BLOCK_SIZE / 4;
2226 
2227     Node*[] buckets;
2228     Block* rootBlock;
2229 }
2230 
// C allocator entry points, re-declared here with the `pure`, `nothrow` and
// `@nogc` attributes so that they can be called from code carrying those
// attributes (StringCache's members are declared `pure nothrow @nogc`).
private extern(C) void* calloc(size_t, size_t) nothrow pure @nogc @trusted;
private extern(C) void* malloc(size_t) nothrow pure @nogc @trusted;
private extern(C) void free(void*) nothrow pure @nogc @trusted;
2234 
// Lexing a simple import declaration yields exactly the expected token types.
unittest
{
    // Token string with the `c` postfix (an ordinary char string).
    auto source = cast(ubyte[]) q{ import std.stdio;}c;
    auto tokens = getTokensForParser(source, LexerConfig(),
        new StringCache(StringCache.defaultBucketCount));
    assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".",
        tok!"identifier", tok!";"]));
}
2243 
/// Test \x char sequence
unittest
{
    auto lex = (string text) => byToken(cast(ubyte[]) text);
    // First diagnostic produced while lexing `text`.
    auto firstMessage = (string text) => lex(text).messages[0];
    enum expected = "Error: 2 hex digits expected.";

    // valid: every combination of two hex digits is a character literal
    immutable digits = "0123456789abcdefABCDEF";
    string literals;
    foreach (hi; digits)
    {
        foreach (lo; digits)
            literals ~= "'\\x" ~ hi ~ lo ~ "'";
    }
    assert (lex(literals).filter!(t => t.type != tok!"characterLiteral").empty);

    // invalid: no digits at all, or a non-digit in the first position
    assert (firstMessage(`'\x'`) == DLexer.Message(1, 4, expected, true));
    assert (firstMessage(`'\x_'`) == DLexer.Message(1, 4, expected, true));
    assert (firstMessage(`'\xXX'`) == DLexer.Message(1, 4, expected, true));

    // invalid: one good digit followed by nothing or by a non-digit
    assert (firstMessage(`'\xA'`) == DLexer.Message(1, 5, expected, true));
    assert (firstMessage(`'\xAY'`) == DLexer.Message(1, 5, expected, true));
}
2264 
2265 version (iasm64NotWindows)
2266 {
2267     /**
2268      * Returns:
2269      */
2270     ushort newlineMask(const ubyte*) pure nothrow @trusted @nogc
2271     {
2272         asm pure nothrow @nogc
2273         {
2274             naked;
2275             movdqu XMM1, [RDI];
2276             mov RAX, 3;
2277             mov RDX, 16;
2278             mov R8, 0x0d0d0d0d0d0d0d0dL;
2279             movq XMM2, R8;
2280             shufpd XMM2, XMM2, 0;
2281             pcmpeqb XMM2, XMM1;
2282             mov R9, 0x0a0a0a0a0a0a0a0aL;
2283             movq XMM3, R9;
2284             shufpd XMM3, XMM3, 0;
2285             pcmpeqb XMM3, XMM1;
2286             mov R10, 0xe280a8L;
2287             movq XMM4, R10;
2288             pcmpestrm XMM4, XMM1, 0b01001100;
2289             movdqa XMM4, XMM0;
2290             mov R11, 0xe280a9L;
2291             movq XMM5, R11;
2292             pcmpestrm XMM5, XMM1, 0b01001100;
2293             movdqa XMM5, XMM0;
2294             mov RCX, 0x0a0d;
2295             dec RAX;
2296             movq XMM6, RCX;
2297             pcmpestrm XMM6, XMM1, 0b01001100;
2298             movdqa XMM6, XMM0;
2299             movdqa XMM7, XMM6;
2300             pslldq XMM7, 1;
2301             movdqa XMM0, XMM4;
2302             por XMM0, XMM5;
2303             por XMM7, XMM6;
2304             movdqa XMM1, XMM2;
2305             por XMM1, XMM3;
2306             pxor XMM7, XMM1;
2307             por XMM7, XMM0;
2308             por XMM7, XMM6;
2309             pmovmskb RAX, XMM7;
2310             and RAX, 0b0011_1111_1111_1111;
2311             ret;
2312         }
2313     }
2314 
2315     /**
2316      * Skips between 0 and 16 bytes that match (or do not match) one of the
2317      * given $(B chars).
2318      */
2319     void skip(bool matching, chars...)(const ubyte*, ulong*, ulong*) pure nothrow
2320         @trusted @nogc if (chars.length <= 8)
2321     {
2322         enum constant = ByteCombine!chars;
2323         enum charsLength = chars.length;
2324         static if (matching)
2325             enum flags = 0b0001_0000;
2326         else
2327             enum flags = 0b0000_0000;
2328         asm pure nothrow @nogc
2329         {
2330             naked;
2331             movdqu XMM1, [RDX];
2332             mov R10, constant;
2333             movq XMM2, R10;
2334             mov RAX, charsLength;
2335             mov RDX, 16;
2336             pcmpestri XMM2, XMM1, flags;
2337             add [RSI], RCX;
2338             add [RDI], RCX;
2339             ret;
2340         }
2341     }
2342 
2343     /**
2344      * Returns: the number of bytes starting at the given location that match
2345      *     (or do not match if $(B invert) is true) the byte ranges in $(B chars).
2346      */
2347     ulong rangeMatch(bool invert, chars...)(const ubyte*) pure nothrow @trusted @nogc
2348     {
2349         static assert (chars.length % 2 == 0);
2350         enum constant = ByteCombine!chars;
2351         static if (invert)
2352             enum rangeMatchFlags = 0b0000_0100;
2353         else
2354             enum rangeMatchFlags = 0b0001_0100;
2355         enum charsLength = chars.length;
2356         asm pure nothrow @nogc
2357         {
2358             naked;
2359             movdqu XMM1, [RDI];
2360             mov R10, constant;
2361             movq XMM2, R10;
2362             mov RAX, charsLength;
2363             mov RDX, 16;
2364             pcmpestri XMM2, XMM1, rangeMatchFlags;
2365             mov RAX, RCX;
2366             ret;
2367         }
2368     }
2369 
2370     template ByteCombine(c...)
2371     {
2372         static assert (c.length <= 8);
2373         static if (c.length > 1)
2374             enum ulong ByteCombine = c[0] | (ByteCombine!(c[1..$]) << 8);
2375         else
2376             enum ulong ByteCombine = c[0];
2377     }
2378 }