dparse.lexer source code

1 module dparse.lexer;
2 
3 import std.typecons;
4 import std.typetuple;
5 import std.array;
6 import std.algorithm;
7 import std.range;
8 import std.experimental.lexer;
9 import core.cpuid : sse42;
10 version (D_InlineAsm_X86_64)
11 {
12     version (Windows) {}
13     else version = iasm64NotWindows;
14 }
15 
16 /// Operators
17 private enum operators = [
18     ",", ".", "..", "...", "/", "/=", "!", "!<", "!<=", "!<>", "!<>=", "!=",
19     "!>", "!>=", "$", "%", "%=", "&", "&&", "&=", "(", ")", "*", "*=", "+", "++",
20     "+=", "-", "--", "-=", ":", ";", "<", "<<", "<<=", "<=", "<>", "<>=", "=",
21     "==", "=>", ">", ">=", ">>", ">>=", ">>>", ">>>=", "?", "@", "[", "]", "^",
22     "^=", "^^", "^^=", "{", "|", "|=", "||", "}", "~", "~="
23 ];
24 
25 /// Keywords
26 private enum keywords = [
27     "abstract", "alias", "align", "asm", "assert", "auto", "body", "bool",
28     "break", "byte", "case", "cast", "catch", "cdouble", "cent", "cfloat",
29     "char", "class", "const", "continue", "creal", "dchar", "debug", "default",
30     "delegate", "delete", "deprecated", "do", "double", "else", "enum",
31     "export", "extern", "false", "final", "finally", "float", "for", "foreach",
32     "foreach_reverse", "function", "goto", "idouble", "if", "ifloat",
33     "immutable", "import", "in", "inout", "int", "interface", "invariant",
34     "ireal", "is", "lazy", "long", "macro", "mixin", "module", "new", "nothrow",
35     "null", "out", "override", "package", "pragma", "private", "protected",
36     "public", "pure", "real", "ref", "return", "scope", "shared", "short",
37     "static", "struct", "super", "switch", "synchronized", "template", "this",
38     "throw", "true", "try", "typedef", "typeid", "typeof", "ubyte", "ucent",
39     "uint", "ulong", "union", "unittest", "ushort", "version", "void",
40     "volatile", "wchar", "while", "with", "__DATE__", "__EOF__", "__FILE__",
41     "__FUNCTION__", "__gshared", "__LINE__", "__MODULE__", "__parameters",
42     "__PRETTY_FUNCTION__", "__TIME__", "__TIMESTAMP__", "__traits", "__vector",
43     "__VENDOR__", "__VERSION__"
44 ];
45 
46 /// Other tokens
47 private enum dynamicTokens = [
48     "specialTokenSequence", "comment", "identifier", "scriptLine",
49     "whitespace", "doubleLiteral", "floatLiteral", "idoubleLiteral",
50     "ifloatLiteral", "intLiteral", "longLiteral", "realLiteral",
51     "irealLiteral", "uintLiteral", "ulongLiteral", "characterLiteral",
52     "dstringLiteral", "stringLiteral", "wstringLiteral"
53 ];
54 
55 private enum pseudoTokenHandlers = [
56     "\"", "lexStringLiteral",
57     "`", "lexWysiwygString",
58     "//", "lexSlashSlashComment",
59     "/*", "lexSlashStarComment",
60     "/+", "lexSlashPlusComment",
61     ".", "lexDot",
62     "'", "lexCharacterLiteral",
63     "0", "lexNumber",
64     "1", "lexDecimal",
65     "2", "lexDecimal",
66     "3", "lexDecimal",
67     "4", "lexDecimal",
68     "5", "lexDecimal",
69     "6", "lexDecimal",
70     "7", "lexDecimal",
71     "8", "lexDecimal",
72     "9", "lexDecimal",
73     "q\"", "lexDelimitedString",
74     "q{", "lexTokenString",
75     "r\"", "lexWysiwygString",
76     "x\"", "lexHexString",
77     " ", "lexWhitespace",
78     "\t", "lexWhitespace",
79     "\r", "lexWhitespace",
80     "\n", "lexWhitespace",
81     "\v", "lexWhitespace",
82     "\f", "lexWhitespace",
83     "\u2028", "lexLongNewline",
84     "\u2029", "lexLongNewline",
85     "#!", "lexScriptLine",
86     "#line", "lexSpecialTokenSequence"
87 ];
88 
89 /// Token ID type for the D lexer.
90 public alias IdType = TokenIdType!(operators, dynamicTokens, keywords);
91 
92 /**
93  * Function used for converting an IdType to a string.
94  *
95  * Examples:
96  * ---
97  * IdType c = tok!"case";
98  * assert (str(c) == "case");
99  * ---
100  */
101 public alias str = tokenStringRepresentation!(IdType, operators, dynamicTokens, keywords);
102 
/**
 * Maps the textual form of a D token to its `IdType` value at compile time.
 *
 * Any entry of the $(B operators), $(B keywords) or $(B dynamicTokens) lists
 * may be used as the template argument.
 * Example:
 * ---
 * import dparse.lexer;
 * IdType t = tok!"floatLiteral";
 * ---
 */
public alias tok(string token) = TokenId!(IdType, operators, dynamicTokens, keywords, token);
118 
119 private enum extraFields = q{
120     string comment;
121     string trailingComment;
122 
123     int opCmp(size_t i) const pure nothrow @safe {
124         if (index < i) return -1;
125         if (index > i) return 1;
126         return 0;
127     }
128 
129     int opCmp(ref const typeof(this) other) const pure nothrow @safe {
130         return opCmp(other.index);
131     }
132 };
133 
134 /// The token type in the D lexer
135 public alias Token = std.experimental.lexer.TokenStructure!(IdType, extraFields);
136 
/**
 * Selects whether the lexer hands whitespace tokens to its consumer.
 */
public enum WhitespaceBehavior : ubyte
{
    /// Whitespace tokens are emitted and carry their source text.
    include = 0,
    /// Whitespace tokens are dropped by `DLexer.popFront`.
    skip = 1,
}
145 
/**
 * Bit flags selecting how string literal tokens are presented.
 */
public enum StringBehavior : ubyte
{
    /// Quote characters are left out and escape sequences are processed.
    compiler = 0,
    /// The opening quote, closing quote and any string suffix stay part of
    /// the token text.
    includeQuoteChars = 1 << 0,
    /// Escape sequences are kept as written instead of being replaced.
    notEscaped = 1 << 1,
    /// The literal exactly as it appears in the source; useful for
    /// formatters and highlighters.
    source = includeQuoteChars | notEscaped
}
161 
/**
 * Settings handed to `DLexer` when it is constructed.
 */
public struct LexerConfig
{
    /// Name of the file being lexed.
    string fileName;
    /// How string literal tokens are presented.
    StringBehavior stringBehavior;
    /// Whether whitespace tokens are emitted or skipped.
    WhitespaceBehavior whitespaceBehavior;
}
171 
/**
 * Tests whether a token ID is one of the built-in basic type keywords.
 *
 * Params:
 *     type = the token ID to classify
 * Returns: true if the given ID is for a basic type.
 */
public bool isBasicType(IdType type) nothrow pure @safe @nogc
{
    switch (type)
    {
    case tok!"bool", tok!"void":
    case tok!"byte", tok!"ubyte", tok!"short", tok!"ushort":
    case tok!"int", tok!"uint", tok!"long", tok!"ulong", tok!"cent", tok!"ucent":
    case tok!"char", tok!"wchar", tok!"dchar":
    case tok!"float", tok!"double", tok!"real":
    case tok!"ifloat", tok!"idouble", tok!"ireal":
    case tok!"cfloat", tok!"cdouble", tok!"creal":
        return true;
    default:
        return false;
    }
}
208 
/**
 * Tests whether a token ID denotes a numeric literal of any kind.
 *
 * Params:
 *     type = the token ID to classify
 * Returns: true if the given ID type is for a number literal.
 */
public bool isNumberLiteral(IdType type) nothrow pure @safe @nogc
{
    switch (type)
    {
    case tok!"intLiteral", tok!"uintLiteral", tok!"longLiteral", tok!"ulongLiteral":
    case tok!"floatLiteral", tok!"doubleLiteral", tok!"realLiteral":
    case tok!"ifloatLiteral", tok!"idoubleLiteral", tok!"irealLiteral":
        return true;
    default:
        return false;
    }
}
231 
/**
 * Tests whether a token ID denotes an operator or punctuation token.
 *
 * The cases below cover every entry of the `operators` list.
 *
 * Params:
 *     type = the token ID to classify
 * Returns: true if the given ID type is for an operator.
 */
public bool isOperator(IdType type) nothrow pure @safe @nogc
{
    switch (type)
    {
    // punctuation
    case tok!",", tok!".", tok!"..", tok!"...", tok!":", tok!";", tok!"?", tok!"@", tok!"$":
    // brackets
    case tok!"(", tok!")", tok!"[", tok!"]", tok!"{", tok!"}":
    // assignment, equality, negation
    case tok!"=", tok!"==", tok!"=>", tok!"!", tok!"!=":
    // comparison and shifts
    case tok!"<", tok!"<=", tok!"<<", tok!"<<=", tok!"<>", tok!"<>=":
    case tok!">", tok!">=", tok!">>", tok!">>=", tok!">>>", tok!">>>=":
    case tok!"!<", tok!"!<=", tok!"!<>", tok!"!<>=", tok!"!>", tok!"!>=":
    // arithmetic
    case tok!"+", tok!"++", tok!"+=", tok!"-", tok!"--", tok!"-=":
    case tok!"*", tok!"*=", tok!"/", tok!"/=", tok!"%", tok!"%=":
    case tok!"^", tok!"^=", tok!"^^", tok!"^^=":
    // bitwise, logical, concatenation
    case tok!"&", tok!"&&", tok!"&=", tok!"|", tok!"|=", tok!"||":
    case tok!"~", tok!"~=":
        return true;
    default:
        return false;
    }
}
306 
/**
 * Tests whether a token ID denotes a keyword.
 *
 * The basic type keywords (see `isBasicType`) are deliberately not part of
 * this set.
 *
 * Params:
 *     type = the token ID to classify
 * Returns: true if the given ID type is for a keyword.
 */
public bool isKeyword(IdType type) pure nothrow @safe @nogc
{
    switch (type)
    {
    case tok!"abstract", tok!"alias", tok!"align", tok!"asm", tok!"assert", tok!"auto":
    case tok!"body", tok!"break":
    case tok!"case", tok!"cast", tok!"catch", tok!"class", tok!"const", tok!"continue":
    case tok!"debug", tok!"default", tok!"delegate", tok!"delete", tok!"deprecated", tok!"do":
    case tok!"else", tok!"enum", tok!"export", tok!"extern":
    case tok!"false", tok!"final", tok!"finally", tok!"for", tok!"foreach",
        tok!"foreach_reverse", tok!"function":
    case tok!"goto":
    case tok!"if", tok!"immutable", tok!"import", tok!"in", tok!"inout", tok!"interface",
        tok!"invariant", tok!"is":
    case tok!"lazy":
    case tok!"macro", tok!"mixin", tok!"module":
    case tok!"new", tok!"nothrow", tok!"null":
    case tok!"out", tok!"override":
    case tok!"package", tok!"pragma", tok!"private", tok!"protected", tok!"public", tok!"pure":
    case tok!"ref", tok!"return":
    case tok!"scope", tok!"shared", tok!"static", tok!"struct", tok!"super", tok!"switch",
        tok!"synchronized":
    case tok!"template", tok!"this", tok!"throw", tok!"true", tok!"try", tok!"typedef",
        tok!"typeid", tok!"typeof":
    case tok!"union", tok!"unittest":
    case tok!"version", tok!"volatile":
    case tok!"while", tok!"with":
    case tok!"__DATE__", tok!"__EOF__", tok!"__FILE__", tok!"__FUNCTION__", tok!"__gshared":
    case tok!"__LINE__", tok!"__MODULE__", tok!"__parameters", tok!"__PRETTY_FUNCTION__":
    case tok!"__TIME__", tok!"__TIMESTAMP__", tok!"__traits", tok!"__vector":
    case tok!"__VENDOR__", tok!"__VERSION__":
        return true;
    default:
        return false;
    }
}
412 
/**
 * Tests whether a token ID denotes a string literal of any character width.
 *
 * Params:
 *     type = the token ID to classify
 * Returns: true if the given ID type is for a string literal.
 */
public bool isStringLiteral(IdType type) pure nothrow @safe @nogc
{
    return type == tok!"stringLiteral"
        || type == tok!"wstringLiteral"
        || type == tok!"dstringLiteral";
}
428 
/**
 * Tests whether a token ID denotes a protection (visibility) attribute.
 *
 * Params:
 *     type = the token ID to classify
 * Returns: true if the given ID type is for a protection attribute.
 */
public bool isProtection(IdType type) pure nothrow @safe @nogc
{
    return type == tok!"export"
        || type == tok!"package"
        || type == tok!"private"
        || type == tok!"protected"
        || type == tok!"public";
}
446 
/**
 * Lexes the given source code into the token array a parser consumes.
 *
 * Whitespace, special token sequences and non-documentation comments are
 * dropped. Documentation comments are attached to tokens instead of being
 * emitted:
 * $(UL
 *   $(LI a doc comment starting on the same line as the previously emitted
 *        token becomes that token's `trailingComment`;)
 *   $(LI any other doc comment is accumulated and stored in the `comment`
 *        field of the next emitted token. `///` comments are rewritten into,
 *        or merged into, a single block comment.)
 * )
 * Lexing stops at the first `__EOF__` token.
 *
 * Params:
 *     sourceCode = the bytes of the source code to lex
 *     config = lexer configuration; `whitespaceBehavior` is forced to `skip`
 *     cache = string interning cache passed on to the lexer
 * Returns: the array of tokens lexed from the given source code.
 */
const(Token)[] getTokensForParser(ubyte[] sourceCode, LexerConfig config,
    StringCache* cache)
{
    enum CommentType : ubyte
    {
        notDoc,
        line,
        block
    }

    static CommentType commentType(string comment) pure nothrow @safe
    {
        if (comment.length < 3)
            return CommentType.notDoc;
        if (comment[0 .. 3] == "///")
            return CommentType.line;
        if (comment[0 .. 3] == "/++" || comment[0 .. 3] == "/**")
        {
            // "/**/" and "/++/" are complete but empty ordinary comments: the
            // third character already belongs to the terminator. Treating
            // them as doc comments would leave fewer than three characters
            // once the terminator is cut off during merging below.
            return comment.length > 4 ? CommentType.block : CommentType.notDoc;
        }
        return CommentType.notDoc;
    }

    config.whitespaceBehavior = WhitespaceBehavior.skip;

    auto output = appender!(typeof(return))();
    auto lexer = DLexer(sourceCode, config, cache);
    string blockComment; // doc comment waiting for the next emitted token
    size_t tokenCount;   // number of tokens put into `output` so far
    loop: while (!lexer.empty) switch (lexer.front.type)
    {
    case tok!"specialTokenSequence":
    case tok!"whitespace":
        lexer.popFront();
        break;
    case tok!"comment":
        immutable kind = commentType(lexer.front.text);
        if (kind != CommentType.notDoc)
        {
            if (tokenCount > 0 && lexer.front.line == output.data[tokenCount - 1].line)
            {
                // Shares a line with the previous token: trailing comment.
                (cast() output.data[tokenCount - 1]).trailingComment = lexer.front.text;
            }
            else if (kind == CommentType.block)
            {
                blockComment = lexer.front.text;
            }
            else
            {
                // Line comment: drop the leading "///" and keep the rest.
                string c = lexer.front.text[3 .. $];
                if (blockComment.length == 0)
                {
                    // Nothing pending yet: rewrite into a block comment.
                    blockComment = "/++" ~ c ~ "\n+/";
                }
                else
                {
                    import std.string : lastIndexOf;

                    // Only "/**" and "/++" comments are ever stored here.
                    if (blockComment.length < 3
                        || (blockComment[0 .. 3] != "/**" && blockComment[0 .. 3] != "/++"))
                        assert(0);

                    // Insert the new text in front of the last line of the
                    // pending comment, or in front of its two-character
                    // terminator when it is a single-line comment.
                    immutable l = blockComment.lastIndexOf("\n");
                    string head, tail;
                    if (l != -1)
                    {
                        head = blockComment[0 .. l + 1];
                        tail = blockComment[l .. $];
                    }
                    else
                    {
                        head = blockComment[0 .. $ - 2];
                        tail = blockComment[$ - 2 .. $];
                    }
                    blockComment = head ~ c ~ tail;
                }
            }
        }
        lexer.popFront();
        break;
    case tok!"__EOF__":
        break loop;
    default:
        Token t = lexer.front;
        lexer.popFront();
        tokenCount++;
        t.comment = blockComment;
        blockComment = null;
        output.put(t);
        break;
    }
    return output.data;
}
548 
549 /**
550  * The D lexer struct.
551  */
552 public struct DLexer
553 {
554     mixin Lexer!(Token, lexIdentifier, isSeparating, operators, dynamicTokens,
555         keywords, pseudoTokenHandlers);
556 
557     ///
558     @disable this();
559 
560     /**
561      * Params:
562      *     range = the bytes that compose the source code that will be lexed.
563      *     config = the lexer configuration to use.
564      *     cache = the string interning cache for de-duplicating identifiers and
565      *         other token text.
566      */
567     this(ubyte[] range, const LexerConfig config, StringCache* cache,
568         bool haveSSE42 = sse42()) pure nothrow @safe
569     {
570         this.haveSSE42 = haveSSE42;
571         auto r = (range.length >= 3 && range[0] == 0xef && range[1] == 0xbb && range[2] == 0xbf)
572             ? range[3 .. $] : range;
573         this.range = LexerRange(r);
574         this.config = config;
575         this.cache = cache;
576         popFront();
577     }
578 
579     ///
580     public void popFront()() pure nothrow @safe
581     {
582         do
583             _popFront();
584         while (config.whitespaceBehavior == WhitespaceBehavior.skip
585             && _front.type == tok!"whitespace");
586     }
587 
588 private pure nothrow @safe:
589 
590     bool isWhitespace()
591     {
592         switch (range.bytes[range.index])
593         {
594         case ' ':
595         case '\r':
596         case '\n':
597         case '\t':
598         case '\v':
599         case '\f':
600             return true;
601         case 0xe2:
602             auto peek = range.peek(2);
603             return peek.length == 2
604                 && peek[0] == 0x80
605                 && (peek[1] == 0xa8 || peek[1] == 0xa9);
606         default:
607             return false;
608         }
609     }
610 
611     void popFrontWhitespaceAware()
612     {
613         switch (range.bytes[range.index])
614         {
615         case '\r':
616             range.popFront();
617             if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n')
618             {
619                 range.popFront();
620                 range.incrementLine();
621             }
622             else
623                 range.incrementLine();
624             return;
625         case '\n':
626             range.popFront();
627             range.incrementLine();
628             return;
629         case 0xe2:
630             auto lookahead = range.peek(3);
631             if (lookahead.length == 3 && lookahead[1] == 0x80
632                 && (lookahead[2] == 0xa8 || lookahead[2] == 0xa9))
633             {
634                 range.index+=3;
635                 range.column+=3;
636                 range.incrementLine();
637                 return;
638             }
639             else
640             {
641                 range.popFront();
642                 return;
643             }
644         default:
645             range.popFront();
646             return;
647         }
648     }
649 
650     void lexWhitespace(ref Token token) @trusted
651     {
652         mixin (tokenStart);
653         loop: do
654         {
655             version (iasm64NotWindows)
656             {
657                 if (haveSSE42 && range.index + 16 < range.bytes.length)
658                 {
659                     skip!(true, '\t', ' ', '\v', '\f')(range.bytes.ptr + range.index,
660                         &range.index, &range.column);
661                 }
662             }
663             switch (range.bytes[range.index])
664             {
665             case '\r':
666                 range.popFront();
667                 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n')
668                 {
669                     range.popFront();
670                 }
671                 range.column = 1;
672                 range.line += 1;
673                 break;
674             case '\n':
675                 range.popFront();
676                 range.column = 1;
677                 range.line += 1;
678                 break;
679             case ' ':
680             case '\t':
681             case '\v':
682             case '\f':
683                 range.popFront();
684                 break;
685             case 0xe2:
686                 if (range.index + 2 >= range.bytes.length)
687                     break loop;
688                 if (range.bytes[range.index + 1] != 0x80)
689                     break loop;
690                 if (range.bytes[range.index + 2] == 0xa8 || range.bytes[range.index + 2] == 0xa9)
691                 {
692                     range.index += 3;
693                     range.column += 3;
694                     range.column = 1;
695                     range.line += 1;
696                     break;
697                 }
698                 break loop;
699             default:
700                 break loop;
701             }
702         } while (!(range.index >= range.bytes.length));
703     end:
704         string text = config.whitespaceBehavior == WhitespaceBehavior.include
705             ? cache.intern(range.slice(mark)) : "";
706         token = Token(tok!"whitespace", text, line, column, index);
707     }
708 
709     void lexNumber(ref Token token)
710     {
711         mixin (tokenStart);
712         if (range.bytes[range.index] == '0' && range.index + 1 < range.bytes.length)
713         {
714             auto ahead = range.bytes[range.index + 1];
715             switch (ahead)
716             {
717             case 'x':
718             case 'X':
719                 range.index += 2;
720                 range.column += 2;
721                 lexHex(token, mark, line, column, index);
722                 return;
723             case 'b':
724             case 'B':
725                 range.index += 2;
726                 range.column += 2;
727                 lexBinary(token, mark, line, column, index);
728                 return;
729             default:
730                 lexDecimal(token, mark, line, column, index);
731                 return;
732             }
733         }
734         else
735             lexDecimal(token, mark, line, column, index);
736     }
737 
738     void lexHex(ref Token token)
739     {
740         mixin (tokenStart);
741         lexHex(token, mark, line, column, index);
742     }
743 
744     void lexHex(ref Token token, size_t mark, size_t line, size_t column,
745         size_t index) @trusted
746     {
747         IdType type = tok!"intLiteral";
748         bool foundDot;
749         hexLoop: while (!(range.index >= range.bytes.length))
750         {
751             switch (range.bytes[range.index])
752             {
753             case 'a': .. case 'f':
754             case 'A': .. case 'F':
755             case '0': .. case '9':
756             case '_':
757                 version (iasm64NotWindows)
758                 {
759                     if (haveSSE42 && range.index + 16 < range.bytes.length)
760                     {
761                         immutable ulong i = rangeMatch!(false, '0', '9', 'a', 'f', 'A', 'F', '_', '_')
762                             (range.bytes.ptr + range.index);
763                         range.column += i;
764                         range.index += i;
765                     }
766                     else
767                         range.popFront();
768                 }
769                 else
770                     range.popFront();
771                 break;
772             case 'u':
773             case 'U':
774                 lexIntSuffix(type);
775                 break hexLoop;
776             case 'i':
777                 if (foundDot)
778                     lexFloatSuffix(type);
779                 break hexLoop;
780             case 'L':
781                 if (foundDot)
782                     lexFloatSuffix(type);
783                 else
784                     lexIntSuffix(type);
785                 break hexLoop;
786             case 'p':
787             case 'P':
788                 lexExponent(type);
789                 break hexLoop;
790             case '.':
791                 if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.')
792                     break hexLoop;
793                 else
794                 {
795                     // The following bit of silliness tries to tell the
796                     // difference between "int dot identifier" and
797                     // "double identifier".
798                     if ((range.index + 1 < range.bytes.length))
799                     {
800                         switch (range.peekAt(1))
801                         {
802                         case '0': .. case '9':
803                         case 'A': .. case 'F':
804                         case 'a': .. case 'f':
805                             goto doubleLiteral;
806                         default:
807                             break hexLoop;
808                         }
809                     }
810                     else
811                     {
812                     doubleLiteral:
813                         range.popFront();
814                         foundDot = true;
815                         type = tok!"doubleLiteral";
816                     }
817                 }
818                 break;
819             default:
820                 break hexLoop;
821             }
822         }
823         token = Token(type, cache.intern(range.slice(mark)), line, column,
824             index);
825     }
826 
827     void lexBinary(ref Token token)
828     {
829         mixin (tokenStart);
830         return lexBinary(token, mark, line, column, index);
831     }
832 
833     void lexBinary(ref Token token, size_t mark, size_t line, size_t column,
834         size_t index) @trusted
835     {
836         IdType type = tok!"intLiteral";
837         binaryLoop: while (!(range.index >= range.bytes.length))
838         {
839             switch (range.bytes[range.index])
840             {
841             case '0':
842             case '1':
843             case '_':
844                 version (iasm64NotWindows)
845                 {
846                     if (haveSSE42 && range.index + 16 < range.bytes.length)
847                     {
848                         immutable ulong i = rangeMatch!(false, '0', '1', '_', '_')(
849                             range.bytes.ptr + range.index);
850                         range.column += i;
851                         range.index += i;
852                     }
853                     else
854                         range.popFront();
855                 }
856                 else
857                     range.popFront();
858                 break;
859             case 'u':
860             case 'U':
861             case 'L':
862                 lexIntSuffix(type);
863                 break binaryLoop;
864             default:
865                 break binaryLoop;
866             }
867         }
868         token = Token(type, cache.intern(range.slice(mark)), line, column,
869             index);
870     }
871 
872     void lexDecimal(ref Token token)
873     {
874         mixin (tokenStart);
875         lexDecimal(token, mark, line, column, index);
876     }
877 
878     void lexDecimal(ref Token token, size_t mark, size_t line, size_t column,
879         size_t index) @trusted
880     {
881         bool foundDot = range.bytes[range.index] == '.';
882         IdType type = tok!"intLiteral";
883         if (foundDot)
884         {
885             range.popFront();
886             type = tok!"doubleLiteral";
887         }
888 
889         decimalLoop: while (!(range.index >= range.bytes.length))
890         {
891             switch (range.bytes[range.index])
892             {
893             case '0': .. case '9':
894             case '_':
895                 version (iasm64NotWindows)
896                 {
897                     if (haveSSE42 && range.index + 16 < range.bytes.length)
898                     {
899                         ulong i = rangeMatch!(false, '0', '9', '_', '_')(range.bytes.ptr + range.index);
900                         range.column += i;
901                         range.index += i;
902                     }
903                     else
904                         range.popFront();
905                 }
906                 else
907                     range.popFront();
908                 break;
909             case 'u':
910             case 'U':
911                 if (!foundDot)
912                     lexIntSuffix(type);
913                 break decimalLoop;
914             case 'i':
915                 lexFloatSuffix(type);
916                 break decimalLoop;
917             case 'L':
918                 if (foundDot)
919                     lexFloatSuffix(type);
920                 else
921                     lexIntSuffix(type);
922                 break decimalLoop;
923             case 'f':
924             case 'F':
925                 lexFloatSuffix(type);
926                 break decimalLoop;
927             case 'e':
928             case 'E':
929                 lexExponent(type);
930                 break decimalLoop;
931             case '.':
932                 if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.')
933                     break decimalLoop;
934                 else
935                 {
936                     // The following bit of silliness tries to tell the
937                     // difference between "int dot identifier" and
938                     // "double identifier".
939                     if ((range.index + 1 < range.bytes.length))
940                     {
941                         auto ch = range.peekAt(1);
942                         if (ch <= 0x2f
943                             || (ch >= '0' && ch <= '9')
944                             || (ch >= ':' && ch <= '@')
945                             || (ch >= '[' && ch <= '^')
946                             || (ch >= '{' && ch <= '~')
947                             || ch == '`' || ch == '_')
948                         {
949                             goto doubleLiteral;
950                         }
951                         else
952                             break decimalLoop;
953                     }
954                     else
955                     {
956                     doubleLiteral:
957                         range.popFront();
958                         foundDot = true;
959                         type = tok!"doubleLiteral";
960                     }
961                 }
962                 break;
963             default:
964                 break decimalLoop;
965             }
966         }
967         token = Token(type, cache.intern(range.slice(mark)), line, column,
968             index);
969     }
970 
971     void lexIntSuffix(ref IdType type)
972     {
973         bool secondPass;
974         if (range.bytes[range.index] == 'u' || range.bytes[range.index] == 'U')
975         {
976     U:
977             if (type == tok!"intLiteral")
978                 type = tok!"uintLiteral";
979             else
980                 type = tok!"ulongLiteral";
981             range.popFront();
982             if (secondPass)
983                 return;
984             if (range.index < range.bytes.length && (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l'))
985                 goto L;
986             return;
987         }
988         if (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l')
989         {
990     L:
991             if (type == tok!"uintLiteral")
992                 type = tok!"ulongLiteral";
993             else
994                 type = tok!"longLiteral";
995             range.popFront();
996             if (!secondPass && range.index < range.bytes.length && (range.bytes[range.index] == 'U' || range.bytes[range.index] == 'u'))
997             {
998                 secondPass = true;
999                 goto U;
1000             }
1001             return;
1002         }
1003     }
1004 
1005     void lexFloatSuffix(ref IdType type) pure nothrow @safe
1006     {
1007         switch (range.bytes[range.index])
1008         {
1009         case 'L':
1010             range.popFront();
1011             type = tok!"doubleLiteral";
1012             break;
1013         case 'f':
1014         case 'F':
1015             range.popFront();
1016             type = tok!"floatLiteral";
1017             break;
1018         default:
1019             break;
1020         }
1021         if (!(range.index >= range.bytes.length) && range.bytes[range.index] == 'i')
1022         {
1023             warning("Complex number literals are deprecated");
1024             range.popFront();
1025             if (type == tok!"floatLiteral")
1026                 type = tok!"ifloatLiteral";
1027             else
1028                 type = tok!"idoubleLiteral";
1029         }
1030     }
1031 
1032     void lexExponent(ref IdType type) pure nothrow @safe
1033     {
1034         range.popFront();
1035         bool foundSign = false;
1036         bool foundDigit = false;
1037         while (!(range.index >= range.bytes.length))
1038         {
1039             switch (range.bytes[range.index])
1040             {
1041             case '-':
1042             case '+':
1043                 if (foundSign)
1044                 {
1045                     if (!foundDigit)
1046                     error("Expected an exponent");
1047                     return;
1048                 }
1049                 foundSign = true;
1050                 range.popFront();
1051                 break;
1052             case '0': .. case '9':
1053             case '_':
1054                 foundDigit = true;
1055                 range.popFront();
1056                 break;
1057             case 'L':
1058             case 'f':
1059             case 'F':
1060             case 'i':
1061                 lexFloatSuffix(type);
1062                 return;
1063             default:
1064                 if (!foundDigit)
1065                     error("Expected an exponent");
1066                 return;
1067             }
1068         }
1069     }
1070 
1071     void lexScriptLine(ref Token token)
1072     {
1073         mixin (tokenStart);
1074         while (!(range.index >= range.bytes.length) && !isNewline)
1075         {
1076             range.popFront();
1077         }
1078         token = Token(tok!"scriptLine", cache.intern(range.slice(mark)),
1079             line, column, index);
1080     }
1081 
1082     void lexSpecialTokenSequence(ref Token token)
1083     {
1084         mixin (tokenStart);
1085         while (!(range.index >= range.bytes.length) && !isNewline)
1086         {
1087             range.popFront();
1088         }
1089         token = Token(tok!"specialTokenSequence", cache.intern(range.slice(mark)),
1090             line, column, index);
1091     }
1092 
1093     void lexSlashStarComment(ref Token token) @trusted
1094     {
1095         mixin (tokenStart);
1096         IdType type = tok!"comment";
1097         range.popFrontN(2);
1098         while (range.index < range.bytes.length)
1099         {
1100             version (iasm64NotWindows)
1101             {
1102                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1103                     skip!(false, '\r', '\n', '/', '*', 0xe2)(range.bytes.ptr + range.index,
1104                         &range.index, &range.column);
1105             }
1106             if (range.bytes[range.index] == '*')
1107             {
1108                 range.popFront();
1109                 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/')
1110                 {
1111                     range.popFront();
1112                     break;
1113                 }
1114             }
1115             else
1116                 popFrontWhitespaceAware();
1117         }
1118     end:
1119         token = Token(type, cache.intern(range.slice(mark)), line, column,
1120             index);
1121     }
1122 
1123     void lexSlashSlashComment(ref Token token) @trusted
1124     {
1125         mixin (tokenStart);
1126         IdType type = tok!"comment";
1127         range.popFrontN(2);
1128         while (range.index < range.bytes.length)
1129         {
1130             version (iasm64NotWindows)
1131             {
1132                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1133                 {
1134                     skip!(false, '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
1135                         &range.index, &range.column);
1136                 }
1137             }
1138             if (range.bytes[range.index] == '\r' || range.bytes[range.index] == '\n')
1139                 break;
1140             range.popFront();
1141         }
1142     end:
1143         token =  Token(type, cache.intern(range.slice(mark)), line, column,
1144             index);
1145     }
1146 
1147     void lexSlashPlusComment(ref Token token) @trusted
1148     {
1149         mixin (tokenStart);
1150         IdType type = tok!"comment";
1151         range.index += 2;
1152         range.column += 2;
1153         int depth = 1;
1154         while (depth > 0 && !(range.index >= range.bytes.length))
1155         {
1156             version (iasm64NotWindows)
1157             {
1158                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1159                 {
1160                     skip!(false, '+', '/', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
1161                         &range.index, &range.column);
1162                 }
1163             }
1164             if (range.bytes[range.index] == '+')
1165             {
1166                 range.popFront();
1167                 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/')
1168                 {
1169                     range.popFront();
1170                     depth--;
1171                 }
1172             }
1173             else if (range.bytes[range.index] == '/')
1174             {
1175                 range.popFront();
1176                 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '+')
1177                 {
1178                     range.popFront();
1179                     depth++;
1180                 }
1181             }
1182             else
1183                 popFrontWhitespaceAware();
1184         }
1185         token = Token(type, cache.intern(range.slice(mark)), line, column,
1186             index);
1187     }
1188 
1189     void lexStringLiteral(ref Token token) @trusted
1190     {
1191         mixin (tokenStart);
1192         range.popFront();
1193         while (true)
1194         {
1195             if ((range.index >= range.bytes.length))
1196             {
1197                 error("Error: unterminated string literal");
1198                 token = Token(tok!"");
1199                 return;
1200             }
1201             version (iasm64NotWindows)
1202             {
1203                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1204                 {
1205                     skip!(false, '"', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
1206                         &range.index, &range.column);
1207                 }
1208             }
1209             if (range.bytes[range.index] == '"')
1210             {
1211                 range.popFront();
1212                 break;
1213             }
1214             else if (range.bytes[range.index] == '\\')
1215             {
1216                 lexEscapeSequence();
1217             }
1218             else
1219                 popFrontWhitespaceAware();
1220         }
1221         IdType type = tok!"stringLiteral";
1222         lexStringSuffix(type);
1223         token = Token(type, cache.intern(range.slice(mark)), line, column,
1224             index);
1225     }
1226 
1227     void lexWysiwygString(ref Token token) @trusted
1228     {
1229         mixin (tokenStart);
1230         IdType type = tok!"stringLiteral";
1231         bool backtick = range.bytes[range.index] == '`';
1232         if (backtick)
1233         {
1234             range.popFront();
1235             while (true)
1236             {
1237                 if ((range.index >= range.bytes.length))
1238                 {
1239                     error("Error: unterminated string literal");
1240                     token = Token(tok!"");
1241                     return;
1242                 }
1243                 version (iasm64NotWindows)
1244                 {
1245                     if (haveSSE42 && range.index + 16 < range.bytes.length)
1246                     {
1247                         skip!(false, '\r', '\n', 0xe2, '`')(range.bytes.ptr + range.index,
1248                             &range.index, &range.column);
1249                     }
1250                 }
1251                 if (range.bytes[range.index] == '`')
1252                 {
1253                     range.popFront();
1254                     break;
1255                 }
1256                 else
1257                     popFrontWhitespaceAware();
1258             }
1259         }
1260         else
1261         {
1262             range.popFront();
1263             if ((range.index >= range.bytes.length))
1264             {
1265                 error("Error: unterminated string literal");
1266                 token = Token(tok!"");
1267                 return;
1268             }
1269             range.popFront();
1270             while (true)
1271             {
1272                 if ((range.index >= range.bytes.length))
1273                 {
1274                     error("Error: unterminated string literal");
1275                     token = Token(tok!"");
1276                     return;
1277                 }
1278                 else if (range.bytes[range.index] == '"')
1279                 {
1280                     range.popFront();
1281                     break;
1282                 }
1283                 else
1284                     popFrontWhitespaceAware();
1285             }
1286         }
1287         lexStringSuffix(type);
1288         token = Token(type, cache.intern(range.slice(mark)), line, column,
1289             index);
1290     }
1291 
1292     private ubyte lexStringSuffix(ref IdType type) pure nothrow @safe
1293     {
1294         if ((range.index >= range.bytes.length))
1295         {
1296             type = tok!"stringLiteral";
1297             return 0;
1298         }
1299         else
1300         {
1301             switch (range.bytes[range.index])
1302             {
1303             case 'w': range.popFront(); type = tok!"wstringLiteral"; return 'w';
1304             case 'd': range.popFront(); type = tok!"dstringLiteral"; return 'd';
1305             case 'c': range.popFront(); type = tok!"stringLiteral"; return 'c';
1306             default: type = tok!"stringLiteral"; return 0;
1307             }
1308         }
1309     }
1310 
1311     void lexDelimitedString(ref Token token)
1312     {
1313         mixin (tokenStart);
1314         range.index += 2;
1315         range.column += 2;
1316         ubyte open;
1317         ubyte close;
1318         switch (range.bytes[range.index])
1319         {
1320         case '<':
1321             open = '<';
1322             close = '>';
1323             range.popFront();
1324             lexNormalDelimitedString(token, mark, line, column, index, open, close);
1325             break;
1326         case '{':
1327             open = '{';
1328             close = '}';
1329             range.popFront();
1330             lexNormalDelimitedString(token, mark, line, column, index, open, close);
1331             break;
1332         case '[':
1333             open = '[';
1334             close = ']';
1335             range.popFront();
1336             lexNormalDelimitedString(token, mark, line, column, index, open, close);
1337             break;
1338         case '(':
1339             open = '(';
1340             close = ')';
1341             range.popFront();
1342             lexNormalDelimitedString(token, mark, line, column, index, open, close);
1343             break;
1344         default:
1345             lexHeredocString(token, mark, line, column, index);
1346             break;
1347         }
1348     }
1349 
1350     void lexNormalDelimitedString(ref Token token, size_t mark, size_t line, size_t column,
1351         size_t index, ubyte open, ubyte close)
1352     {
1353         int depth = 1;
1354         while (!(range.index >= range.bytes.length) && depth > 0)
1355         {
1356             if (range.bytes[range.index] == open)
1357             {
1358                 depth++;
1359                 range.popFront();
1360             }
1361             else if (range.bytes[range.index] == close)
1362             {
1363                 depth--;
1364                 range.popFront();
1365                 if (depth <= 0)
1366                 {
1367                     if (range.bytes[range.index] == '"')
1368                     {
1369                         range.popFront();
1370                     }
1371                     else
1372                     {
1373                         error("Error: \" expected to end delimited string literal");
1374                         token = Token(tok!"");
1375                         return;
1376                     }
1377                 }
1378             }
1379             else
1380                 popFrontWhitespaceAware();
1381         }
1382         IdType type = tok!"stringLiteral";
1383         lexStringSuffix(type);
1384         token = Token(type, cache.intern(range.slice(mark)), line, column, index);
1385     }
1386 
1387     void lexHeredocString(ref Token token, size_t mark, size_t line, size_t column, size_t index)
1388     {
1389         Token ident;
1390         lexIdentifier(ident);
1391         if (isNewline())
1392             popFrontWhitespaceAware();
1393         else
1394             error("Newline expected");
1395         while (!(range.index >= range.bytes.length))
1396         {
1397             if (isNewline())
1398             {
1399                 popFrontWhitespaceAware();
1400                 if (!range.canPeek(ident.text.length))
1401                 {
1402                     error(ident.text ~ " expected");
1403                     break;
1404                 }
1405                 if (range.peek(ident.text.length - 1) == ident.text)
1406                 {
1407                     range.popFrontN(ident.text.length);
1408                     break;
1409                 }
1410             }
1411             else
1412             {
1413                 range.popFront();
1414             }
1415         }
1416         if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '"')
1417         {
1418             range.popFront();
1419         }
1420         else
1421             error(`" expected`);
1422         IdType type = tok!"stringLiteral";
1423         lexStringSuffix(type);
1424         token = Token(type, cache.intern(range.slice(mark)), line, column, index);
1425     }
1426 
1427     void lexTokenString(ref Token token)
1428     {
1429         mixin (tokenStart);
1430         assert (range.bytes[range.index] == 'q');
1431         range.popFront();
1432         assert (range.bytes[range.index] == '{');
1433         range.popFront();
1434         auto app = appender!string();
1435         app.put("q{");
1436         int depth = 1;
1437 
1438         immutable WhitespaceBehavior oldWhitespace = config.whitespaceBehavior;
1439         immutable StringBehavior oldString = config.stringBehavior;
1440         config.whitespaceBehavior = WhitespaceBehavior.include;
1441         config.stringBehavior = StringBehavior.source;
1442         scope (exit)
1443         {
1444             config.whitespaceBehavior = oldWhitespace;
1445             config.stringBehavior = oldString;
1446         }
1447 
1448         advance(_front);
1449         while (depth > 0 && !empty)
1450         {
1451             auto t = front();
1452             if (t.text is null)
1453                 app.put(str(t.type));
1454             else
1455                 app.put(t.text);
1456             if (t.type == tok!"}")
1457             {
1458                 depth--;
1459                 if (depth > 0)
1460                 popFront();
1461             }
1462             else if (t.type == tok!"{")
1463             {
1464                 depth++;
1465                 popFront();
1466             }
1467             else
1468                 popFront();
1469         }
1470         IdType type = tok!"stringLiteral";
1471         auto b = lexStringSuffix(type);
1472         if (b != 0)
1473             app.put(b);
1474         token = Token(type, cache.intern(cast(const(ubyte)[]) app.data), line,
1475             column, index);
1476     }
1477 
1478     void lexHexString(ref Token token)
1479     {
1480         mixin (tokenStart);
1481         range.index += 2;
1482         range.column += 2;
1483 
1484         loop: while (true)
1485         {
1486             if ((range.index >= range.bytes.length))
1487             {
1488                 error("Error: unterminated hex string literal");
1489                 token = Token(tok!"");
1490                 return;
1491             }
1492             else if (isWhitespace())
1493                 popFrontWhitespaceAware();
1494             else switch (range.bytes[range.index])
1495             {
1496             case '0': .. case '9':
1497             case 'A': .. case 'F':
1498             case 'a': .. case 'f':
1499                 range.popFront();
1500                 break;
1501             case '"':
1502                 range.popFront();
1503                 break loop;
1504             default:
1505                 error("Error: invalid character in hex string");
1506                 token = Token(tok!"");
1507                 return;
1508             }
1509         }
1510 
1511         IdType type = tok!"stringLiteral";
1512         lexStringSuffix(type);
1513         token = Token(type, cache.intern(range.slice(mark)), line, column,
1514             index);
1515     }
1516 
1517     bool lexEscapeSequence()
1518     {
1519         range.popFront();
1520         if ((range.index >= range.bytes.length))
1521         {
1522             error("Error: non-terminated character escape sequence.");
1523             return false;
1524         }
1525         switch (range.bytes[range.index])
1526         {
1527         case '\'':
1528         case '"':
1529         case '?':
1530         case '\\':
1531         case 'a':
1532         case 'b':
1533         case 'f':
1534         case 'n':
1535         case 'r':
1536         case 't':
1537         case 'v':
1538             range.popFront();
1539             break;
1540         case 'x':
1541             range.popFront();
1542             foreach (i; 0 .. 2)
1543             {
1544                 if ((range.index >= range.bytes.length))
1545                 {
1546                     error("Error: 2 hex digits expected.");
1547                     return false;
1548                 }
1549                 switch (range.bytes[range.index])
1550                 {
1551                 case '0': .. case '9':
1552                 case 'a': .. case 'f':
1553                 case 'A': .. case 'F':
1554                     range.popFront();
1555                     break;
1556                 default:
1557                     error("Error: 2 hex digits expected.");
1558                     return false;
1559                 }
1560             }
1561             break;
1562         case '0':
1563             if (!(range.index + 1 < range.bytes.length) || ((range.index + 1 < range.bytes.length) && range.peekAt(1) == '\''))
1564             {
1565                 range.popFront();
1566                 break;
1567             }
1568             goto case;
1569         case '1': .. case '7':
1570             for (size_t i = 0; i < 3 && !(range.index >= range.bytes.length) && range.bytes[range.index] >= '0' && range.bytes[range.index] <= '7'; i++)
1571                 range.popFront();
1572             break;
1573         case 'u':
1574             range.popFront();
1575             foreach (i; 0 .. 4)
1576             {
1577                 if ((range.index >= range.bytes.length))
1578                 {
1579                     error("Error: at least 4 hex digits expected.");
1580                     return false;
1581                 }
1582                 switch (range.bytes[range.index])
1583                 {
1584                 case '0': .. case '9':
1585                 case 'a': .. case 'f':
1586                 case 'A': .. case 'F':
1587                     range.popFront();
1588                     break;
1589                 default:
1590                     error("Error: at least 4 hex digits expected.");
1591                     return false;
1592                 }
1593             }
1594             break;
1595         case 'U':
1596             range.popFront();
1597             foreach (i; 0 .. 8)
1598             {
1599                 if ((range.index >= range.bytes.length))
1600                 {
1601                     error("Error: at least 8 hex digits expected.");
1602                     return false;
1603                 }
1604                 switch (range.bytes[range.index])
1605                 {
1606                 case '0': .. case '9':
1607                 case 'a': .. case 'f':
1608                 case 'A': .. case 'F':
1609                     range.popFront();
1610                     break;
1611                 default:
1612                     error("Error: at least 8 hex digits expected.");
1613                     return false;
1614                 }
1615             }
1616             break;
1617         default:
1618             while (true)
1619             {
1620                 if ((range.index >= range.bytes.length))
1621                 {
1622                     error("Error: non-terminated character escape sequence.");
1623                     return false;
1624                 }
1625                 if (range.bytes[range.index] == ';')
1626                 {
1627                     range.popFront();
1628                     break;
1629                 }
1630                 else
1631                 {
1632                     range.popFront();
1633                 }
1634             }
1635         }
1636         return true;
1637     }
1638 
1639     void lexCharacterLiteral(ref Token token)
1640     {
1641         mixin (tokenStart);
1642         range.popFront();
1643         if (range.bytes[range.index] == '\\')
1644         {
1645             lexEscapeSequence();
1646             goto close;
1647         }
1648         else if (range.bytes[range.index] == '\'')
1649         {
1650             range.popFront();
1651             token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
1652                 line, column, index);
1653         }
1654         else if (range.bytes[range.index] & 0x80)
1655         {
1656             while (range.bytes[range.index] & 0x80)
1657             {
1658                 range.popFront();
1659             }
1660             goto close;
1661         }
1662         else
1663         {
1664             popFrontWhitespaceAware();
1665             goto close;
1666         }
1667     close:
1668         if (range.index < range.bytes.length && range.bytes[range.index] == '\'')
1669         {
1670             range.popFront();
1671             token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
1672                 line, column, index);
1673         }
1674         else
1675         {
1676             error("Error: Expected ' to end character literal");
1677             token = Token(tok!"");
1678         }
1679     }
1680 
1681     void lexIdentifier(ref Token token) @trusted
1682     {
1683         mixin (tokenStart);
1684         if (isSeparating(0))
1685         {
1686             error("Invalid identifier");
1687             range.popFront();
1688         }
1689         while (true)
1690         {
1691             version (iasm64NotWindows)
1692             {
1693                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1694                 {
1695                     immutable ulong i = rangeMatch!(false, 'a', 'z', 'A', 'Z', '_', '_')
1696                         (range.bytes.ptr + range.index);
1697                     range.column += i;
1698                     range.index += i;
1699                 }
1700             }
1701             if (isSeparating(0))
1702                 break;
1703             else
1704                 range.popFront();
1705         }
1706         token = Token(tok!"identifier", cache.intern(range.slice(mark)), line,
1707             column, index);
1708     }
1709 
1710     void lexDot(ref Token token)
1711     {
1712         mixin (tokenStart);
1713         if (!(range.index + 1 < range.bytes.length))
1714         {
1715             range.popFront();
1716             token = Token(tok!".", null, line, column, index);
1717             return;
1718         }
1719         switch (range.peekAt(1))
1720         {
1721         case '0': .. case '9':
1722             lexNumber(token);
1723             return;
1724         case '.':
1725             range.popFront();
1726             range.popFront();
1727             if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '.')
1728             {
1729                 range.popFront();
1730                 token = Token(tok!"...", null, line, column, index);
1731             }
1732             else
1733                 token = Token(tok!"..", null, line, column, index);
1734             return;
1735         default:
1736             range.popFront();
1737             token = Token(tok!".", null, line, column, index);
1738             return;
1739         }
1740     }
1741 
1742     void lexLongNewline(ref Token token) @nogc
1743     {
1744         mixin (tokenStart);
1745         range.popFront();
1746         range.popFront();
1747         range.popFront();
1748         range.incrementLine();
1749         string text = config.whitespaceBehavior == WhitespaceBehavior.include
1750             ? cache.intern(range.slice(mark)) : "";
1751         token = Token(tok!"whitespace", text, line,
1752             column, index);
1753     }
1754 
1755     bool isNewline() @nogc
1756     {
1757         if (range.bytes[range.index] == '\n') return true;
1758         if (range.bytes[range.index] == '\r') return true;
1759         return (range.bytes[range.index] & 0x80) && (range.index + 2 < range.bytes.length)
1760             && (range.peek(2) == "\u2028" || range.peek(2) == "\u2029");
1761     }
1762 
1763     bool isSeparating(size_t offset) @nogc
1764     {
1765         enum : ubyte
1766         {
1767             n, y, m // no, yes, maybe
1768         }
1769 
1770         if (range.index + offset >= range.bytes.length)
1771             return true;
1772         auto c = range.bytes[range.index + offset];
1773         static immutable ubyte[256] LOOKUP_TABLE = [
1774             y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
1775             y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
1776             y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
1777             n, n, n, n, n, n, n, n, n, n, y, y, y, y, y, y,
1778             y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n,
1779             n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, n,
1780             y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n,
1781             n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, y,
1782             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1783             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1784             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1785             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1786             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1787             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1788             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1789             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m
1790         ];
1791         immutable ubyte result = LOOKUP_TABLE[c];
1792         if (result == n)
1793             return false;
1794         if (result == y)
1795             return true;
1796         if (result == m)
1797         {
1798             auto r = range;
1799             range.popFrontN(offset);
1800             return (r.canPeek(2) && (r.peek(2) == "\u2028"
1801                 || r.peek(2) == "\u2029"));
1802         }
1803         assert (false);
1804     }
1805 
1806 
1807 
    /// Code mixed in at the top of the lexing functions to record where the
    /// current token begins. It declares the locals `index`, `column` and
    /// `line` (copied from `range`) and `mark` (from `range.mark()`), which
    /// those functions later hand to `range.slice` and the `Token`
    /// constructor. The names are relied upon by every mixin site.
    enum tokenStart = q{
        size_t index = range.index;
        size_t column = range.column;
        size_t line = range.line;
        auto mark = range.mark();
    };
1814 
1815     void error(string message)
1816     {
1817         messages ~= Message(range.line, range.column, message, true);
1818     }
1819 
1820     void warning(string message)
1821     {
1822         messages ~= Message(range.line, range.column, message, false);
1823         assert (messages.length > 0);
1824     }
1825 
    /// A diagnostic recorded while lexing; created by `error` and `warning`.
    static struct Message
    {
        /// Line of the range at the time the diagnostic was recorded.
        size_t line;
        /// Column of the range at the time the diagnostic was recorded.
        size_t column;
        /// Text of the diagnostic.
        string message;
        /// `true` for messages recorded by `error`, `false` for `warning`.
        bool isError;
    }
1833 
    /// Diagnostics accumulated by `error` and `warning`, in the order recorded.
    Message[] messages;
    /// String cache used to intern the text of the tokens produced.
    StringCache* cache;
    /// Lexer configuration (e.g. whitespace and string behavior).
    LexerConfig config;
    /// Gates the accelerated scanning paths compiled in under
    /// `version (iasm64NotWindows)`.
    bool haveSSE42;
1838 }
1839 
/**
 * Lexes the given source code into a token range, using a default-initialized
 * lexer configuration and a newly allocated, GC-managed string cache with the
 * default bucket count.
 */
public auto byToken(ubyte[] range)
{
    LexerConfig defaultConfig;
    auto stringCache = new StringCache(StringCache.defaultBucketCount);
    return DLexer(range, defaultConfig, stringCache);
}
1850 
/**
 * Lexes the given source code into a token range, using a default-initialized
 * lexer configuration and the caller-supplied string cache.
 */
public auto byToken(ubyte[] range, StringCache* cache)
{
    LexerConfig defaultConfig;
    return DLexer(range, defaultConfig, cache);
}
1860 
/**
 * Lexes the given source code into a token range, using the caller-supplied
 * lexer configuration and string cache.
 */
public auto byToken(ubyte[] range, const LexerConfig config, StringCache* cache)
{
    auto lexer = DLexer(range, config, cache);
    return lexer;
}
1869 
/**
 * Removes "decoration" such as leading whitespace, leading + and * characters,
 * and places the result into the given output range.
 *
 * The first three characters of the comment select the handling:
 * $(UL
 *     $(LI "///": for every line, the leading spaces and tabs are dropped and
 *         the run of '/' characters that opens each following line is
 *         removed. Lines are separated by a single '\n' in the output.)
 *     $(LI "/++" and "/**": the opening and closing runs of '+' or '*' are
 *         removed. The indentation (and the optional leading '+' or '*') of
 *         the second line is measured and the same amount is stripped from
 *         each of the remaining lines.)
 *     $(LI anything else: the comment is written out unchanged.)
 * )
 *
 * Params:
 *     comment = the complete comment text, including its opening characters.
 *         It must be at least three characters long.
 *     outputRange = the output range that receives the undecorated text
 */
public void unDecorateComment(T)(string comment, auto ref T outputRange)
    if (isOutputRange!(T, string))
in
{
    assert (comment.length >= 3);
}
body
{
    switch (comment[0 .. 3])
    {
    case "///":
        size_t i = 3;
        if (i < comment.length)
        {
        again:
            while (i < comment.length && (comment[i] == ' ' || comment[i] == '\t'))
                i++;
            // Start scanning at i itself: starting at i + 1 would step over
            // the line break of an empty "///" line and glue the decoration
            // of the next line onto the output.
            size_t j = i;
            while (j < comment.length)
            {
                if (comment[j] == '\r')
                    j++;
                if (j >= comment.length)
                    break;
                if (comment[j] == '\n')
                {
                    outputRange.put(comment[i .. j]);
                    j++;
                    // Drop the slashes that open the next line
                    while (j < comment.length && comment[j] == '/')
                        j++;
                    outputRange.put('\n');
                    i = j;
                    goto again;
                }
                j++;
            }
            // Whatever is left is the final line, which has no line break
            if (i < comment.length && j <= comment.length)
                outputRange.put(comment[i .. j]);
        }
        break;
    case "/++":
    case "/**":
        size_t i = 3;
        immutable char c = comment[1];
        // Skip leading * and + characters. The bound matters for comments
        // that consist of nothing but the opening run, e.g. "/**".
        while (i < comment.length && comment[i] == c) i++;
        // Skip trailing * and + characters
        size_t j = comment.length - 2;
        while (j > i && comment[j] == c)
            j--;
        while (j > i && (comment[j] == ' ' || comment[j] == '\t'))
            j--;
        j++;
        // First line: everything up to and including the first line break
        size_t k = i;
        while (k < j)
        {
            if (comment[k] == '\n')
            {
                k++;
                break;
            }
            k++;
        }
        outputRange.put(comment[i .. k]);
        i = k;
        // Second line: measure its decoration so that the same amount can be
        // removed from the lines that follow. Every access is bounded by the
        // comment length so that short or unterminated comments cannot index
        // past the end of the string.
        if (i < comment.length && comment[i] == '\r') i++;
        if (i < comment.length && comment[i] == '\n') i++;
        while (i < comment.length && (comment[i] == ' ' || comment[i] == '\t')) i++;
        immutable bool skipBeginningChar = i < comment.length && comment[i] == c;
        if (skipBeginningChar)
            i++;
        size_t whitespaceToSkip;
        while (i < comment.length && (comment[i] == ' ' || comment[i] == '\t'))
        {
            whitespaceToSkip++;
            i++;
        }
        size_t l = i;
        while (i < j)
        {
            if (comment[i++] == '\n')
                break;
        }
        outputRange.put(comment[l .. i]);
        // Remaining lines: strip the same decoration as on the second line
        while (true)
        {
            if (skipBeginningChar)
            {
                while (i < j && (comment[i] == ' ' || comment[i] == '\t')) i++;
                if (i < j && comment[i] == c) i++;
            }
            for (size_t s = 0; (i < j) && (s < whitespaceToSkip)
                && (comment[i] == ' ' || comment[i] == '\t');)
            {
                s++;
                i++;
            }
            k = i;
            inner: while (k < j)
            {
                if (comment[k] == '\n')
                {
                    k++;
                    break inner;
                }
                k++;
            }
            outputRange.put(comment[i .. k]);
            i = k;
            if (i >= j)
                break;
        }
        break;
    default:
        outputRange.put(comment);
        break;
    }
}
1992 
1993 
/**
 * The string cache is used for string interning.
 *
 * It will only store a single copy of any string that it is asked to hold.
 * Interned strings can be compared for equality by comparing their $(B .ptr)
 * field.
 *
 * Default and postblit constructors are disabled. The cache has no destructor:
 * the memory it holds is released only by calling $(D freeItAll). After that
 * call every string previously returned by $(D intern) points to freed memory
 * and the cache must not be used again.
 *
 * See_also: $(LINK http://en.wikipedia.org/wiki/String_interning)
 */
struct StringCache
{
public pure nothrow @nogc:

    @disable this();
    @disable this(this);

    /**
     * Params: bucketCount = the number of hash buckets. Must be a
     * power of two
     */
    this(size_t bucketCount) nothrow @trusted @nogc
    in
    {
        // Exactly one bit set <=> non-zero power of two
        assert (bucketCount != 0 && (bucketCount & (bucketCount - 1)) == 0);
    }
    body
    {
        import core.exception : onOutOfMemoryError;

        auto table = cast(Node**) calloc((Node*).sizeof, bucketCount);
        if (table is null)
            onOutOfMemoryError();
        buckets = table[0 .. bucketCount];
    }

    /**
     * Releases every block, node and bucket owned by the cache.
     *
     * All strings handed out by $(D intern) are invalid afterwards. Calling
     * this function a second time is harmless because the internal pointers
     * are reset to null.
     */
    void freeItAll()
    {
        Block* current = rootBlock;
        while (current !is null)
        {
            Block* prev = current;
            current = current.next;
            free(cast(void*) prev);
        }
        foreach (nodePointer; buckets)
        {
            Node* currentNode = nodePointer;
            while (currentNode !is null)
            {
                // Only strings that were too big for a block own their memory
                if (currentNode.mallocated)
                    free(currentNode.str.ptr);
                Node* prev = currentNode;
                currentNode = currentNode.next;
                free(prev);
            }
        }
        rootBlock = null;
        free(buckets.ptr);
        buckets = null;
    }

    /**
     * Caches a string.
     *
     * Params: str = the bytes to intern
     * Returns: the cache's own copy of $(D str), or the empty string if
     *     $(D str) is null or empty
     */
    string intern(const(ubyte)[] str) @safe
    {
        if (str is null || str.length == 0)
            return "";
        return _intern(str);
    }

    /**
     * ditto
     */
    string intern(string str) @trusted
    {
        // The bytes are only read, so there is no need to cast away immutable
        return intern(cast(const(ubyte)[]) str);
    }

    /**
     * The default bucket count for the string cache.
     */
    static enum defaultBucketCount = 4096;

private:

    /**
     * Looks up $(D bytes) and returns the stored copy, inserting a new copy
     * at the head of the matching bucket if none exists yet.
     */
    string _intern(const(ubyte)[] bytes) @trusted
    {
        import core.exception : onOutOfMemoryError;

        immutable uint hash = hashBytes(bytes);
        immutable size_t index = hash & (buckets.length - 1);
        Node* s = find(bytes, hash);
        if (s !is null)
            return cast(string) s.str;
        ubyte[] mem = void;
        immutable bool mallocated = bytes.length > BIG_STRING;
        if (mallocated)
        {
            auto raw = cast(ubyte*) malloc(bytes.length);
            if (raw is null)
                onOutOfMemoryError();
            mem = raw[0 .. bytes.length];
        }
        else
            mem = allocate(bytes.length);
        mem[] = bytes[];
        Node* node = cast(Node*) malloc(Node.sizeof);
        if (node is null)
            onOutOfMemoryError();
        node.str = mem;
        node.hash = hash;
        node.next = buckets[index];
        node.mallocated = mallocated;
        buckets[index] = node;
        return cast(string) mem;
    }

    /**
     * Returns: the node that holds $(D bytes), or null if the bytes have not
     *     been interned
     */
    Node* find(const(ubyte)[] bytes, uint hash) @trusted
    {
        immutable size_t index = hash & (buckets.length - 1);
        Node* node = buckets[index];
        while (node !is null)
        {
            // Compare the cheap hash first, the contents only on a hash match
            if (node.hash == hash && bytes == node.str)
                return node;
            node = node.next;
        }
        return node;
    }

    /**
     * Hashes $(D data), four bytes at a time, mixing with the multiplier
     * 0x5bd1e995.
     */
    static uint hashBytes(const(ubyte)[] data) pure nothrow @trusted @nogc
    in
    {
        assert (data !is null);
        assert (data.length > 0);
    }
    body
    {
        immutable uint m = 0x5bd1e995;
        immutable int r = 24;
        uint h = cast(uint) data.length;
        while (data.length >= 4)
        {
            uint k = (cast(ubyte) data[3]) << 24
                | (cast(ubyte) data[2]) << 16
                | (cast(ubyte) data[1]) << 8
                | (cast(ubyte) data[0]);
            k *= m;
            k ^= k >> r;
            k *= m;
            h *= m;
            h ^= k;
            data = data[4 .. $];
        }
        // Fold in the one to three bytes that did not fill a whole word
        switch (data.length & 3)
        {
        case 3:
            h ^= data[2] << 16;
            goto case;
        case 2:
            h ^= data[1] << 8;
            goto case;
        case 1:
            h ^= data[0];
            h *= m;
            break;
        default:
            break;
        }
        h ^= h >> 13;
        h *= m;
        h ^= h >> 15;
        return h;
    }

    /**
     * Carves $(D numBytes) out of one of the first four blocks in the block
     * list, or out of a freshly allocated block that becomes the new head of
     * the list when none of them has enough room.
     */
    ubyte[] allocate(size_t numBytes) pure nothrow @trusted @nogc
    in
    {
        assert (numBytes != 0);
    }
    out (result)
    {
        assert (result.length == numBytes);
    }
    body
    {
        import core.exception : onOutOfMemoryError;

        Block* r = rootBlock;
        size_t i = 0;
        while  (i <= 3 && r !is null)
        {
            immutable size_t available = r.bytes.length;
            immutable size_t oldUsed = r.used;
            immutable size_t newUsed = oldUsed + numBytes;
            if (newUsed <= available)
            {
                r.used = newUsed;
                return r.bytes[oldUsed .. newUsed];
            }
            i++;
            r = r.next;
        }
        Block* b = cast(Block*) calloc(Block.sizeof, 1);
        if (b is null)
            onOutOfMemoryError();
        b.used = numBytes;
        b.next = rootBlock;
        rootBlock = b;
        return b.bytes[0 .. numBytes];
    }

    // One entry of a bucket's singly linked list. Nodes are malloc'ed and
    // every field is assigned by _intern, hence the void initializers.
    static struct Node
    {
        ubyte[] str = void;
        Node* next = void;
        uint hash = void;
        // true if str was allocated with malloc instead of inside a Block
        bool mallocated = void;
    }

    // A fixed-size chunk of memory that small strings are packed into.
    static struct Block
    {
        Block* next;
        size_t used;
        enum BLOCK_CAPACITY = BLOCK_SIZE - size_t.sizeof - (void*).sizeof;
        ubyte[BLOCK_CAPACITY] bytes;
    }

    static assert (BLOCK_SIZE == Block.sizeof);

    enum BLOCK_SIZE = 1024 * 16;

    // If a string would take up more than 1/4 of a block, allocate it outside
    // of the block.
    enum BIG_STRING = BLOCK_SIZE / 4;

    Node*[] buckets;
    Block* rootBlock;
}

// C allocator prototypes, declared pure so that the cache can be used from
// pure code.
private extern(C) void* calloc(size_t, size_t) nothrow pure @nogc @trusted;
private extern(C) void* malloc(size_t) nothrow pure @nogc @trusted;
private extern(C) void free(void*) nothrow pure @nogc @trusted;
2235 
unittest
{
    // Lex a minimal import declaration and check the sequence of token types.
    // The token string must end with exactly one closing brace; a second one
    // closes the unittest block early and the file no longer parses.
    auto source = cast(ubyte[]) q{ import std.stdio;};
    auto tokens = getTokensForParser(source, LexerConfig(),
        new StringCache(StringCache.defaultBucketCount));
    assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".",
        tok!"identifier", tok!";"]));
}
2244 
/// Test \x char sequence
unittest
{
    enum expectedError = "Error: 2 hex digits expected.";
    enum hexDigits = "0123456789abcdefABCDEF";
    auto lex = (string text) => byToken(cast(ubyte[]) text);

    // valid: every pair of hex digits must lex as a character literal
    string literals;
    foreach (high; hexDigits)
        foreach (low; hexDigits)
            literals ~= "'\\x" ~ high ~ low ~ "'";
    assert (lex(literals).filter!(token => token.type != tok!"characterLiteral").empty);

    // invalid: the first message reports the column of the offending digit
    assert (lex(`'\x'`).messages[0] == DLexer.Message(1, 4, expectedError, true));
    assert (lex(`'\x_'`).messages[0] == DLexer.Message(1, 4, expectedError, true));
    assert (lex(`'\xA'`).messages[0] == DLexer.Message(1, 5, expectedError, true));
    assert (lex(`'\xAY'`).messages[0] == DLexer.Message(1, 5, expectedError, true));
    assert (lex(`'\xXX'`).messages[0] == DLexer.Message(1, 4, expectedError, true));
}
2265 
2266 version (iasm64NotWindows)
2267 {
2268     /**
2269      * Returns:
2270      */
2271     ushort newlineMask(const ubyte*) pure nothrow @trusted @nogc
2272     {
2273         asm pure nothrow @nogc
2274         {
2275             naked;
2276             movdqu XMM1, [RDI];
2277             mov RAX, 3;
2278             mov RDX, 16;
2279             mov R8, 0x0d0d0d0d0d0d0d0dL;
2280             movq XMM2, R8;
2281             shufpd XMM2, XMM2, 0;
2282             pcmpeqb XMM2, XMM1;
2283             mov R9, 0x0a0a0a0a0a0a0a0aL;
2284             movq XMM3, R9;
2285             shufpd XMM3, XMM3, 0;
2286             pcmpeqb XMM3, XMM1;
2287             mov R10, 0xe280a8L;
2288             movq XMM4, R10;
2289             pcmpestrm XMM4, XMM1, 0b01001100;
2290             movdqa XMM4, XMM0;
2291             mov R11, 0xe280a9L;
2292             movq XMM5, R11;
2293             pcmpestrm XMM5, XMM1, 0b01001100;
2294             movdqa XMM5, XMM0;
2295             mov RCX, 0x0a0d;
2296             dec RAX;
2297             movq XMM6, RCX;
2298             pcmpestrm XMM6, XMM1, 0b01001100;
2299             movdqa XMM6, XMM0;
2300             movdqa XMM7, XMM6;
2301             pslldq XMM7, 1;
2302             movdqa XMM0, XMM4;
2303             por XMM0, XMM5;
2304             por XMM7, XMM6;
2305             movdqa XMM1, XMM2;
2306             por XMM1, XMM3;
2307             pxor XMM7, XMM1;
2308             por XMM7, XMM0;
2309             por XMM7, XMM6;
2310             pmovmskb RAX, XMM7;
2311             and RAX, 0b0011_1111_1111_1111;
2312             ret;
2313         }
2314     }
2315 
2316     /**
2317      * Skips between 0 and 16 bytes that match (or do not match) one of the
2318      * given $(B chars).
2319      */
2320     void skip(bool matching, chars...)(const ubyte*, ulong*, ulong*) pure nothrow
2321         @trusted @nogc if (chars.length <= 8)
2322     {
2323         enum constant = ByteCombine!chars;
2324         enum charsLength = chars.length;
2325         static if (matching)
2326             enum flags = 0b0001_0000;
2327         else
2328             enum flags = 0b0000_0000;
2329         asm pure nothrow @nogc
2330         {
2331             naked;
2332             movdqu XMM1, [RDX];
2333             mov R10, constant;
2334             movq XMM2, R10;
2335             mov RAX, charsLength;
2336             mov RDX, 16;
2337             pcmpestri XMM2, XMM1, flags;
2338             add [RSI], RCX;
2339             add [RDI], RCX;
2340             ret;
2341         }
2342     }
2343 
2344     /**
2345      * Returns: the number of bytes starting at the given location that match
2346      *     (or do not match if $(B invert) is true) the byte ranges in $(B chars).
2347      */
2348     ulong rangeMatch(bool invert, chars...)(const ubyte*) pure nothrow @trusted @nogc
2349     {
2350         static assert (chars.length % 2 == 0);
2351         enum constant = ByteCombine!chars;
2352         static if (invert)
2353             enum rangeMatchFlags = 0b0000_0100;
2354         else
2355             enum rangeMatchFlags = 0b0001_0100;
2356         enum charsLength = chars.length;
2357         asm pure nothrow @nogc
2358         {
2359             naked;
2360             movdqu XMM1, [RDI];
2361             mov R10, constant;
2362             movq XMM2, R10;
2363             mov RAX, charsLength;
2364             mov RDX, 16;
2365             pcmpestri XMM2, XMM1, rangeMatchFlags;
2366             mov RAX, RCX;
2367             ret;
2368         }
2369     }
2370 
2371     template ByteCombine(c...)
2372     {
2373         static assert (c.length <= 8);
2374         static if (c.length > 1)
2375             enum ulong ByteCombine = c[0] | (ByteCombine!(c[1..$]) << 8);
2376         else
2377             enum ulong ByteCombine = c[0];
2378     }
2379 }