1 module dparse.lexer;
2 
3 import std.typecons;
4 import std.typetuple;
5 import std.array;
6 import std.algorithm;
7 import std.range;
8 import std.experimental.lexer;
9 import core.cpuid : sse42;
// Defines `iasm64NotWindows` when 64-bit x86 inline assembly is available and
// the target is not Windows. The lexing routines further down use this version
// identifier to enable their SSE4.2 fast paths (`skip`, `rangeMatch`).
version (D_InlineAsm_X86_64)
{
    version (Windows) {}
    else version = iasm64NotWindows;
}
15 
/// Operators and punctuation. This list is one of the three string tables
/// passed to `TokenIdType`, `tokenStringRepresentation`, `TokenId` and the
/// `Lexer` mixin; `isOperator` below enumerates the same entries.
private enum operators = [
    ",", ".", "..", "...", "/", "/=", "!", "!<", "!<=", "!<>", "!<>=", "!=",
    "!>", "!>=", "$", "%", "%=", "&", "&&", "&=", "(", ")", "*", "*=", "+", "++",
    "+=", "-", "--", "-=", ":", ";", "<", "<<", "<<=", "<=", "<>", "<>=", "=",
    "==", "=>", ">", ">=", ">>", ">>=", ">>>", ">>>=", "?", "@", "[", "]", "^",
    "^=", "^^", "^^=", "{", "|", "|=", "||", "}", "~", "~="
];
24 
/// Keywords. Includes the basic type names (`int`, `bool`, ...) and the
/// double-underscore special tokens (`__FILE__`, `__traits`, ...).
private enum keywords = [
    "abstract", "alias", "align", "asm", "assert", "auto", "body", "bool",
    "break", "byte", "case", "cast", "catch", "cdouble", "cent", "cfloat",
    "char", "class", "const", "continue", "creal", "dchar", "debug", "default",
    "delegate", "delete", "deprecated", "do", "double", "else", "enum",
    "export", "extern", "false", "final", "finally", "float", "for", "foreach",
    "foreach_reverse", "function", "goto", "idouble", "if", "ifloat",
    "immutable", "import", "in", "inout", "int", "interface", "invariant",
    "ireal", "is", "lazy", "long", "macro", "mixin", "module", "new", "nothrow",
    "null", "out", "override", "package", "pragma", "private", "protected",
    "public", "pure", "real", "ref", "return", "scope", "shared", "short",
    "static", "struct", "super", "switch", "synchronized", "template", "this",
    "throw", "true", "try", "typedef", "typeid", "typeof", "ubyte", "ucent",
    "uint", "ulong", "union", "unittest", "ushort", "version", "void",
    "volatile", "wchar", "while", "with", "__DATE__", "__EOF__", "__FILE__",
    "__FUNCTION__", "__gshared", "__LINE__", "__MODULE__", "__parameters",
    "__PRETTY_FUNCTION__", "__TIME__", "__TIMESTAMP__", "__traits", "__vector",
    "__VENDOR__", "__VERSION__"
];
45 
/// Other tokens: names for token kinds that are neither operators nor
/// keywords, such as literals, identifiers, comments and whitespace.
private enum dynamicTokens = [
    "specialTokenSequence", "comment", "identifier", "scriptLine",
    "whitespace", "doubleLiteral", "floatLiteral", "idoubleLiteral",
    "ifloatLiteral", "intLiteral", "longLiteral", "realLiteral",
    "irealLiteral", "uintLiteral", "ulongLiteral", "characterLiteral",
    "dstringLiteral", "stringLiteral", "wstringLiteral"
];
54 
/// Flat list of (prefix, handler name) pairs handed to the `Lexer` mixin in
/// `DLexer`. Even positions hold the leading text of a token, odd positions
/// hold the name of the `DLexer` member that lexes it (for example
/// `lexWhitespace`, `lexNumber`, `lexDecimal`).
/// NOTE(review): how the mixin dispatches on these pairs is defined in
/// std.experimental.lexer, which is not part of this file.
private enum pseudoTokenHandlers = [
    "\"", "lexStringLiteral",
    "`", "lexWysiwygString",
    "//", "lexSlashSlashComment",
    "/*", "lexSlashStarComment",
    "/+", "lexSlashPlusComment",
    ".", "lexDot",
    "'", "lexCharacterLiteral",
    "0", "lexNumber",
    "1", "lexDecimal",
    "2", "lexDecimal",
    "3", "lexDecimal",
    "4", "lexDecimal",
    "5", "lexDecimal",
    "6", "lexDecimal",
    "7", "lexDecimal",
    "8", "lexDecimal",
    "9", "lexDecimal",
    "q\"", "lexDelimitedString",
    "q{", "lexTokenString",
    "r\"", "lexWysiwygString",
    "x\"", "lexHexString",
    " ", "lexWhitespace",
    "\t", "lexWhitespace",
    "\r", "lexWhitespace",
    "\n", "lexWhitespace",
    "\v", "lexWhitespace",
    "\f", "lexWhitespace",
    "\u2028", "lexLongNewline",
    "\u2029", "lexLongNewline",
    "#!", "lexScriptLine",
    "#line", "lexSpecialTokenSequence"
];
88 
/// Token ID type for the D lexer, generated from the `operators`,
/// `dynamicTokens` and `keywords` tables.
public alias IdType = TokenIdType!(operators, dynamicTokens, keywords);
91 
/**
 * Function used for converting an IdType to a string.
 *
 * It is generated from the same three tables as `IdType`.
 *
 * Examples:
 * ---
 * IdType c = tok!"case";
 * assert (str(c) == "case");
 * ---
 */
public alias str = tokenStringRepresentation!(IdType, operators, dynamicTokens, keywords);
102 
/**
 * Template used to refer to D token types.
 *
 * See the $(B operators), $(B keywords), and $(B dynamicTokens) enums for
 * values that can be passed to this template.
 *
 * Params:
 *     token = the spelling of the token (or the dynamic token name) to look up.
 *
 * Example:
 * ---
 * import dparse.lexer;
 * IdType t = tok!"floatLiteral";
 * ---
 */
public template tok(string token)
{
    alias tok = TokenId!(IdType, operators, dynamicTokens, keywords, token);
}
118 
// Source text handed to `TokenStructure` when the `Token` type is declared.
// It adds two comment strings (`comment`, `trailingComment`) and orders
// tokens by `index`.
// NOTE(review): `index` is not declared in this snippet; presumably it is a
// field supplied by `TokenStructure` — confirm in std.experimental.lexer.
// The contents of the token string are code, so they are left untouched.
private enum extraFields = q{
    string comment;
    string trailingComment;

    int opCmp(size_t i) const pure nothrow @safe {
        if (index < i) return -1;
        if (index > i) return 1;
        return 0;
    }

    int opCmp(ref const typeof(this) other) const pure nothrow @safe {
        return opCmp(other.index);
    }
};
133 
/// The token type in the D lexer: `TokenStructure` instantiated with `IdType`
/// and the extra members from `extraFields` (`comment`, `trailingComment`,
/// `opCmp`).
public alias Token = std.experimental.lexer.TokenStructure!(IdType, extraFields);
136 
/**
 * Configure whitespace handling
 */
public enum WhitespaceBehavior : ubyte
{
    /// Whitespace tokens are returned and their text is interned.
    include = 0b0000_0000,
    /// `DLexer.popFront` steps over whitespace tokens and `lexWhitespace`
    /// gives them empty text.
    skip = 0b0000_0001,
}
145 
/**
 * Configure string lexing behavior.
 *
 * The members are bit flags; `source` is the combination of
 * `includeQuoteChars` and `notEscaped`.
 */
public enum StringBehavior : ubyte
{
    /// Do not include quote characters, process escape sequences
    compiler = 0b0000_0000,
    /// Opening quotes, closing quotes, and string suffixes are included in the
    /// string token
    includeQuoteChars = 0b0000_0001,
    /// String escape sequences are not replaced
    notEscaped = 0b0000_0010,
    /// Not modified at all. Useful for formatters or highlighters
    source = includeQuoteChars | notEscaped
}
161 
/**
 * Lexer configuration struct
 */
public struct LexerConfig
{
    /// Name of the file being lexed.
    /// NOTE(review): not read anywhere in the visible part of this file —
    /// presumably used for diagnostics; confirm.
    string fileName;
    /// How string literal tokens are represented; see `StringBehavior`.
    StringBehavior stringBehavior;
    /// Whether whitespace tokens are produced; see `WhitespaceBehavior`.
    /// `getTokensForParser` always overrides this with `skip`.
    WhitespaceBehavior whitespaceBehavior;
}
171 
/**
 * Tests whether a token ID names one of D's built-in types.
 *
 * Params:
 *     type = the token ID to classify.
 * Returns: true if the given ID is for a basic type.
 */
public bool isBasicType(IdType type) nothrow pure @safe @nogc
{
    switch (type)
    {
    // integral types and bool
    case tok!"bool", tok!"byte", tok!"ubyte", tok!"short", tok!"ushort",
        tok!"int", tok!"uint", tok!"long", tok!"ulong", tok!"cent", tok!"ucent":
    // character types
    case tok!"char", tok!"wchar", tok!"dchar":
    // real, imaginary and complex floating point types
    case tok!"float", tok!"double", tok!"real":
    case tok!"ifloat", tok!"idouble", tok!"ireal":
    case tok!"cfloat", tok!"cdouble", tok!"creal":
    case tok!"void":
        return true;
    default:
        return false;
    }
}
208 
/**
 * Tests whether a token ID denotes an integer or floating point literal.
 *
 * Params:
 *     type = the token ID to classify.
 * Returns: true if the given ID type is for a number literal.
 */
public bool isNumberLiteral(IdType type) nothrow pure @safe @nogc
{
    switch (type)
    {
    // integer literals
    case tok!"intLiteral", tok!"uintLiteral", tok!"longLiteral", tok!"ulongLiteral":
    // floating point literals
    case tok!"floatLiteral", tok!"doubleLiteral", tok!"realLiteral":
    // imaginary literals
    case tok!"ifloatLiteral", tok!"idoubleLiteral", tok!"irealLiteral":
        return true;
    default:
        return false;
    }
}
231 
/**
 * Tests whether a token ID is one of the entries of the `operators` table.
 *
 * Params:
 *     type = the token ID to classify.
 * Returns: true if the given ID type is for an operator.
 */
public bool isOperator(IdType type) nothrow pure @safe @nogc
{
    // The case lists follow the rows of the `operators` table.
    switch (type)
    {
    case tok!",", tok!".", tok!"..", tok!"...", tok!"/", tok!"/=", tok!"!",
        tok!"!<", tok!"!<=", tok!"!<>", tok!"!<>=", tok!"!=":
    case tok!"!>", tok!"!>=", tok!"$", tok!"%", tok!"%=", tok!"&", tok!"&&",
        tok!"&=", tok!"(", tok!")", tok!"*", tok!"*=", tok!"+", tok!"++":
    case tok!"+=", tok!"-", tok!"--", tok!"-=", tok!":", tok!";", tok!"<",
        tok!"<<", tok!"<<=", tok!"<=", tok!"<>", tok!"<>=", tok!"=":
    case tok!"==", tok!"=>", tok!">", tok!">=", tok!">>", tok!">>=", tok!">>>",
        tok!">>>=", tok!"?", tok!"@", tok!"[", tok!"]", tok!"^":
    case tok!"^=", tok!"^^", tok!"^^=", tok!"{", tok!"|", tok!"|=", tok!"||",
        tok!"}", tok!"~", tok!"~=":
        return true;
    default:
        return false;
    }
}
306 
/**
 * Tests whether a token ID is a keyword.
 *
 * The basic type names (`int`, `bool`, ...) are part of the `keywords` table
 * but are deliberately not matched here; use `isBasicType` for those.
 *
 * Params:
 *     type = the token ID to classify.
 * Returns: true if the given ID type is for a keyword.
 */
public bool isKeyword(IdType type) pure nothrow @safe @nogc
{
    // Case lists are grouped by the first letter of the keyword.
    switch (type)
    {
    case tok!"abstract", tok!"alias", tok!"align", tok!"asm", tok!"assert", tok!"auto":
    case tok!"body", tok!"break":
    case tok!"case", tok!"cast", tok!"catch", tok!"class", tok!"const", tok!"continue":
    case tok!"debug", tok!"default", tok!"delegate", tok!"delete", tok!"deprecated", tok!"do":
    case tok!"else", tok!"enum", tok!"export", tok!"extern":
    case tok!"false", tok!"final", tok!"finally", tok!"for", tok!"foreach",
        tok!"foreach_reverse", tok!"function":
    case tok!"goto":
    case tok!"if", tok!"immutable", tok!"import", tok!"in", tok!"inout",
        tok!"interface", tok!"invariant", tok!"is":
    case tok!"lazy":
    case tok!"macro", tok!"mixin", tok!"module":
    case tok!"new", tok!"nothrow", tok!"null":
    case tok!"out", tok!"override":
    case tok!"package", tok!"pragma", tok!"private", tok!"protected", tok!"public", tok!"pure":
    case tok!"ref", tok!"return":
    case tok!"scope", tok!"shared", tok!"static", tok!"struct", tok!"super",
        tok!"switch", tok!"synchronized":
    case tok!"template", tok!"this", tok!"throw", tok!"true", tok!"try",
        tok!"typedef", tok!"typeid", tok!"typeof":
    case tok!"union", tok!"unittest":
    case tok!"version", tok!"volatile":
    case tok!"while", tok!"with":
    case tok!"__DATE__", tok!"__EOF__", tok!"__FILE__", tok!"__FUNCTION__",
        tok!"__gshared", tok!"__LINE__", tok!"__MODULE__", tok!"__parameters",
        tok!"__PRETTY_FUNCTION__", tok!"__TIME__", tok!"__TIMESTAMP__",
        tok!"__traits", tok!"__vector", tok!"__VENDOR__", tok!"__VERSION__":
        return true;
    default:
        return false;
    }
}
412 
/**
 * Tests whether a token ID denotes a string literal of any character width.
 *
 * Params:
 *     type = the token ID to classify.
 * Returns: true if the given ID type is for a string literal.
 */
public bool isStringLiteral(IdType type) pure nothrow @safe @nogc
{
    switch (type)
    {
    case tok!"stringLiteral", tok!"wstringLiteral", tok!"dstringLiteral":
        return true;
    default:
        return false;
    }
}
428 
/**
 * Tests whether a token ID is one of the visibility keywords.
 *
 * Params:
 *     type = the token ID to classify.
 * Returns: true if the given ID type is for a protection attribute.
 */
public bool isProtection(IdType type) pure nothrow @safe @nogc
{
    switch (type)
    {
    case tok!"export", tok!"package", tok!"private", tok!"protected", tok!"public":
        return true;
    default:
        return false;
    }
}
446 
/**
 * Lexes the given source code into the token array a parser consumes.
 *
 * Whitespace and special token sequences are dropped. Comments are never
 * returned as tokens of their own. Instead:
 * $(UL
 *   $(LI a documentation comment that starts on the same line as the most
 *     recently emitted token becomes that token's `trailingComment`;)
 *   $(LI any other documentation comment is accumulated and stored in the
 *     `comment` field of the next emitted token. Consecutive `///` lines are
 *     folded into one block comment: the first one is rewritten to
 *     `/++ ... +/`, later ones are appended before the closing delimiter of
 *     the pending block comment;)
 *   $(LI non-documentation comments are discarded.)
 * )
 * Lexing stops at the end of input or at the first `__EOF__` token.
 *
 * Params:
 *     sourceCode = the bytes of the source file.
 *     config = lexer configuration; `whitespaceBehavior` is overridden with
 *         `WhitespaceBehavior.skip`.
 *     cache = string cache handed to the `DLexer`.
 * Returns: the lexed tokens with their comments attached.
 */
const(Token)[] getTokensForParser(ubyte[] sourceCode, LexerConfig config,
    StringCache* cache)
{
    import std.string : lastIndexOf;

    enum CommentType : ubyte
    {
        notDoc,
        line,
        block
    }

    // Classifies a comment by its first three characters.
    static CommentType commentType(string comment) pure nothrow @safe
    {
        if (comment.length < 3)
            return CommentType.notDoc;
        if (comment[0 .. 3] == "///")
            return CommentType.line;
        if (comment[0 .. 3] == "/++" || comment[0 .. 3] == "/**")
            return CommentType.block;
        return CommentType.notDoc;
    }

    config.whitespaceBehavior = WhitespaceBehavior.skip;

    auto output = appender!(typeof(return))();
    auto lexer = DLexer(sourceCode, config, cache);
    // Documentation waiting to be attached to the next emitted token.
    string blockComment;
    // Number of tokens written to `output` so far.
    size_t tokenCount;
    loop: while (!lexer.empty) switch (lexer.front.type)
    {
    case tok!"specialTokenSequence":
    case tok!"whitespace":
        lexer.popFront();
        break;
    case tok!"comment":
        final switch (commentType(lexer.front.text))
        {
        case CommentType.block:
            if (tokenCount > 0 && lexer.front.line == output.data[tokenCount - 1].line)
            {
                // Same line as the previous token: trailing comment.
                (cast() output.data[tokenCount - 1]).trailingComment = lexer.front.text;
            }
            else
            {
                blockComment = lexer.front.text;
            }
            lexer.popFront();
            break;
        case CommentType.line:
            if (tokenCount > 0 && lexer.front.line == output.data[tokenCount - 1].line)
            {
                (cast() output.data[tokenCount - 1]).trailingComment = lexer.front.text;
            }
            else
            {
                // Strip the leading "///"; commentType guarantees length >= 3.
                string c = lexer.front.text[3 .. $];
                if (blockComment.length == 0)
                {
                    // First doc line: turn it into a block comment.
                    blockComment = "/++" ~ c ~ "+/";
                }
                else
                {
                    // Reopen the pending block comment so the line can be
                    // appended before its closing delimiter.
                    auto l = blockComment.lastIndexOf("\n");
                    if (l != -1)
                    {
                        blockComment = blockComment[0 .. l + 1];
                    }
                    else
                    {
                        // Single-line block: cut off the closing */ or +/,
                        // but never eat into the three-character opener
                        // (a bare "/**/" would otherwise shrink to "/*" and
                        // the prefix checks below would slice out of range).
                        blockComment = blockComment[0 .. max(size_t(3), blockComment.length - 2)];
                    }
                    if (blockComment[0 .. 3] == "/**")
                        blockComment ~= c ~ "\n*/";
                    else if (blockComment[0 .. 3] == "/++")
                        blockComment ~= c ~ "\n+/";
                    else
                        assert(0);
                }
            }
            lexer.popFront();
            break;
        case CommentType.notDoc:
            lexer.popFront();
            break;
        }
        break;
    case tok!"__EOF__":
        break loop;
    default:
        Token t = lexer.front;
        lexer.popFront();
        tokenCount++;
        // Attach the pending documentation and start collecting afresh.
        t.comment = blockComment;
        blockComment = null;
        output.put(t);
        break;
    }
    return output.data;
}
545 
546 /**
547  * The D lexer struct.
548  */
549 public struct DLexer
550 {
551     mixin Lexer!(Token, lexIdentifier, isSeparating, operators, dynamicTokens,
552         keywords, pseudoTokenHandlers);
553 
554     ///
555     @disable this();
556 
557     /**
558      * Params:
559      *     range = the bytes that compose the source code that will be lexed.
560      *     config = the lexer configuration to use.
561      *     cache = the string interning cache for de-duplicating identifiers and
562      *         other token text.
563      */
564     this(ubyte[] range, const LexerConfig config, StringCache* cache,
565         bool haveSSE42 = sse42()) pure nothrow @safe
566     {
567         this.haveSSE42 = haveSSE42;
568         auto r = (range.length >= 3 && range[0] == 0xef && range[1] == 0xbb && range[2] == 0xbf)
569             ? range[3 .. $] : range;
570         this.range = LexerRange(r);
571         this.config = config;
572         this.cache = cache;
573         popFront();
574     }
575 
576     ///
577     public void popFront()() pure nothrow @safe
578     {
579         do
580             _popFront();
581         while (config.whitespaceBehavior == WhitespaceBehavior.skip
582             && _front.type == tok!"whitespace");
583     }
584 
585 private pure nothrow @safe:
586 
587     bool isWhitespace()
588     {
589         switch (range.bytes[range.index])
590         {
591         case ' ':
592         case '\r':
593         case '\n':
594         case '\t':
595         case '\v':
596         case '\f':
597             return true;
598         case 0xe2:
599             auto peek = range.peek(2);
600             return peek.length == 2
601                 && peek[0] == 0x80
602                 && (peek[1] == 0xa8 || peek[1] == 0xa9);
603         default:
604             return false;
605         }
606     }
607 
608     void popFrontWhitespaceAware()
609     {
610         switch (range.bytes[range.index])
611         {
612         case '\r':
613             range.popFront();
614             if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n')
615             {
616                 range.popFront();
617                 range.incrementLine();
618             }
619             else
620                 range.incrementLine();
621             return;
622         case '\n':
623             range.popFront();
624             range.incrementLine();
625             return;
626         case 0xe2:
627             auto lookahead = range.peek(3);
628             if (lookahead.length == 3 && lookahead[1] == 0x80
629                 && (lookahead[2] == 0xa8 || lookahead[2] == 0xa9))
630             {
631                 range.index+=3;
632                 range.column+=3;
633                 range.incrementLine();
634                 return;
635             }
636             else
637             {
638                 range.popFront();
639                 return;
640             }
641         default:
642             range.popFront();
643             return;
644         }
645     }
646 
647     void lexWhitespace(ref Token token) @trusted
648     {
649         mixin (tokenStart);
650         loop: do
651         {
652             version (iasm64NotWindows)
653             {
654                 if (haveSSE42 && range.index + 16 < range.bytes.length)
655                 {
656                     skip!(true, '\t', ' ', '\v', '\f')(range.bytes.ptr + range.index,
657                         &range.index, &range.column);
658                 }
659             }
660             switch (range.bytes[range.index])
661             {
662             case '\r':
663                 range.popFront();
664                 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n')
665                 {
666                     range.popFront();
667                 }
668                 range.column = 1;
669                 range.line += 1;
670                 break;
671             case '\n':
672                 range.popFront();
673                 range.column = 1;
674                 range.line += 1;
675                 break;
676             case ' ':
677             case '\t':
678             case '\v':
679             case '\f':
680                 range.popFront();
681                 break;
682             case 0xe2:
683                 if (range.index + 2 >= range.bytes.length)
684                     break loop;
685                 if (range.bytes[range.index + 1] != 0x80)
686                     break loop;
687                 if (range.bytes[range.index + 2] == 0xa8 || range.bytes[range.index + 2] == 0xa9)
688                 {
689                     range.index += 3;
690                     range.column += 3;
691                     range.column = 1;
692                     range.line += 1;
693                     break;
694                 }
695                 break loop;
696             default:
697                 break loop;
698             }
699         } while (!(range.index >= range.bytes.length));
700     end:
701         string text = config.whitespaceBehavior == WhitespaceBehavior.include
702             ? cache.intern(range.slice(mark)) : "";
703         token = Token(tok!"whitespace", text, line, column, index);
704     }
705 
706     void lexNumber(ref Token token)
707     {
708         mixin (tokenStart);
709         if (range.bytes[range.index] == '0' && range.index + 1 < range.bytes.length)
710         {
711             auto ahead = range.bytes[range.index + 1];
712             switch (ahead)
713             {
714             case 'x':
715             case 'X':
716                 range.index += 2;
717                 range.column += 2;
718                 lexHex(token, mark, line, column, index);
719                 return;
720             case 'b':
721             case 'B':
722                 range.index += 2;
723                 range.column += 2;
724                 lexBinary(token, mark, line, column, index);
725                 return;
726             default:
727                 lexDecimal(token, mark, line, column, index);
728                 return;
729             }
730         }
731         else
732             lexDecimal(token, mark, line, column, index);
733     }
734 
735     void lexHex(ref Token token)
736     {
737         mixin (tokenStart);
738         lexHex(token, mark, line, column, index);
739     }
740 
741     void lexHex(ref Token token, size_t mark, size_t line, size_t column,
742         size_t index) @trusted
743     {
744         IdType type = tok!"intLiteral";
745         bool foundDot;
746         hexLoop: while (!(range.index >= range.bytes.length))
747         {
748             switch (range.bytes[range.index])
749             {
750             case 'a': .. case 'f':
751             case 'A': .. case 'F':
752             case '0': .. case '9':
753             case '_':
754                 version (iasm64NotWindows)
755                 {
756                     if (haveSSE42 && range.index + 16 < range.bytes.length)
757                     {
758                         immutable ulong i = rangeMatch!(false, '0', '9', 'a', 'f', 'A', 'F', '_', '_')
759                             (range.bytes.ptr + range.index);
760                         range.column += i;
761                         range.index += i;
762                     }
763                     else
764                         range.popFront();
765                 }
766                 else
767                     range.popFront();
768                 break;
769             case 'u':
770             case 'U':
771                 lexIntSuffix(type);
772                 break hexLoop;
773             case 'i':
774                 if (foundDot)
775                     lexFloatSuffix(type);
776                 break hexLoop;
777             case 'L':
778                 if (foundDot)
779                     lexFloatSuffix(type);
780                 else
781                     lexIntSuffix(type);
782                 break hexLoop;
783             case 'p':
784             case 'P':
785                 lexExponent(type);
786                 break hexLoop;
787             case '.':
788                 if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.')
789                     break hexLoop;
790                 else
791                 {
792                     // The following bit of silliness tries to tell the
793                     // difference between "int dot identifier" and
794                     // "double identifier".
795                     if ((range.index + 1 < range.bytes.length))
796                     {
797                         switch (range.peekAt(1))
798                         {
799                         case '0': .. case '9':
800                         case 'A': .. case 'F':
801                         case 'a': .. case 'f':
802                             goto doubleLiteral;
803                         default:
804                             break hexLoop;
805                         }
806                     }
807                     else
808                     {
809                     doubleLiteral:
810                         range.popFront();
811                         foundDot = true;
812                         type = tok!"doubleLiteral";
813                     }
814                 }
815                 break;
816             default:
817                 break hexLoop;
818             }
819         }
820         token = Token(type, cache.intern(range.slice(mark)), line, column,
821             index);
822     }
823 
824     void lexBinary(ref Token token)
825     {
826         mixin (tokenStart);
827         return lexBinary(token, mark, line, column, index);
828     }
829 
830     void lexBinary(ref Token token, size_t mark, size_t line, size_t column,
831         size_t index) @trusted
832     {
833         IdType type = tok!"intLiteral";
834         binaryLoop: while (!(range.index >= range.bytes.length))
835         {
836             switch (range.bytes[range.index])
837             {
838             case '0':
839             case '1':
840             case '_':
841                 version (iasm64NotWindows)
842                 {
843                     if (haveSSE42 && range.index + 16 < range.bytes.length)
844                     {
845                         immutable ulong i = rangeMatch!(false, '0', '1', '_', '_')(
846                             range.bytes.ptr + range.index);
847                         range.column += i;
848                         range.index += i;
849                     }
850                     else
851                         range.popFront();
852                 }
853                 else
854                     range.popFront();
855                 break;
856             case 'u':
857             case 'U':
858             case 'L':
859                 lexIntSuffix(type);
860                 break binaryLoop;
861             default:
862                 break binaryLoop;
863             }
864         }
865         token = Token(type, cache.intern(range.slice(mark)), line, column,
866             index);
867     }
868 
869     void lexDecimal(ref Token token)
870     {
871         mixin (tokenStart);
872         lexDecimal(token, mark, line, column, index);
873     }
874 
875     void lexDecimal(ref Token token, size_t mark, size_t line, size_t column,
876         size_t index) @trusted
877     {
878         bool foundDot = range.bytes[range.index] == '.';
879         IdType type = tok!"intLiteral";
880         if (foundDot)
881         {
882             range.popFront();
883             type = tok!"doubleLiteral";
884         }
885 
886         decimalLoop: while (!(range.index >= range.bytes.length))
887         {
888             switch (range.bytes[range.index])
889             {
890             case '0': .. case '9':
891             case '_':
892                 version (iasm64NotWindows)
893                 {
894                     if (haveSSE42 && range.index + 16 < range.bytes.length)
895                     {
896                         ulong i = rangeMatch!(false, '0', '9', '_', '_')(range.bytes.ptr + range.index);
897                         range.column += i;
898                         range.index += i;
899                     }
900                     else
901                         range.popFront();
902                 }
903                 else
904                     range.popFront();
905                 break;
906             case 'u':
907             case 'U':
908                 if (!foundDot)
909                     lexIntSuffix(type);
910                 break decimalLoop;
911             case 'i':
912                 lexFloatSuffix(type);
913                 break decimalLoop;
914             case 'L':
915                 if (foundDot)
916                     lexFloatSuffix(type);
917                 else
918                     lexIntSuffix(type);
919                 break decimalLoop;
920             case 'f':
921             case 'F':
922                 lexFloatSuffix(type);
923                 break decimalLoop;
924             case 'e':
925             case 'E':
926                 lexExponent(type);
927                 break decimalLoop;
928             case '.':
929                 if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.')
930                     break decimalLoop;
931                 else
932                 {
933                     // The following bit of silliness tries to tell the
934                     // difference between "int dot identifier" and
935                     // "double identifier".
936                     if ((range.index + 1 < range.bytes.length))
937                     {
938                         auto ch = range.peekAt(1);
939                         if (ch <= 0x2f
940                             || (ch >= '0' && ch <= '9')
941                             || (ch >= ':' && ch <= '@')
942                             || (ch >= '[' && ch <= '^')
943                             || (ch >= '{' && ch <= '~')
944                             || ch == '`' || ch == '_')
945                         {
946                             goto doubleLiteral;
947                         }
948                         else
949                             break decimalLoop;
950                     }
951                     else
952                     {
953                     doubleLiteral:
954                         range.popFront();
955                         foundDot = true;
956                         type = tok!"doubleLiteral";
957                     }
958                 }
959                 break;
960             default:
961                 break decimalLoop;
962             }
963         }
964         token = Token(type, cache.intern(range.slice(mark)), line, column,
965             index);
966     }
967 
968     void lexIntSuffix(ref IdType type)
969     {
970         bool secondPass;
971         if (range.bytes[range.index] == 'u' || range.bytes[range.index] == 'U')
972         {
973     U:
974             if (type == tok!"intLiteral")
975                 type = tok!"uintLiteral";
976             else
977                 type = tok!"ulongLiteral";
978             range.popFront();
979             if (secondPass)
980                 return;
981             if (range.index < range.bytes.length && (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l'))
982                 goto L;
983             return;
984         }
985         if (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l')
986         {
987     L:
988             if (type == tok!"uintLiteral")
989                 type = tok!"ulongLiteral";
990             else
991                 type = tok!"longLiteral";
992             range.popFront();
993             if (!secondPass && range.index < range.bytes.length && (range.bytes[range.index] == 'U' || range.bytes[range.index] == 'u'))
994             {
995                 secondPass = true;
996                 goto U;
997             }
998             return;
999         }
1000     }
1001 
1002     void lexFloatSuffix(ref IdType type) pure nothrow @safe
1003     {
1004         switch (range.bytes[range.index])
1005         {
1006         case 'L':
1007             range.popFront();
1008             type = tok!"doubleLiteral";
1009             break;
1010         case 'f':
1011         case 'F':
1012             range.popFront();
1013             type = tok!"floatLiteral";
1014             break;
1015         default:
1016             break;
1017         }
1018         if (!(range.index >= range.bytes.length) && range.bytes[range.index] == 'i')
1019         {
1020             warning("Complex number literals are deprecated");
1021             range.popFront();
1022             if (type == tok!"floatLiteral")
1023                 type = tok!"ifloatLiteral";
1024             else
1025                 type = tok!"idoubleLiteral";
1026         }
1027     }
1028 
1029     void lexExponent(ref IdType type) pure nothrow @safe
1030     {
1031         range.popFront();
1032         bool foundSign = false;
1033         bool foundDigit = false;
1034         while (!(range.index >= range.bytes.length))
1035         {
1036             switch (range.bytes[range.index])
1037             {
1038             case '-':
1039             case '+':
1040                 if (foundSign)
1041                 {
1042                     if (!foundDigit)
1043                     error("Expected an exponent");
1044                     return;
1045                 }
1046                 foundSign = true;
1047                 range.popFront();
1048                 break;
1049             case '0': .. case '9':
1050             case '_':
1051                 foundDigit = true;
1052                 range.popFront();
1053                 break;
1054             case 'L':
1055             case 'f':
1056             case 'F':
1057             case 'i':
1058                 lexFloatSuffix(type);
1059                 return;
1060             default:
1061                 if (!foundDigit)
1062                     error("Expected an exponent");
1063                 return;
1064             }
1065         }
1066     }
1067 
1068     void lexScriptLine(ref Token token)
1069     {
1070         mixin (tokenStart);
1071         while (!(range.index >= range.bytes.length) && !isNewline)
1072         {
1073             range.popFront();
1074         }
1075         token = Token(tok!"scriptLine", cache.intern(range.slice(mark)),
1076             line, column, index);
1077     }
1078 
1079     void lexSpecialTokenSequence(ref Token token)
1080     {
1081         mixin (tokenStart);
1082         while (!(range.index >= range.bytes.length) && !isNewline)
1083         {
1084             range.popFront();
1085         }
1086         token = Token(tok!"specialTokenSequence", cache.intern(range.slice(mark)),
1087             line, column, index);
1088     }
1089 
1090     void lexSlashStarComment(ref Token token) @trusted
1091     {
1092         mixin (tokenStart);
1093         IdType type = tok!"comment";
1094         range.popFrontN(2);
1095         while (range.index < range.bytes.length)
1096         {
1097             version (iasm64NotWindows)
1098             {
1099                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1100                     skip!(false, '\r', '\n', '/', '*', 0xe2)(range.bytes.ptr + range.index,
1101                         &range.index, &range.column);
1102             }
1103             if (range.bytes[range.index] == '*')
1104             {
1105                 range.popFront();
1106                 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/')
1107                 {
1108                     range.popFront();
1109                     break;
1110                 }
1111             }
1112             else
1113                 popFrontWhitespaceAware();
1114         }
1115     end:
1116         token = Token(type, cache.intern(range.slice(mark)), line, column,
1117             index);
1118     }
1119 
1120     void lexSlashSlashComment(ref Token token) @trusted
1121     {
1122         mixin (tokenStart);
1123         IdType type = tok!"comment";
1124         range.popFrontN(2);
1125         while (range.index < range.bytes.length)
1126         {
1127             version (iasm64NotWindows)
1128             {
1129                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1130                 {
1131                     skip!(false, '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
1132                         &range.index, &range.column);
1133                 }
1134             }
1135             if (range.bytes[range.index] == '\r' || range.bytes[range.index] == '\n')
1136                 break;
1137             range.popFront();
1138         }
1139     end:
1140         token =  Token(type, cache.intern(range.slice(mark)), line, column,
1141             index);
1142     }
1143 
1144     void lexSlashPlusComment(ref Token token) @trusted
1145     {
1146         mixin (tokenStart);
1147         IdType type = tok!"comment";
1148         range.index += 2;
1149         range.column += 2;
1150         int depth = 1;
1151         while (depth > 0 && !(range.index >= range.bytes.length))
1152         {
1153             version (iasm64NotWindows)
1154             {
1155                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1156                 {
1157                     skip!(false, '+', '/', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
1158                         &range.index, &range.column);
1159                 }
1160             }
1161             if (range.bytes[range.index] == '+')
1162             {
1163                 range.popFront();
1164                 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/')
1165                 {
1166                     range.popFront();
1167                     depth--;
1168                 }
1169             }
1170             else if (range.bytes[range.index] == '/')
1171             {
1172                 range.popFront();
1173                 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '+')
1174                 {
1175                     range.popFront();
1176                     depth++;
1177                 }
1178             }
1179             else
1180                 popFrontWhitespaceAware();
1181         }
1182         token = Token(type, cache.intern(range.slice(mark)), line, column,
1183             index);
1184     }
1185 
1186     void lexStringLiteral(ref Token token) @trusted
1187     {
1188         mixin (tokenStart);
1189         range.popFront();
1190         while (true)
1191         {
1192             if ((range.index >= range.bytes.length))
1193             {
1194                 error("Error: unterminated string literal");
1195                 token = Token(tok!"");
1196                 return;
1197             }
1198             version (iasm64NotWindows)
1199             {
1200                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1201                 {
1202                     skip!(false, '"', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
1203                         &range.index, &range.column);
1204                 }
1205             }
1206             if (range.bytes[range.index] == '"')
1207             {
1208                 range.popFront();
1209                 break;
1210             }
1211             else if (range.bytes[range.index] == '\\')
1212             {
1213                 lexEscapeSequence();
1214             }
1215             else
1216                 popFrontWhitespaceAware();
1217         }
1218         IdType type = tok!"stringLiteral";
1219         lexStringSuffix(type);
1220         token = Token(type, cache.intern(range.slice(mark)), line, column,
1221             index);
1222     }
1223 
1224     void lexWysiwygString(ref Token token) @trusted
1225     {
1226         mixin (tokenStart);
1227         IdType type = tok!"stringLiteral";
1228         bool backtick = range.bytes[range.index] == '`';
1229         if (backtick)
1230         {
1231             range.popFront();
1232             while (true)
1233             {
1234                 if ((range.index >= range.bytes.length))
1235                 {
1236                     error("Error: unterminated string literal");
1237                     token = Token(tok!"");
1238                     return;
1239                 }
1240                 version (iasm64NotWindows)
1241                 {
1242                     if (haveSSE42 && range.index + 16 < range.bytes.length)
1243                     {
1244                         skip!(false, '\r', '\n', 0xe2, '`')(range.bytes.ptr + range.index,
1245                             &range.index, &range.column);
1246                     }
1247                 }
1248                 if (range.bytes[range.index] == '`')
1249                 {
1250                     range.popFront();
1251                     break;
1252                 }
1253                 else
1254                     popFrontWhitespaceAware();
1255             }
1256         }
1257         else
1258         {
1259             range.popFront();
1260             if ((range.index >= range.bytes.length))
1261             {
1262                 error("Error: unterminated string literal");
1263                 token = Token(tok!"");
1264                 return;
1265             }
1266             range.popFront();
1267             while (true)
1268             {
1269                 if ((range.index >= range.bytes.length))
1270                 {
1271                     error("Error: unterminated string literal");
1272                     token = Token(tok!"");
1273                     return;
1274                 }
1275                 else if (range.bytes[range.index] == '"')
1276                 {
1277                     range.popFront();
1278                     break;
1279                 }
1280                 else
1281                     popFrontWhitespaceAware();
1282             }
1283         }
1284         lexStringSuffix(type);
1285         token = Token(type, cache.intern(range.slice(mark)), line, column,
1286             index);
1287     }
1288 
1289     private ubyte lexStringSuffix(ref IdType type) pure nothrow @safe
1290     {
1291         if ((range.index >= range.bytes.length))
1292         {
1293             type = tok!"stringLiteral";
1294             return 0;
1295         }
1296         else
1297         {
1298             switch (range.bytes[range.index])
1299             {
1300             case 'w': range.popFront(); type = tok!"wstringLiteral"; return 'w';
1301             case 'd': range.popFront(); type = tok!"dstringLiteral"; return 'd';
1302             case 'c': range.popFront(); type = tok!"stringLiteral"; return 'c';
1303             default: type = tok!"stringLiteral"; return 0;
1304             }
1305         }
1306     }
1307 
1308     void lexDelimitedString(ref Token token)
1309     {
1310         mixin (tokenStart);
1311         range.index += 2;
1312         range.column += 2;
1313         ubyte open;
1314         ubyte close;
1315         switch (range.bytes[range.index])
1316         {
1317         case '<':
1318             open = '<';
1319             close = '>';
1320             range.popFront();
1321             lexNormalDelimitedString(token, mark, line, column, index, open, close);
1322             break;
1323         case '{':
1324             open = '{';
1325             close = '}';
1326             range.popFront();
1327             lexNormalDelimitedString(token, mark, line, column, index, open, close);
1328             break;
1329         case '[':
1330             open = '[';
1331             close = ']';
1332             range.popFront();
1333             lexNormalDelimitedString(token, mark, line, column, index, open, close);
1334             break;
1335         case '(':
1336             open = '(';
1337             close = ')';
1338             range.popFront();
1339             lexNormalDelimitedString(token, mark, line, column, index, open, close);
1340             break;
1341         default:
1342             lexHeredocString(token, mark, line, column, index);
1343             break;
1344         }
1345     }
1346 
1347     void lexNormalDelimitedString(ref Token token, size_t mark, size_t line, size_t column,
1348         size_t index, ubyte open, ubyte close)
1349     {
1350         int depth = 1;
1351         while (!(range.index >= range.bytes.length) && depth > 0)
1352         {
1353             if (range.bytes[range.index] == open)
1354             {
1355                 depth++;
1356                 range.popFront();
1357             }
1358             else if (range.bytes[range.index] == close)
1359             {
1360                 depth--;
1361                 range.popFront();
1362                 if (depth <= 0)
1363                 {
1364                     if (range.bytes[range.index] == '"')
1365                     {
1366                         range.popFront();
1367                     }
1368                     else
1369                     {
1370                         error("Error: \" expected to end delimited string literal");
1371                         token = Token(tok!"");
1372                         return;
1373                     }
1374                 }
1375             }
1376             else
1377                 popFrontWhitespaceAware();
1378         }
1379         IdType type = tok!"stringLiteral";
1380         lexStringSuffix(type);
1381         token = Token(type, cache.intern(range.slice(mark)), line, column, index);
1382     }
1383 
1384     void lexHeredocString(ref Token token, size_t mark, size_t line, size_t column, size_t index)
1385     {
1386         Token ident;
1387         lexIdentifier(ident);
1388         if (isNewline())
1389             popFrontWhitespaceAware();
1390         else
1391             error("Newline expected");
1392         while (!(range.index >= range.bytes.length))
1393         {
1394             if (isNewline())
1395             {
1396                 popFrontWhitespaceAware();
1397                 if (!range.canPeek(ident.text.length))
1398                 {
1399                     error(ident.text ~ " expected");
1400                     break;
1401                 }
1402                 if (range.peek(ident.text.length - 1) == ident.text)
1403                 {
1404                     range.popFrontN(ident.text.length);
1405                     break;
1406                 }
1407             }
1408             else
1409             {
1410                 range.popFront();
1411             }
1412         }
1413         if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '"')
1414         {
1415             range.popFront();
1416         }
1417         else
1418             error(`" expected`);
1419         IdType type = tok!"stringLiteral";
1420         lexStringSuffix(type);
1421         token = Token(type, cache.intern(range.slice(mark)), line, column, index);
1422     }
1423 
    /**
     * Lexes a token string (`q{ ... }`) by running the lexer's own token
     * stream over its contents and concatenating the token texts.
     *
     * The resulting token text is rebuilt from the inner tokens (prefixed
     * with "q{") and is not a slice of the input. Brace tokens are counted so
     * that nested braces do not end the literal early. An optional string
     * suffix is appended to the rebuilt text.
     */
    void lexTokenString(ref Token token)
    {
        mixin (tokenStart);
        assert (range.bytes[range.index] == 'q');
        range.popFront();
        assert (range.bytes[range.index] == '{');
        range.popFront();
        auto app = appender!string();
        app.put("q{");
        int depth = 1;

        // Whitespace and string source text must be kept while lexing the
        // contents; the previous settings are restored on every exit path.
        immutable WhitespaceBehavior oldWhitespace = config.whitespaceBehavior;
        immutable StringBehavior oldString = config.stringBehavior;
        config.whitespaceBehavior = WhitespaceBehavior.include;
        config.stringBehavior = StringBehavior.source;
        scope (exit)
        {
            config.whitespaceBehavior = oldWhitespace;
            config.stringBehavior = oldString;
        }

        advance(_front);
        while (depth > 0 && !empty)
        {
            auto t = front();
            // Tokens without text are appended via the string form of their
            // type instead.
            if (t.text is null)
                app.put(str(t.type));
            else
                app.put(t.text);
            if (t.type == tok!"}")
            {
                depth--;
                // The brace that closes the literal is not popped.
                // NOTE(review): presumably so that the token following the
                // literal is not lexed here - confirm.
                if (depth > 0)
                popFront();
            }
            else if (t.type == tok!"{")
            {
                depth++;
                popFront();
            }
            else
                popFront();
        }
        IdType type = tok!"stringLiteral";
        auto b = lexStringSuffix(type);
        // A return value of 0 means that no suffix was consumed.
        if (b != 0)
            app.put(b);
        token = Token(type, cache.intern(cast(const(ubyte)[]) app.data), line,
            column, index);
    }
1474 
1475     void lexHexString(ref Token token)
1476     {
1477         mixin (tokenStart);
1478         range.index += 2;
1479         range.column += 2;
1480 
1481         loop: while (true)
1482         {
1483             if ((range.index >= range.bytes.length))
1484             {
1485                 error("Error: unterminated hex string literal");
1486                 token = Token(tok!"");
1487                 return;
1488             }
1489             else if (isWhitespace())
1490                 popFrontWhitespaceAware();
1491             else switch (range.bytes[range.index])
1492             {
1493             case '0': .. case '9':
1494             case 'A': .. case 'F':
1495             case 'a': .. case 'f':
1496                 range.popFront();
1497                 break;
1498             case '"':
1499                 range.popFront();
1500                 break loop;
1501             default:
1502                 error("Error: invalid character in hex string");
1503                 token = Token(tok!"");
1504                 return;
1505             }
1506         }
1507 
1508         IdType type = tok!"stringLiteral";
1509         lexStringSuffix(type);
1510         token = Token(type, cache.intern(range.slice(mark)), line, column,
1511             index);
1512     }
1513 
1514     bool lexEscapeSequence()
1515     {
1516         range.popFront();
1517         if ((range.index >= range.bytes.length))
1518         {
1519             error("Error: non-terminated character escape sequence.");
1520             return false;
1521         }
1522         switch (range.bytes[range.index])
1523         {
1524         case '\'':
1525         case '"':
1526         case '?':
1527         case '\\':
1528         case 'a':
1529         case 'b':
1530         case 'f':
1531         case 'n':
1532         case 'r':
1533         case 't':
1534         case 'v':
1535             range.popFront();
1536             break;
1537         case 'x':
1538             range.popFront();
1539             foreach (i; 0 .. 2)
1540             {
1541                 if ((range.index >= range.bytes.length))
1542                 {
1543                     error("Error: 2 hex digits expected.");
1544                     return false;
1545                 }
1546                 switch (range.bytes[range.index])
1547                 {
1548                 case '0': .. case '9':
1549                 case 'a': .. case 'f':
1550                 case 'A': .. case 'F':
1551                     range.popFront();
1552                     break;
1553                 default:
1554                     error("Error: 2 hex digits expected.");
1555                     return false;
1556                 }
1557             }
1558             break;
1559         case '0':
1560             if (!(range.index + 1 < range.bytes.length) || ((range.index + 1 < range.bytes.length) && range.peekAt(1) == '\''))
1561             {
1562                 range.popFront();
1563                 break;
1564             }
1565             goto case;
1566         case '1': .. case '7':
1567             for (size_t i = 0; i < 3 && !(range.index >= range.bytes.length) && range.bytes[range.index] >= '0' && range.bytes[range.index] <= '7'; i++)
1568                 range.popFront();
1569             break;
1570         case 'u':
1571             range.popFront();
1572             foreach (i; 0 .. 4)
1573             {
1574                 if ((range.index >= range.bytes.length))
1575                 {
1576                     error("Error: at least 4 hex digits expected.");
1577                     return false;
1578                 }
1579                 switch (range.bytes[range.index])
1580                 {
1581                 case '0': .. case '9':
1582                 case 'a': .. case 'f':
1583                 case 'A': .. case 'F':
1584                     range.popFront();
1585                     break;
1586                 default:
1587                     error("Error: at least 4 hex digits expected.");
1588                     return false;
1589                 }
1590             }
1591             break;
1592         case 'U':
1593             range.popFront();
1594             foreach (i; 0 .. 8)
1595             {
1596                 if ((range.index >= range.bytes.length))
1597                 {
1598                     error("Error: at least 8 hex digits expected.");
1599                     return false;
1600                 }
1601                 switch (range.bytes[range.index])
1602                 {
1603                 case '0': .. case '9':
1604                 case 'a': .. case 'f':
1605                 case 'A': .. case 'F':
1606                     range.popFront();
1607                     break;
1608                 default:
1609                     error("Error: at least 8 hex digits expected.");
1610                     return false;
1611                 }
1612             }
1613             break;
1614         default:
1615             while (true)
1616             {
1617                 if ((range.index >= range.bytes.length))
1618                 {
1619                     error("Error: non-terminated character escape sequence.");
1620                     return false;
1621                 }
1622                 if (range.bytes[range.index] == ';')
1623                 {
1624                     range.popFront();
1625                     break;
1626                 }
1627                 else
1628                 {
1629                     range.popFront();
1630                 }
1631             }
1632         }
1633         return true;
1634     }
1635 
1636     void lexCharacterLiteral(ref Token token)
1637     {
1638         mixin (tokenStart);
1639         range.popFront();
1640         if (range.bytes[range.index] == '\\')
1641         {
1642             lexEscapeSequence();
1643             goto close;
1644         }
1645         else if (range.bytes[range.index] == '\'')
1646         {
1647             range.popFront();
1648             token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
1649                 line, column, index);
1650         }
1651         else if (range.bytes[range.index] & 0x80)
1652         {
1653             while (range.bytes[range.index] & 0x80)
1654             {
1655                 range.popFront();
1656             }
1657             goto close;
1658         }
1659         else
1660         {
1661             popFrontWhitespaceAware();
1662             goto close;
1663         }
1664     close:
1665         if (range.bytes[range.index] == '\'')
1666         {
1667             range.popFront();
1668             token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
1669                 line, column, index);
1670         }
1671         else
1672         {
1673             error("Error: Expected ' to end character literal");
1674             token = Token(tok!"");
1675         }
1676     }
1677 
1678     void lexIdentifier(ref Token token) @trusted
1679     {
1680         mixin (tokenStart);
1681         if (isSeparating(0))
1682         {
1683             error("Invalid identifier");
1684             range.popFront();
1685         }
1686         while (true)
1687         {
1688             version (iasm64NotWindows)
1689             {
1690                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1691                 {
1692                     immutable ulong i = rangeMatch!(false, 'a', 'z', 'A', 'Z', '_', '_')
1693                         (range.bytes.ptr + range.index);
1694                     range.column += i;
1695                     range.index += i;
1696                 }
1697             }
1698             if (isSeparating(0))
1699                 break;
1700             else
1701                 range.popFront();
1702         }
1703         token = Token(tok!"identifier", cache.intern(range.slice(mark)), line,
1704             column, index);
1705     }
1706 
1707     void lexDot(ref Token token)
1708     {
1709         mixin (tokenStart);
1710         if (!(range.index + 1 < range.bytes.length))
1711         {
1712             range.popFront();
1713             token = Token(tok!".", null, line, column, index);
1714             return;
1715         }
1716         switch (range.peekAt(1))
1717         {
1718         case '0': .. case '9':
1719             lexNumber(token);
1720             return;
1721         case '.':
1722             range.popFront();
1723             range.popFront();
1724             if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '.')
1725             {
1726                 range.popFront();
1727                 token = Token(tok!"...", null, line, column, index);
1728             }
1729             else
1730                 token = Token(tok!"..", null, line, column, index);
1731             return;
1732         default:
1733             range.popFront();
1734             token = Token(tok!".", null, line, column, index);
1735             return;
1736         }
1737     }
1738 
1739     void lexLongNewline(ref Token token) @nogc
1740     {
1741         mixin (tokenStart);
1742         range.popFront();
1743         range.popFront();
1744         range.popFront();
1745         range.incrementLine();
1746         string text = config.whitespaceBehavior == WhitespaceBehavior.include
1747             ? cache.intern(range.slice(mark)) : "";
1748         token = Token(tok!"whitespace", text, line,
1749             column, index);
1750     }
1751 
1752     bool isNewline() @nogc
1753     {
1754         if (range.bytes[range.index] == '\n') return true;
1755         if (range.bytes[range.index] == '\r') return true;
1756         return (range.bytes[range.index] & 0x80) && (range.index + 2 < range.bytes.length)
1757             && (range.peek(2) == "\u2028" || range.peek(2) == "\u2029");
1758     }
1759 
1760     bool isSeparating(size_t offset) @nogc
1761     {
1762         enum : ubyte
1763         {
1764             n, y, m // no, yes, maybe
1765         }
1766 
1767         if (range.index + offset >= range.bytes.length)
1768             return true;
1769         auto c = range.bytes[range.index + offset];
1770         static immutable ubyte[256] LOOKUP_TABLE = [
1771             y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
1772             y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
1773             y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
1774             n, n, n, n, n, n, n, n, n, n, y, y, y, y, y, y,
1775             y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n,
1776             n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, n,
1777             y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n,
1778             n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, y,
1779             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1780             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1781             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1782             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1783             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1784             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1785             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1786             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m
1787         ];
1788         immutable ubyte result = LOOKUP_TABLE[c];
1789         if (result == n)
1790             return false;
1791         if (result == y)
1792             return true;
1793         if (result == m)
1794         {
1795             auto r = range;
1796             range.popFrontN(offset);
1797             return (r.canPeek(2) && (r.peek(2) == "\u2028"
1798                 || r.peek(2) == "\u2029"));
1799         }
1800         assert (false);
1801     }
1802 
1803 
1804 
    // Code fragment that the lexing functions mix in as their first
    // statement. It captures the position at which the token starts (byte
    // index, column, line) and a mark for `range.slice`, as local variables
    // named `index`, `column`, `line` and `mark`.
    enum tokenStart = q{
        size_t index = range.index;
        size_t column = range.column;
        size_t line = range.line;
        auto mark = range.mark();
    };
1811 
1812     void error(string message)
1813     {
1814         messages ~= Message(range.line, range.column, message, true);
1815     }
1816 
1817     void warning(string message)
1818     {
1819         messages ~= Message(range.line, range.column, message, false);
1820         assert (messages.length > 0);
1821     }
1822 
    /// A diagnostic (error or warning) recorded while lexing.
    static struct Message
    {
        /// Value of `range.line` when the diagnostic was recorded
        size_t line;
        /// Value of `range.column` when the diagnostic was recorded
        size_t column;
        /// Text of the diagnostic
        string message;
        /// `true` for diagnostics recorded by `error`, `false` for those
        /// recorded by `warning`
        bool isError;
    }
1830 
    /// Diagnostics recorded so far, in the order in which they were reported
    Message[] messages;
    /// String cache through which token text is interned
    StringCache* cache;
    /// Lexer configuration (whitespace and string behavior are read from it)
    LexerConfig config;
    /// Gates the accelerated `skip` / `rangeMatch` paths in the lexing
    /// functions (presumably reflects SSE 4.2 support - confirm where it is set)
    bool haveSSE42;
1835 }
1836 
1837 /**
1838  * Creates a token range from the given source code. Creates a default lexer
1839  * configuration and a GC-managed string cache.
1840  */
1841 public auto byToken(ubyte[] range)
1842 {
1843     LexerConfig config;
1844     StringCache* cache = new StringCache(StringCache.defaultBucketCount);
1845     return DLexer(range, config, cache);
1846 }
1847 
1848 /**
1849  * Creates a token range from the given source code. Uses the given string
1850  * cache.
1851  */
1852 public auto byToken(ubyte[] range, StringCache* cache)
1853 {
1854     LexerConfig config;
1855     return DLexer(range, config, cache);
1856 }
1857 
1858 /**
1859  * Creates a token range from the given source code. Uses the provided lexer
1860  * configuration and string cache.
1861  */
1862 public auto byToken(ubyte[] range, const LexerConfig config, StringCache* cache)
1863 {
1864     return DLexer(range, config, cache);
1865 }
1866 
/**
 * Removes "decoration" such as leading whitespace, leading + and * characters,
 * and places the result into the given output range.
 *
 * Three comment flavours are recognised by their first three characters:
 * $(UL
 *   $(LI $(D ///) line comments: the leading slashes and the blanks after them
 *        are dropped from every line; lines are re-joined with $(D '\n').)
 *   $(LI $(D /**) and $(D /++) block comments: the opening and closing runs of
 *        $(D *) / $(D +) are dropped, and on following lines an optional
 *        leading $(D *) / $(D +) plus the indentation measured on the second
 *        line is removed.)
 *   $(LI anything else is copied to the output range unchanged.)
 * )
 *
 * Params:
 *     comment     = the raw comment text, including its delimiters. Must be at
 *                   least three characters long.
 *     outputRange = receives the undecorated text as a sequence of slices of
 *                   $(D comment) (plus single $(D '\n') characters).
 */
public void unDecorateComment(T)(string comment, auto ref T outputRange)
    if (isOutputRange!(T, string))
in
{
    assert (comment.length >= 3);
}
body
{
    switch (comment[0 .. 3])
    {
    case "///":
        size_t i = 3;
        if (i < comment.length)
        {
        again:
            // Drop the blanks between the slashes and the text of this line.
            while (i < comment.length && (comment[i] == ' ' || comment[i] == '\t'))
                i++;
            // Scan from i itself (not i + 1): an empty or blank-only line has
            // its '\n' exactly at i, and starting one past it would glue all
            // remaining lines, decoration included, onto this one.
            size_t j = i;
            while (j < comment.length)
            {
                // A '\r' is stepped over so that "\r\n" is detected as one
                // line break; the '\r' itself stays part of the emitted slice.
                if (comment[j] == '\r')
                    j++;
                if (j >= comment.length)
                    break;
                if (comment[j] == '\n')
                {
                    outputRange.put(comment[i .. j]);
                    j++;
                    // Strip the slashes that open the next line.
                    while (j < comment.length && comment[j] == '/')
                        j++;
                    outputRange.put('\n');
                    i = j;
                    goto again;
                }
                j++;
            }
            // Last (or only) line: no terminating newline was found.
            if (i < comment.length && j <= comment.length)
                outputRange.put(comment[i .. j]);
        }
        break;
    case "/++":
    case "/**":
        size_t i = 3;
        // The decoration character of this comment style: '*' or '+'.
        immutable char c = comment[1];
        // Skip leading * and + characters. Every read of comment[i] below is
        // bounds-checked so that a truncated comment (e.g. just "/**") yields
        // empty output instead of a range violation.
        while (i < comment.length && comment[i] == c) i++;
        // Skip trailing * and + characters (j starts on the char before '/').
        size_t j = comment.length - 2;
        while (j > i && comment[j] == c)
            j--;
        while (j > i && (comment[j] == ' ' || comment[j] == '\t'))
            j--;
        // j becomes the exclusive end of the comment's payload.
        j++;
        // First line: emitted verbatim up to and including its newline.
        size_t k = i;
        while (k < j)
        {
            if (comment[k] == '\n')
            {
                k++;
                break;
            }
            k++;
        }
        outputRange.put(comment[i .. k]);
        i = k;
        if (i < comment.length && comment[i] == '\r') i++;
        if (i < comment.length && comment[i] == '\n') i++;
        // Second line: learn the decoration pattern used by the comment body,
        // i.e. whether lines start with the decoration character and how much
        // whitespace follows it.
        while (i < comment.length && (comment[i] == ' ' || comment[i] == '\t')) i++;
        immutable bool skipBeginningChar = i < comment.length && comment[i] == c;
        if (skipBeginningChar)
            i++;
        size_t whitespaceToSkip;
        while (i < comment.length && (comment[i] == ' ' || comment[i] == '\t'))
        {
            whitespaceToSkip++;
            i++;
        }
        size_t l = i;
        while (i < j)
        {
            if (comment[i++] == '\n')
                break;
        }
        outputRange.put(comment[l .. i]);
        // Remaining lines: strip the same decoration from each of them.
        while (true)
        {
            if (skipBeginningChar)
            {
                while (i < j && (comment[i] == ' ' || comment[i] == '\t')) i++;
                if (i < j && comment[i] == c) i++;
            }
            // Remove at most as much indentation as the second line had.
            for (size_t s = 0; (i < j) && (s < whitespaceToSkip)
                && (comment[i] == ' ' || comment[i] == '\t');)
            {
                s++;
                i++;
            }
            k = i;
            inner: while (k < j)
            {
                if (comment[k] == '\n')
                {
                    k++;
                    break inner;
                }
                k++;
            }
            outputRange.put(comment[i .. k]);
            i = k;
            if (i >= j)
                break;
        }
        break;
    default:
        // Not a documentation comment: pass it through untouched.
        outputRange.put(comment);
        break;
    }
}
1989 
1990 
/**
 * The string cache is used for string interning.
 *
 * It will only store a single copy of any string that it is asked to hold.
 * Interned strings can be compared for equality by comparing their $(B .ptr)
 * field.
 *
 * Default and postblit constructors are disabled. The destructor is currently
 * compiled out ($(D version(none))), so nothing in this struct releases the
 * memory it allocates; interned strings are never freed by the cache.
 *
 * See_also: $(LINK http://en.wikipedia.org/wiki/String_interning)
 */
struct StringCache
{
public pure nothrow @nogc:

    @disable this();
    @disable this(this);

    /**
     * Params: bucketCount = the number of hash buckets. Must be a
     * power of two, because bucket indices are computed by masking the hash
     * with $(D bucketCount - 1).
     */
    this(size_t bucketCount) nothrow @trusted @nogc
    in
    {
        // A power of two is non-zero and has exactly one bit set.
        assert (bucketCount != 0 && (bucketCount & (bucketCount - 1)) == 0);
    }
    body
    {
        // calloc zero-fills, so every bucket starts out as an empty chain.
        void* table = enforceAllocated(calloc((Node*).sizeof, bucketCount));
        buckets = (cast(Node**) table)[0 .. bucketCount];
    }

    // Disabled: releases every block, every node (and separately allocated
    // big strings) and finally the bucket table.
    version(none)
    ~this()
    {
        Block* current = rootBlock;
        while (current !is null)
        {
            Block* prev = current;
            current = current.next;
            free(cast(void*) prev);
        }
        foreach (nodePointer; buckets)
        {
            Node* currentNode = nodePointer;
            while (currentNode !is null)
            {
                if (currentNode.mallocated)
                    free(currentNode.str.ptr);
                Node* prev = currentNode;
                currentNode = currentNode.next;
                free(prev);
            }
        }
        rootBlock = null;
        free(buckets.ptr);
        buckets = null;
    }

    /**
     * Caches a string.
     *
     * Params: str = the bytes to intern
     * Returns: the cache's own copy of $(D str); the empty string literal when
     *     $(D str) is null or empty.
     */
    string intern(const(ubyte)[] str) @safe
    {
        if (str is null || str.length == 0)
            return "";
        return _intern(str);
    }

    /**
     * Caches a string. Forwards to the $(D const(ubyte)[]) overload.
     */
    string intern(string str) @trusted
    {
        // Reinterpret the characters as bytes without dropping constness.
        return intern(cast(const(ubyte)[]) str);
    }

    /**
     * The default bucket count for the string cache.
     */
    static enum defaultBucketCount = 4096;

private:

    /// Looks up `bytes`; on a miss copies them into cache-owned memory and
    /// links a new node at the head of the matching bucket chain.
    string _intern(const(ubyte)[] bytes) @trusted
    {
        immutable uint hash = hashBytes(bytes);
        immutable size_t index = hash & (buckets.length - 1);
        Node* s = find(bytes, hash);
        if (s !is null)
            return cast(string) s.str;
        ubyte[] mem = void;
        // Big strings get their own allocation instead of space in a block.
        immutable bool mallocated = bytes.length > BIG_STRING;
        if (mallocated)
            mem = (cast(ubyte*) enforceAllocated(malloc(bytes.length)))[0 .. bytes.length];
        else
            mem = allocate(bytes.length);
        mem[] = bytes[];
        Node* node = cast(Node*) enforceAllocated(malloc(Node.sizeof));
        node.str = mem;
        node.hash = hash;
        node.next = buckets[index];
        node.mallocated = mallocated;
        buckets[index] = node;
        return cast(string) mem;
    }

    /// Walks the bucket chain selected by `hash`.
    /// Returns: the node whose hash and contents equal the arguments, or null.
    Node* find(const(ubyte)[] bytes, uint hash) @trusted
    {
        immutable size_t index = hash & (buckets.length - 1);
        Node* node = buckets[index];
        while (node !is null)
        {
            // Compare the cheap hash first, the contents only on a hash match.
            if (node.hash == hash && bytes == node.str)
                return node;
            node = node.next;
        }
        return node;
    }

    /// Hashes `data` four bytes at a time using the MurmurHash2 mixing steps
    /// (multiplier 0x5bd1e995, shift 24), seeded with the data length.
    static uint hashBytes(const(ubyte)[] data) pure nothrow @trusted @nogc
    in
    {
        assert (data !is null);
        assert (data.length > 0);
    }
    body
    {
        immutable uint m = 0x5bd1e995;
        immutable int r = 24;
        uint h = cast(uint) data.length;
        while (data.length >= 4)
        {
            // Assemble the word byte by byte (lowest address = lowest byte).
            uint k = (cast(ubyte) data[3]) << 24
                | (cast(ubyte) data[2]) << 16
                | (cast(ubyte) data[1]) << 8
                | (cast(ubyte) data[0]);
            k *= m;
            k ^= k >> r;
            k *= m;
            h *= m;
            h ^= k;
            data = data[4 .. $];
        }
        // Fold in the one to three bytes that did not fill a whole word.
        switch (data.length & 3)
        {
        case 3:
            h ^= data[2] << 16;
            goto case;
        case 2:
            h ^= data[1] << 8;
            goto case;
        case 1:
            h ^= data[0];
            h *= m;
            break;
        default:
            break;
        }
        // Final avalanche.
        h ^= h >> 13;
        h *= m;
        h ^= h >> 15;
        return h;
    }

    /// Hands out `numBytes` bytes of block storage. Only the first four blocks
    /// of the list are probed for free space; if none fits, a fresh block is
    /// allocated and becomes the new list head.
    ubyte[] allocate(size_t numBytes) pure nothrow @trusted @nogc
    in
    {
        assert (numBytes != 0);
    }
    out (result)
    {
        assert (result.length == numBytes);
    }
    body
    {
        Block* r = rootBlock;
        size_t i = 0;
        while  (i <= 3 && r !is null)
        {
            immutable size_t available = r.bytes.length;
            immutable size_t oldUsed = r.used;
            immutable size_t newUsed = oldUsed + numBytes;
            if (newUsed <= available)
            {
                r.used = newUsed;
                return r.bytes[oldUsed .. newUsed];
            }
            i++;
            r = r.next;
        }
        Block* b = cast(Block*) enforceAllocated(calloc(Block.sizeof, 1));
        b.used = numBytes;
        b.next = rootBlock;
        rootBlock = b;
        return b.bytes[0 .. numBytes];
    }

    /// Returns `p` unchanged, or reports an out-of-memory error through the
    /// runtime when a C allocation returned null.
    static void* enforceAllocated(void* p) pure nothrow @trusted @nogc
    {
        import core.exception : onOutOfMemoryError;
        if (p is null)
            onOutOfMemoryError();
        return p;
    }

    /// One interned string, chained with the other entries of its bucket.
    static struct Node
    {
        ubyte[] str = void;
        Node* next = void;
        uint hash = void;
        // True when `str` was allocated on its own rather than inside a Block.
        bool mallocated = void;
    }

    /// A fixed-size chunk of storage for small strings; `used` is the number
    /// of bytes of `bytes` already handed out.
    static struct Block
    {
        Block* next;
        size_t used;
        enum BLOCK_CAPACITY = BLOCK_SIZE - size_t.sizeof - (void*).sizeof;
        ubyte[BLOCK_CAPACITY] bytes;
    }

    static assert (BLOCK_SIZE == Block.sizeof);

    enum BLOCK_SIZE = 1024 * 16;

    // If a string would take up more than 1/4 of a block, allocate it outside
    // of the block.
    enum BIG_STRING = BLOCK_SIZE / 4;

    Node*[] buckets;
    Block* rootBlock;
}
2229 
// C allocator entry points, declared by hand with `pure nothrow @nogc @trusted`
// so that StringCache's attributed member functions are able to call them.
private extern(C) nothrow pure @nogc @trusted
{
    void* calloc(size_t, size_t);
    void* malloc(size_t);
    void free(void*);
}
2233 
// Checks that a plain import declaration lexes to the expected token types.
unittest
{
    // The token string ends at its matching '}'; the `c` postfix marks it as a
    // char string. (A second '}' here would close the unittest body early.)
    auto source = cast(ubyte[]) q{ import std.stdio;}c;
    auto tokens = getTokensForParser(source, LexerConfig(),
        new StringCache(StringCache.defaultBucketCount));
    assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".",
        tok!"identifier", tok!";"]));
}
2242 
/// Test \x char sequence
unittest
{
    // Lexes the given snippet and returns the resulting lexer.
    static auto lex(string text)
    {
        return byToken(cast(ubyte[]) text);
    }

    // First diagnostic recorded while lexing the given snippet.
    static auto firstMessage(string text)
    {
        return lex(text).messages[0];
    }

    // valid: every two-digit combination of hex digits, in both letter cases
    immutable string hexDigits = "0123456789abcdefABCDEF";
    string allEscapes;
    foreach (char high; hexDigits)
        foreach (char low; hexDigits)
            allEscapes ~= "'\\x" ~ high ~ low ~ "'";
    assert (lex(allEscapes).filter!(t => t.type != tok!"characterLiteral").empty);

    // invalid: the error is reported at the column of the first bad character
    enum expectedText = "Error: 2 hex digits expected.";
    assert (firstMessage(`'\x'`) == DLexer.Message(1, 4, expectedText, true));
    assert (firstMessage(`'\x_'`) == DLexer.Message(1, 4, expectedText, true));
    assert (firstMessage(`'\xA'`) == DLexer.Message(1, 5, expectedText, true));
    assert (firstMessage(`'\xAY'`) == DLexer.Message(1, 5, expectedText, true));
    assert (firstMessage(`'\xXX'`) == DLexer.Message(1, 4, expectedText, true));
}
2263 
2264 version (iasm64NotWindows)
2265 {
2266     /**
2267      * Returns:
2268      */
2269     ushort newlineMask(const ubyte*) pure nothrow @trusted @nogc
2270     {
2271         asm pure nothrow @nogc
2272         {
2273             naked;
2274             movdqu XMM1, [RDI];
2275             mov RAX, 3;
2276             mov RDX, 16;
2277             mov R8, 0x0d0d0d0d0d0d0d0dL;
2278             movq XMM2, R8;
2279             shufpd XMM2, XMM2, 0;
2280             pcmpeqb XMM2, XMM1;
2281             mov R9, 0x0a0a0a0a0a0a0a0aL;
2282             movq XMM3, R9;
2283             shufpd XMM3, XMM3, 0;
2284             pcmpeqb XMM3, XMM1;
2285             mov R10, 0xe280a8L;
2286             movq XMM4, R10;
2287             pcmpestrm XMM4, XMM1, 0b01001100;
2288             movdqa XMM4, XMM0;
2289             mov R11, 0xe280a9L;
2290             movq XMM5, R11;
2291             pcmpestrm XMM5, XMM1, 0b01001100;
2292             movdqa XMM5, XMM0;
2293             mov RCX, 0x0a0d;
2294             dec RAX;
2295             movq XMM6, RCX;
2296             pcmpestrm XMM6, XMM1, 0b01001100;
2297             movdqa XMM6, XMM0;
2298             movdqa XMM7, XMM6;
2299             pslldq XMM7, 1;
2300             movdqa XMM0, XMM4;
2301             por XMM0, XMM5;
2302             por XMM7, XMM6;
2303             movdqa XMM1, XMM2;
2304             por XMM1, XMM3;
2305             pxor XMM7, XMM1;
2306             por XMM7, XMM0;
2307             por XMM7, XMM6;
2308             pmovmskb RAX, XMM7;
2309             and RAX, 0b0011_1111_1111_1111;
2310             ret;
2311         }
2312     }
2313 
2314     /**
2315      * Skips between 0 and 16 bytes that match (or do not match) one of the
2316      * given $(B chars).
2317      */
2318     void skip(bool matching, chars...)(const ubyte*, ulong*, ulong*) pure nothrow
2319         @trusted @nogc if (chars.length <= 8)
2320     {
2321         enum constant = ByteCombine!chars;
2322         enum charsLength = chars.length;
2323         static if (matching)
2324             enum flags = 0b0001_0000;
2325         else
2326             enum flags = 0b0000_0000;
2327         asm pure nothrow @nogc
2328         {
2329             naked;
2330             movdqu XMM1, [RDX];
2331             mov R10, constant;
2332             movq XMM2, R10;
2333             mov RAX, charsLength;
2334             mov RDX, 16;
2335             pcmpestri XMM2, XMM1, flags;
2336             add [RSI], RCX;
2337             add [RDI], RCX;
2338             ret;
2339         }
2340     }
2341 
2342     /**
2343      * Returns: the number of bytes starting at the given location that match
2344      *     (or do not match if $(B invert) is true) the byte ranges in $(B chars).
2345      */
2346     ulong rangeMatch(bool invert, chars...)(const ubyte*) pure nothrow @trusted @nogc
2347     {
2348         static assert (chars.length % 2 == 0);
2349         enum constant = ByteCombine!chars;
2350         static if (invert)
2351             enum rangeMatchFlags = 0b0000_0100;
2352         else
2353             enum rangeMatchFlags = 0b0001_0100;
2354         enum charsLength = chars.length;
2355         asm pure nothrow @nogc
2356         {
2357             naked;
2358             movdqu XMM1, [RDI];
2359             mov R10, constant;
2360             movq XMM2, R10;
2361             mov RAX, charsLength;
2362             mov RDX, 16;
2363             pcmpestri XMM2, XMM1, rangeMatchFlags;
2364             mov RAX, RCX;
2365             ret;
2366         }
2367     }
2368 
2369     template ByteCombine(c...)
2370     {
2371         static assert (c.length <= 8);
2372         static if (c.length > 1)
2373             enum ulong ByteCombine = c[0] | (ByteCombine!(c[1..$]) << 8);
2374         else
2375             enum ulong ByteCombine = c[0];
2376     }
2377 }