Oleksandr Smirnov
Oleksandr Smirnov
olexsmir@gmail.com refactor: use an actual parser instead of reflection..., 11 days ago
olexsmir@gmail.com refactor: use an actual parser instead of reflection..., 11 days ago
| 1 | package json2go |
| 2 | |
| 3 | import ( |
| 4 | "unicode/utf8" |
| 5 | "unsafe" |
| 6 | ) |
| 7 | |
// Lexer scans a byte buffer one rune at a time and keeps track of where
// the cursor is, both as byte offsets and as a line/column pair.
type Lexer struct {
	// input is the buffer being scanned.
	input []byte

	// ch is the rune currently under the cursor; 0 stands for end of input.
	ch rune

	// chSize is the number of bytes of input occupied by [ch].
	chSize int

	// pos is the byte offset at which [ch] starts.
	pos int

	// rpos is the byte offset where the next rune will be decoded from,
	// i.e. [pos] plus [chSize].
	rpos int

	// col is the column of [ch], counted in runes starting at 1. It is
	// reset to 0 while [ch] is a line-break character.
	col int

	// line is the line of [ch], starting at 1.
	line int
}
| 17 | |
| 18 | func NewLexer(input []byte) *Lexer { |
| 19 | l := &Lexer{input: input, line: 1} |
| 20 | l.advance() |
| 21 | if l.ch == '\uFEFF' { // start of the input |
| 22 | l.advance() |
| 23 | } |
| 24 | return l |
| 25 | } |
| 26 | |
| 27 | // Next returns the next token from the input. |
| 28 | // Returns EOF when input is exhausted. |
| 29 | func (l *Lexer) Next() Token { |
| 30 | switch { |
| 31 | case l.ch == 0: |
| 32 | return Token{EOF, ""} |
| 33 | case l.ch == '\n', l.ch == '\r': |
| 34 | l.advance() |
| 35 | return Token{NEWLINE, "\n"} |
| 36 | case l.ch == ' ', l.ch == '\t': |
| 37 | offset := l.pos |
| 38 | for l.ch == ' ' || l.ch == '\t' { |
| 39 | l.advance() |
| 40 | } |
| 41 | return Token{INDENT, sliceString(l.input, offset, l.pos)} |
| 42 | case l.ch == '/': |
| 43 | return l.lexComment() |
| 44 | case l.ch == '"': |
| 45 | return l.lexString() |
| 46 | case l.ch == ':': |
| 47 | l.advance() |
| 48 | return Token{COLON, ":"} |
| 49 | case l.ch == ',': |
| 50 | l.advance() |
| 51 | return Token{COMMA, ","} |
| 52 | case l.ch == '[': |
| 53 | l.advance() |
| 54 | return Token{LBRACKET, "["} |
| 55 | case l.ch == ']': |
| 56 | l.advance() |
| 57 | return Token{RBRACKET, "]"} |
| 58 | case l.ch == '{': |
| 59 | l.advance() |
| 60 | return Token{LBRACE, "{"} |
| 61 | case l.ch == '}': |
| 62 | l.advance() |
| 63 | return Token{RBRACE, "}"} |
| 64 | case l.isDigit(), l.ch == '-': |
| 65 | return l.lexNumber() |
| 66 | case l.isAlpha(): |
| 67 | offset := l.pos |
| 68 | for l.isAlpha() { |
| 69 | l.advance() |
| 70 | } |
| 71 | lit := sliceString(l.input, offset, l.pos) |
| 72 | kind := ILLEGAL |
| 73 | switch lit { |
| 74 | case "false", "true": |
| 75 | kind = BOOL |
| 76 | case "null": |
| 77 | kind = NULL |
| 78 | } |
| 79 | return Token{kind, lit} |
| 80 | } |
| 81 | ch := l.ch |
| 82 | l.advance() |
| 83 | return Token{ILLEGAL, string(ch)} |
| 84 | } |
| 85 | |
| 86 | func (l *Lexer) lexString() Token { |
| 87 | l.advance() |
| 88 | offset := l.pos |
| 89 | for { |
| 90 | switch l.ch { |
| 91 | default: |
| 92 | l.advance() |
| 93 | case 0, '\r', '\n': |
| 94 | return Token{ILLEGAL, "unterminated string"} |
| 95 | case '\\': |
| 96 | l.advance() // consume '\' |
| 97 | switch l.ch { |
| 98 | case '"', '\\', '/', 'b', 'f', 'n', 'r', 't': |
| 99 | l.advance() |
| 100 | case 'u': |
| 101 | l.advance() |
| 102 | for range 4 { // expect exactly 4 hex digits |
| 103 | if !l.isHex() { |
| 104 | return Token{ILLEGAL, "invalid unicode escape"} |
| 105 | } |
| 106 | l.advance() |
| 107 | } |
| 108 | default: |
| 109 | return Token{ILLEGAL, "invalid escape sequence"} |
| 110 | } |
| 111 | case '"': |
| 112 | lit := sliceString(l.input, offset, l.pos) |
| 113 | l.advance() // consume closing '"' |
| 114 | return Token{STRING, lit} |
| 115 | } |
| 116 | } |
| 117 | } |
| 118 | |
| 119 | func (l *Lexer) lexNumber() Token { |
| 120 | offset := l.pos |
| 121 | |
| 122 | if l.ch == '-' { // optional leading minus |
| 123 | l.advance() |
| 124 | } |
| 125 | |
| 126 | // integer part |
| 127 | if l.ch == '0' { |
| 128 | l.advance() |
| 129 | if l.isDigit() { // leading zero must not be followed by another digit |
| 130 | return Token{ILLEGAL, "leading zero in number"} |
| 131 | } |
| 132 | } else if l.isDigit() { |
| 133 | for l.isDigit() { |
| 134 | l.advance() |
| 135 | } |
| 136 | } else { |
| 137 | return Token{ILLEGAL, "invalid number"} |
| 138 | } |
| 139 | |
| 140 | kind := NUMBER |
| 141 | if l.ch == '.' { // optional fractional part |
| 142 | kind = DECIMAL |
| 143 | l.advance() |
| 144 | if !l.isDigit() { |
| 145 | return Token{ILLEGAL, "expected digit after decimal point"} |
| 146 | } |
| 147 | for l.isDigit() { |
| 148 | l.advance() |
| 149 | } |
| 150 | } |
| 151 | |
| 152 | if l.ch == 'e' || l.ch == 'E' { // optional exponent |
| 153 | kind = DECIMAL |
| 154 | l.advance() |
| 155 | if l.ch == '+' || l.ch == '-' { |
| 156 | l.advance() |
| 157 | } |
| 158 | if !l.isDigit() { |
| 159 | return Token{ILLEGAL, "expected digit in exponent"} |
| 160 | } |
| 161 | for l.isDigit() { |
| 162 | l.advance() |
| 163 | } |
| 164 | } |
| 165 | |
| 166 | return Token{kind, sliceString(l.input, offset, l.pos)} |
| 167 | } |
| 168 | |
| 169 | func (l *Lexer) lexComment() Token { |
| 170 | l.advance() |
| 171 | switch l.ch { |
| 172 | default: |
| 173 | return Token{ILLEGAL, "invalid comment"} |
| 174 | case '/': |
| 175 | l.advance() |
| 176 | offset := l.pos |
| 177 | for l.ch != 0 && l.ch != '\n' && l.ch != '\r' { |
| 178 | l.advance() |
| 179 | } |
| 180 | return Token{COMMENTLINE, sliceString(l.input, offset, l.pos)} |
| 181 | case '*': |
| 182 | l.advance() |
| 183 | offset := l.pos |
| 184 | for { |
| 185 | if l.ch == 0 { |
| 186 | return Token{ILLEGAL, "unterminated block comment"} |
| 187 | } |
| 188 | if l.ch == '*' { |
| 189 | l.advance() |
| 190 | if l.ch == '/' { |
| 191 | end := l.pos - 1 // exclude the '*' |
| 192 | l.advance() |
| 193 | return Token{COMMENTBLOCK, sliceString(l.input, offset, end)} |
| 194 | } |
| 195 | } else { |
| 196 | l.advance() |
| 197 | } |
| 198 | } |
| 199 | } |
| 200 | } |
| 201 | |
| 202 | func (l *Lexer) advance() { |
| 203 | if l.rpos >= len(l.input) { |
| 204 | l.ch = 0 |
| 205 | l.chSize = 0 |
| 206 | } else { |
| 207 | l.ch, l.chSize = utf8.DecodeRune(l.input[l.rpos:]) |
| 208 | } |
| 209 | l.pos = l.rpos |
| 210 | l.rpos += l.chSize |
| 211 | if l.ch == '\n' || l.ch == '\r' { |
| 212 | l.line++ |
| 213 | l.col = 0 |
| 214 | } else { |
| 215 | l.col++ |
| 216 | } |
| 217 | } |
| 218 | func (l *Lexer) isDigit() bool { return l.ch >= '0' && l.ch <= '9' } |
| 219 | func (l *Lexer) isAlpha() bool { |
| 220 | return (l.ch >= 'a' && l.ch <= 'z') || (l.ch >= 'A' && l.ch <= 'Z') |
| 221 | } |
| 222 | |
| 223 | func (l *Lexer) isHex() bool { |
| 224 | return (l.ch >= '0' && l.ch <= '9') || |
| 225 | (l.ch >= 'a' && l.ch <= 'f') || (l.ch >= 'A' && l.ch <= 'F') |
| 226 | } |
| 227 | |
// sliceString returns b[start:end] as a string without copying.
//
// The returned string shares memory with b, so b must not be modified for
// as long as the string is in use. An empty string is returned when
// start >= end. Offsets outside b cause a run-time panic rather than a
// string that reads past the end of the slice.
func sliceString(b []byte, start, end int) string {
	if start >= end {
		return ""
	}
	// Cap the slice at len(b) first: a plain b[start:end] is checked against
	// cap(b), which would still let end run past the visible bytes.
	s := b[:len(b):len(b)][start:end]
	return unsafe.String(unsafe.SliceData(s), len(s))
}