all repos

json2go @ fc9c7dff8393608d5d61f0e8f5274ad0904c93c4

Convert JSON to Go type annotations

json2go/lexer.go (view raw)

Oleksandr Smirnov
olexsmir@gmail.com
refactor: use an actual parser instead of reflection..., 11 days ago
1
package json2go
2
3
import (
4
	"unicode/utf8"
5
	"unsafe"
6
)
7
8
// Lexer walks a byte slice one rune at a time and remembers where it is.
type Lexer struct {
	input []byte // bytes being scanned

	ch     rune // rune under the cursor; 0 marks the end of the input
	chSize int  // width of ch in bytes
	pos    int  // byte offset at which ch starts
	rpos   int  // byte offset of the rune following ch (pos + chSize)

	col  int // column counter, 1 for the first rune of a line
	line int // line counter, starting at 1
}
17
18
// NewLexer returns a Lexer positioned on the first rune of input.
//
// A leading byte order mark (U+FEFF) is skipped and does not count towards
// the column, so the first real rune is at column 1 with or without a BOM.
//
// The lexer does not copy input: token literals are built by sliceString and
// share memory with it, so input must not be modified while the lexer or any
// of its tokens are still in use.
func NewLexer(input []byte) *Lexer {
	l := &Lexer{input: input, line: 1}
	l.advance()
	if l.ch == '\uFEFF' { // a BOM is only recognised at the very start
		// The BOM is invisible; undo the column it consumed before moving on.
		l.col = 0
		l.advance()
	}
	return l
}
26
27
// Next returns the next token from the input.
28
// Returns EOF when input is exhausted.
29
func (l *Lexer) Next() Token {
30
	switch {
31
	case l.ch == 0:
32
		return Token{EOF, ""}
33
	case l.ch == '\n', l.ch == '\r':
34
		l.advance()
35
		return Token{NEWLINE, "\n"}
36
	case l.ch == ' ', l.ch == '\t':
37
		offset := l.pos
38
		for l.ch == ' ' || l.ch == '\t' {
39
			l.advance()
40
		}
41
		return Token{INDENT, sliceString(l.input, offset, l.pos)}
42
	case l.ch == '/':
43
		return l.lexComment()
44
	case l.ch == '"':
45
		return l.lexString()
46
	case l.ch == ':':
47
		l.advance()
48
		return Token{COLON, ":"}
49
	case l.ch == ',':
50
		l.advance()
51
		return Token{COMMA, ","}
52
	case l.ch == '[':
53
		l.advance()
54
		return Token{LBRACKET, "["}
55
	case l.ch == ']':
56
		l.advance()
57
		return Token{RBRACKET, "]"}
58
	case l.ch == '{':
59
		l.advance()
60
		return Token{LBRACE, "{"}
61
	case l.ch == '}':
62
		l.advance()
63
		return Token{RBRACE, "}"}
64
	case l.isDigit(), l.ch == '-':
65
		return l.lexNumber()
66
	case l.isAlpha():
67
		offset := l.pos
68
		for l.isAlpha() {
69
			l.advance()
70
		}
71
		lit := sliceString(l.input, offset, l.pos)
72
		kind := ILLEGAL
73
		switch lit {
74
		case "false", "true":
75
			kind = BOOL
76
		case "null":
77
			kind = NULL
78
		}
79
		return Token{kind, lit}
80
	}
81
	ch := l.ch
82
	l.advance()
83
	return Token{ILLEGAL, string(ch)}
84
}
85
86
func (l *Lexer) lexString() Token {
87
	l.advance()
88
	offset := l.pos
89
	for {
90
		switch l.ch {
91
		default:
92
			l.advance()
93
		case 0, '\r', '\n':
94
			return Token{ILLEGAL, "unterminated string"}
95
		case '\\':
96
			l.advance() // consume '\'
97
			switch l.ch {
98
			case '"', '\\', '/', 'b', 'f', 'n', 'r', 't':
99
				l.advance()
100
			case 'u':
101
				l.advance()
102
				for range 4 { // expect exactly 4 hex digits
103
					if !l.isHex() {
104
						return Token{ILLEGAL, "invalid unicode escape"}
105
					}
106
					l.advance()
107
				}
108
			default:
109
				return Token{ILLEGAL, "invalid escape sequence"}
110
			}
111
		case '"':
112
			lit := sliceString(l.input, offset, l.pos)
113
			l.advance() // consume closing '"'
114
			return Token{STRING, lit}
115
		}
116
	}
117
}
118
119
// lexNumber scans a JSON number starting at l.ch (a digit or '-').
//
// Plain integers come back as NUMBER; anything with a fraction or an exponent
// comes back as DECIMAL. The literal is the exact source text of the number.
// Malformed numbers yield an ILLEGAL token describing what was wrong.
func (l *Lexer) lexNumber() Token {
	start := l.pos

	// skipDigits consumes a (possibly empty) run of decimal digits.
	skipDigits := func() {
		for l.isDigit() {
			l.advance()
		}
	}

	if l.ch == '-' { // sign
		l.advance()
	}

	// Integer part: either a lone 0 or a run of digits.
	switch {
	case l.ch == '0':
		l.advance()
		if l.isDigit() {
			return Token{ILLEGAL, "leading zero in number"}
		}
	case l.isDigit():
		skipDigits()
	default:
		return Token{ILLEGAL, "invalid number"}
	}

	isDecimal := false

	if l.ch == '.' { // fraction
		isDecimal = true
		l.advance()
		if !l.isDigit() {
			return Token{ILLEGAL, "expected digit after decimal point"}
		}
		skipDigits()
	}

	if l.ch == 'e' || l.ch == 'E' { // exponent
		isDecimal = true
		l.advance()
		if l.ch == '+' || l.ch == '-' {
			l.advance()
		}
		if !l.isDigit() {
			return Token{ILLEGAL, "expected digit in exponent"}
		}
		skipDigits()
	}

	lit := sliceString(l.input, start, l.pos)
	if isDecimal {
		return Token{DECIMAL, lit}
	}
	return Token{NUMBER, lit}
}
168
169
func (l *Lexer) lexComment() Token {
170
	l.advance()
171
	switch l.ch {
172
	default:
173
		return Token{ILLEGAL, "invalid comment"}
174
	case '/':
175
		l.advance()
176
		offset := l.pos
177
		for l.ch != 0 && l.ch != '\n' && l.ch != '\r' {
178
			l.advance()
179
		}
180
		return Token{COMMENTLINE, sliceString(l.input, offset, l.pos)}
181
	case '*':
182
		l.advance()
183
		offset := l.pos
184
		for {
185
			if l.ch == 0 {
186
				return Token{ILLEGAL, "unterminated block comment"}
187
			}
188
			if l.ch == '*' {
189
				l.advance()
190
				if l.ch == '/' {
191
					end := l.pos - 1 // exclude the '*'
192
					l.advance()
193
					return Token{COMMENTBLOCK, sliceString(l.input, offset, end)}
194
				}
195
			} else {
196
				l.advance()
197
			}
198
		}
199
	}
200
}
201
202
// advance moves the cursor to the next rune of the input and updates the
// line/column counters.
//
// At the end of the input l.ch becomes 0 with a width of 0, so further calls
// leave the byte offsets where they are. Bytes that are not valid UTF-8 are
// decoded by utf8.DecodeRune as utf8.RuneError with a width of 1, which keeps
// the cursor moving.
//
// The line counter is bumped once per line break: for "\n", for a lone "\r",
// and once (on the "\n") for a "\r\n" pair.
func (l *Lexer) advance() {
	if l.rpos >= len(l.input) {
		l.ch, l.chSize = 0, 0
	} else {
		l.ch, l.chSize = utf8.DecodeRune(l.input[l.rpos:])
	}
	l.pos = l.rpos
	l.rpos += l.chSize

	// A '\r' immediately followed by '\n' is the first half of a CRLF pair;
	// the '\n' will account for the line break, so do not count it here too.
	crlfHead := l.ch == '\r' && l.rpos < len(l.input) && l.input[l.rpos] == '\n'
	if (l.ch == '\n' || l.ch == '\r') && !crlfHead {
		l.line++
		l.col = 0
	} else {
		l.col++
	}
}
218
// isDigit reports whether the current rune is an ASCII decimal digit.
func (l *Lexer) isDigit() bool {
	return '0' <= l.ch && l.ch <= '9'
}
219
// isAlpha reports whether the current rune is an ASCII letter.
func (l *Lexer) isAlpha() bool {
	switch {
	case 'a' <= l.ch && l.ch <= 'z':
		return true
	case 'A' <= l.ch && l.ch <= 'Z':
		return true
	}
	return false
}
222
223
// isHex reports whether the current rune is an ASCII hexadecimal digit
// (0-9, a-f or A-F).
func (l *Lexer) isHex() bool {
	c := l.ch
	if '0' <= c && c <= '9' {
		return true
	}
	if 'a' <= c && c <= 'f' {
		return true
	}
	return 'A' <= c && c <= 'F'
}
227
228
// sliceString returns the bytes b[start:end] as a string without copying.
//
// The result shares memory with b, so b must not be modified for as long as
// the string is in use. An empty or inverted range (start >= end) yields "".
// A range that does not lie within b[:len(b)] panics.
func sliceString(b []byte, start, end int) string {
	if start >= end {
		return ""
	}
	// The full slice expression caps the view at len(b), so an out-of-range
	// start or end panics here instead of letting unsafe.String read memory
	// beyond the slice.
	view := b[start:end:len(b)]
	return unsafe.String(unsafe.SliceData(view), len(view))
}