json2go: lexer.go (v0.2.0)

1

package json2go

2

3

import (

4

	"unicode/utf8"

5

	"unsafe"

6

7

8

// Lexer is a rune-at-a-time scanner over a byte slice. It tracks the rune it
// is currently positioned on together with that rune's byte offset and its
// line/column position.
type Lexer struct {
	input  []byte // bytes being scanned
	ch     rune   // current rune (0 == EOF)
	chSize int    // byte size of ch
	pos    int    // byte offset of ch within input
	rpos   int    // byte offset of the next rune to read (pos + chSize)
	col    int    // column of ch (1-based; reset to 0 on a line break)
	line   int    // line of ch (1-based when created via NewLexer)
}

17

18

func NewLexer(input []byte) *Lexer {

19

	l := &Lexer{input: input, line: 1}

20

	l.advance()

21

	if l.ch == '\uFEFF' { // start of the input

22

		l.advance()

23

24

	return l

25

26

27

// Next returns the next token from the input.

28

// Returns EOF when input is exhausted.

29

func (l *Lexer) Next() Token {

30

	switch {

31

	case l.ch == 0:

32

		return Token{EOF, ""}

33

	case l.ch == '\n', l.ch == '\r':

34

		l.advance()

35

		return Token{NEWLINE, "\n"}

36

	case l.ch == ' ', l.ch == '\t':

37

		offset := l.pos

38

		for l.ch == ' ' || l.ch == '\t' {

39

			l.advance()

40

41

		return Token{INDENT, sliceString(l.input, offset, l.pos)}

42

	case l.ch == '/':

43

		return l.lexComment()

44

	case l.ch == '"':

45

		return l.lexString()

46

	case l.ch == ':':

47

		l.advance()

48

		return Token{COLON, ":"}

49

	case l.ch == ',':

50

		l.advance()

51

		return Token{COMMA, ","}

52

	case l.ch == '[':

53

		l.advance()

54

		return Token{LBRACKET, "["}

55

	case l.ch == ']':

56

		l.advance()

57

		return Token{RBRACKET, "]"}

58

	case l.ch == '{':

59

		l.advance()

60

		return Token{LBRACE, "{"}

61

	case l.ch == '}':

62

		l.advance()

63

		return Token{RBRACE, "}"}

64

	case l.isDigit(), l.ch == '-':

65

		return l.lexNumber()

66

	case l.isAlpha():

67

		offset := l.pos

68

		for l.isAlpha() {

69

			l.advance()

70

71

		lit := sliceString(l.input, offset, l.pos)

72

		kind := ILLEGAL

73

		switch lit {

74

		case "false", "true":

75

			kind = BOOL

76

		case "null":

77

			kind = NULL

78

79

		return Token{kind, lit}

80

81

	ch := l.ch

82

	l.advance()

83

	return Token{ILLEGAL, string(ch)}

84

85

86

func (l *Lexer) lexString() Token {

87

	l.advance()

88

	offset := l.pos

89

	for {

90

		switch l.ch {

91

		default:

92

			l.advance()

93

		case 0, '\r', '\n':

94

			return Token{ILLEGAL, "unterminated string"}

95

		case '\\':

96

			l.advance() // consume '\'

97

			switch l.ch {

98

			case '"', '\\', '/', 'b', 'f', 'n', 'r', 't':

99

				l.advance()

100

			case 'u':

101

				l.advance()

102

				for range 4 { // expect exactly 4 hex digits

103

					if !l.isHex() {

104

						return Token{ILLEGAL, "invalid unicode escape"}

105

106

					l.advance()

107

108

			default:

109

				return Token{ILLEGAL, "invalid escape sequence"}

110

111

		case '"':

112

			lit := sliceString(l.input, offset, l.pos)

113

			l.advance() // consume closing '"'

114

			return Token{STRING, lit}

115

116

117

118

119

func (l *Lexer) lexNumber() Token {

120

	offset := l.pos

121

122

	if l.ch == '-' { // optional leading minus

123

		l.advance()

124

125

126

	// integer part

127

	if l.ch == '0' {

128

		l.advance()

129

		if l.isDigit() { // leading zero must not be followed by another digit

130

			return Token{ILLEGAL, "leading zero in number"}

131

132

	} else if l.isDigit() {

133

		for l.isDigit() {

134

			l.advance()

135

136

	} else {

137

		return Token{ILLEGAL, "invalid number"}

138

139

140

	kind := NUMBER

141

	if l.ch == '.' { // optional fractional part

142

		kind = DECIMAL

143

		l.advance()

144

		if !l.isDigit() {

145

			return Token{ILLEGAL, "expected digit after decimal point"}

146

147

		for l.isDigit() {

148

			l.advance()

149

150

151

152

	if l.ch == 'e' || l.ch == 'E' { // optional exponent

153

		kind = DECIMAL

154

		l.advance()

155

		if l.ch == '+' || l.ch == '-' {

156

			l.advance()

157

158

		if !l.isDigit() {

159

			return Token{ILLEGAL, "expected digit in exponent"}

160

161

		for l.isDigit() {

162

			l.advance()

163

164

165

166

	return Token{kind, sliceString(l.input, offset, l.pos)}

167

168

169

func (l *Lexer) lexComment() Token {

170

	l.advance()

171

	switch l.ch {

172

	default:

173

		return Token{ILLEGAL, "invalid comment"}

174

	case '/':

175

		l.advance()

176

		offset := l.pos

177

		for l.ch != 0 && l.ch != '\n' && l.ch != '\r' {

178

			l.advance()

179

180

		return Token{COMMENTLINE, sliceString(l.input, offset, l.pos)}

181

	case '*':

182

		l.advance()

183

		offset := l.pos

184

		for {

185

			if l.ch == 0 {

186

				return Token{ILLEGAL, "unterminated block comment"}

187

188

			if l.ch == '*' {

189

				l.advance()

190

				if l.ch == '/' {

191

					end := l.pos - 1 // exclude the '*'

192

					l.advance()

193

					return Token{COMMENTBLOCK, sliceString(l.input, offset, end)}

194

195

			} else {

196

				l.advance()

197

198

199

200

201

202

func (l *Lexer) advance() {

203

	if l.rpos >= len(l.input) {

204

		l.ch = 0

205

		l.chSize = 0

206

	} else {

207

		l.ch, l.chSize = utf8.DecodeRune(l.input[l.rpos:])

208

209

	l.pos = l.rpos

210

	l.rpos += l.chSize

211

	if l.ch == '\n' || l.ch == '\r' {

212

		l.line++

213

		l.col = 0

214

	} else {

215

		l.col++

216

217

218

// isDigit reports whether the current rune is an ASCII decimal digit.
func (l *Lexer) isDigit() bool { return '0' <= l.ch && l.ch <= '9' }

219

func (l *Lexer) isAlpha() bool {

220

	return (l.ch >= 'a' && l.ch <= 'z') || (l.ch >= 'A' && l.ch <= 'Z')

221

222

223

func (l *Lexer) isHex() bool {

224

	return (l.ch >= '0' && l.ch <= '9') ||

225

		(l.ch >= 'a' && l.ch <= 'f') || (l.ch >= 'A' && l.ch <= 'F')

226

227

228

// sliceString returns b[start:end] as a string without copying. It returns ""
// when start >= end.
//
// The result shares memory with b, so b must not be modified for as long as
// the returned string is in use. The range is validated by a regular slice
// expression first, so an out-of-range start or end panics instead of
// yielding a string over memory outside b's backing array.
func sliceString(b []byte, start, end int) string {
	if start >= end {
		return ""
	}
	sub := b[start:end] // bounds check
	return unsafe.String(unsafe.SliceData(sub), len(sub))
}

1	package json2go
2
3	import (
4	"unicode/utf8"
5	"unsafe"
6	)
7
// Lexer is a rune-at-a-time scanner over a byte slice. It tracks the rune it
// is currently positioned on together with that rune's byte offset and its
// line/column position.
type Lexer struct {
	input  []byte // bytes being scanned
	ch     rune   // current rune (0 == EOF)
	chSize int    // byte size of ch
	pos    int    // byte offset of ch within input
	rpos   int    // byte offset of the next rune to read (pos + chSize)
	col    int    // column of ch (1-based; reset to 0 on a line break)
	line   int    // line of ch (1-based when created via NewLexer)
}
17
18	func NewLexer(input []byte) *Lexer {
19	l := &Lexer{input: input, line: 1}
20	l.advance()
21	if l.ch == '\uFEFF' { // start of the input
22	l.advance()
23	}
24	return l
25	}
26
27	// Next returns the next token from the input.
28	// Returns EOF when input is exhausted.
29	func (l *Lexer) Next() Token {
30	switch {
31	case l.ch == 0:
32	return Token{EOF, ""}
33	case l.ch == '\n', l.ch == '\r':
34	l.advance()
35	return Token{NEWLINE, "\n"}
36	case l.ch == ' ', l.ch == '\t':
37	offset := l.pos
38	for l.ch == ' ' \|\| l.ch == '\t' {
39	l.advance()
40	}
41	return Token{INDENT, sliceString(l.input, offset, l.pos)}
42	case l.ch == '/':
43	return l.lexComment()
44	case l.ch == '"':
45	return l.lexString()
46	case l.ch == ':':
47	l.advance()
48	return Token{COLON, ":"}
49	case l.ch == ',':
50	l.advance()
51	return Token{COMMA, ","}
52	case l.ch == '[':
53	l.advance()
54	return Token{LBRACKET, "["}
55	case l.ch == ']':
56	l.advance()
57	return Token{RBRACKET, "]"}
58	case l.ch == '{':
59	l.advance()
60	return Token{LBRACE, "{"}
61	case l.ch == '}':
62	l.advance()
63	return Token{RBRACE, "}"}
64	case l.isDigit(), l.ch == '-':
65	return l.lexNumber()
66	case l.isAlpha():
67	offset := l.pos
68	for l.isAlpha() {
69	l.advance()
70	}
71	lit := sliceString(l.input, offset, l.pos)
72	kind := ILLEGAL
73	switch lit {
74	case "false", "true":
75	kind = BOOL
76	case "null":
77	kind = NULL
78	}
79	return Token{kind, lit}
80	}
81	ch := l.ch
82	l.advance()
83	return Token{ILLEGAL, string(ch)}
84	}
85
86	func (l *Lexer) lexString() Token {
87	l.advance()
88	offset := l.pos
89	for {
90	switch l.ch {
91	default:
92	l.advance()
93	case 0, '\r', '\n':
94	return Token{ILLEGAL, "unterminated string"}
95	case '\\':
96	l.advance() // consume '\'
97	switch l.ch {
98	case '"', '\\', '/', 'b', 'f', 'n', 'r', 't':
99	l.advance()
100	case 'u':
101	l.advance()
102	for range 4 { // expect exactly 4 hex digits
103	if !l.isHex() {
104	return Token{ILLEGAL, "invalid unicode escape"}
105	}
106	l.advance()
107	}
108	default:
109	return Token{ILLEGAL, "invalid escape sequence"}
110	}
111	case '"':
112	lit := sliceString(l.input, offset, l.pos)
113	l.advance() // consume closing '"'
114	return Token{STRING, lit}
115	}
116	}
117	}
118
119	func (l *Lexer) lexNumber() Token {
120	offset := l.pos
121
122	if l.ch == '-' { // optional leading minus
123	l.advance()
124	}
125
126	// integer part
127	if l.ch == '0' {
128	l.advance()
129	if l.isDigit() { // leading zero must not be followed by another digit
130	return Token{ILLEGAL, "leading zero in number"}
131	}
132	} else if l.isDigit() {
133	for l.isDigit() {
134	l.advance()
135	}
136	} else {
137	return Token{ILLEGAL, "invalid number"}
138	}
139
140	kind := NUMBER
141	if l.ch == '.' { // optional fractional part
142	kind = DECIMAL
143	l.advance()
144	if !l.isDigit() {
145	return Token{ILLEGAL, "expected digit after decimal point"}
146	}
147	for l.isDigit() {
148	l.advance()
149	}
150	}
151
152	if l.ch == 'e' \|\| l.ch == 'E' { // optional exponent
153	kind = DECIMAL
154	l.advance()
155	if l.ch == '+' \|\| l.ch == '-' {
156	l.advance()
157	}
158	if !l.isDigit() {
159	return Token{ILLEGAL, "expected digit in exponent"}
160	}
161	for l.isDigit() {
162	l.advance()
163	}
164	}
165
166	return Token{kind, sliceString(l.input, offset, l.pos)}
167	}
168
169	func (l *Lexer) lexComment() Token {
170	l.advance()
171	switch l.ch {
172	default:
173	return Token{ILLEGAL, "invalid comment"}
174	case '/':
175	l.advance()
176	offset := l.pos
177	for l.ch != 0 && l.ch != '\n' && l.ch != '\r' {
178	l.advance()
179	}
180	return Token{COMMENTLINE, sliceString(l.input, offset, l.pos)}
181	case '*':
182	l.advance()
183	offset := l.pos
184	for {
185	if l.ch == 0 {
186	return Token{ILLEGAL, "unterminated block comment"}
187	}
188	if l.ch == '*' {
189	l.advance()
190	if l.ch == '/' {
191	end := l.pos - 1 // exclude the '*'
192	l.advance()
193	return Token{COMMENTBLOCK, sliceString(l.input, offset, end)}
194	}
195	} else {
196	l.advance()
197	}
198	}
199	}
200	}
201
202	func (l *Lexer) advance() {
203	if l.rpos >= len(l.input) {
204	l.ch = 0
205	l.chSize = 0
206	} else {
207	l.ch, l.chSize = utf8.DecodeRune(l.input[l.rpos:])
208	}
209	l.pos = l.rpos
210	l.rpos += l.chSize
211	if l.ch == '\n' \|\| l.ch == '\r' {
212	l.line++
213	l.col = 0
214	} else {
215	l.col++
216	}
217	}
218	func (l *Lexer) isDigit() bool { return l.ch >= '0' && l.ch <= '9' }
219	func (l *Lexer) isAlpha() bool {
220	return (l.ch >= 'a' && l.ch <= 'z') \|\| (l.ch >= 'A' && l.ch <= 'Z')
221	}
222
223	func (l *Lexer) isHex() bool {
224	return (l.ch >= '0' && l.ch <= '9') \|\|
225	(l.ch >= 'a' && l.ch <= 'f') \|\| (l.ch >= 'A' && l.ch <= 'F')
226	}
227
// sliceString returns b[start:end] as a string without copying. It returns ""
// when start >= end.
//
// The result shares memory with b, so b must not be modified for as long as
// the returned string is in use. The range is validated by a regular slice
// expression first, so an out-of-range start or end panics instead of
// yielding a string over memory outside b's backing array.
func sliceString(b []byte, start, end int) string {
	if start >= end {
		return ""
	}
	sub := b[start:end] // bounds check
	return unsafe.String(unsafe.SliceData(sub), len(sub))
}