all repos

clerk @ e586ae2

missing tooling for ledger/hledger

clerk/journal/lexer/lexer.go (view raw)

Oleksandr Smirnov Oleksandr Smirnov
olexsmir@gmail.com
lexer & parser & ast..., 14 days ago
1
package lexer
2
3
import (
4
	"strings"
5
	"unicode"
6
	"unicode/utf8"
7
8
	"github.com/olexsmir/ledger-tools/journal/token"
9
)
10
11
// Mode is the lexer state that Next switches on to pick a lexing routine.
type Mode uint
12
13
const (
14
	// start of a line, nothing consumed
15
	ModeDefault Mode = iota
16
17
	// after ; # * % ; at start of line, or anywhere inline
18
	// everything until \n is comment text
19
	ModeComment
20
21
	// after lexing a date at column 0
22
	// expects: optional status, optional code, description, comment
23
	ModeTransaction
24
25
	// after lexing an indent at start of line
26
	// expects: account name, then two spaces, then amount
27
	ModePosting
28
29
	// after ~, period expression
30
	// expects: period, optional description (after 2+ spaces), optional comment
31
	ModePeriodic
32
33
	// after =, automates transaction
34
	// expects: expression
35
	ModeAutomated
36
37
	// after a directive keyword like account, commodity, include
38
	// expects: rest of directive content
39
	ModeDirective
40
)
41
42
type Lexer struct {
43
	file  string
44
	input []byte
45
	mode  Mode
46
47
	ch     rune // current rune (0 = EOF/sentinel)
48
	chSize int  // byte size of current rune
49
	pos    int  // current byte offset (points at ch)
50
	rpos   int  // next byte offset to read (one ahead of pos)
51
	col    int  // current column (1-based)
52
	line   int  // current line (1-based)
53
54
	postingExpectAccount bool
55
}
56
57
func New(file string, input []byte) *Lexer {
58
	l := &Lexer{
59
		file:  file,
60
		input: input,
61
		line:  1,
62
	}
63
	l.advance()
64
	if l.ch == '\uFEFF' { // start of the input
65
		l.advance()
66
	}
67
	return l
68
}
69
70
// Next returns next token in the input
71
func (l *Lexer) Next() token.Token {
72
	switch l.mode {
73
	case ModeDefault:
74
		return l.lexDefault()
75
	case ModeComment:
76
		return l.lexComment()
77
	case ModeTransaction:
78
		return l.lexTransaction()
79
	case ModePosting:
80
		return l.lexPosting()
81
	case ModePeriodic:
82
		return l.lexPeriodic()
83
	case ModeAutomated:
84
		return l.lexAutomated()
85
	case ModeDirective:
86
		return l.lexDirective()
87
	}
88
	panic("unreachable")
89
}
90
91
func (l *Lexer) lexDefault() token.Token {
92
	switch {
93
	case l.ch == 0:
94
		return l.token(token.EOF, "")
95
	case l.ch == '\n':
96
		return l.lexNewline()
97
	case l.ch == '\r':
98
		l.col = 0
99
		l.advance()
100
		return l.lexNewline()
101
	case l.ch == ' ' || l.ch == '\t':
102
		tok := l.lexIndent()
103
		l.mode = ModePosting
104
		l.postingExpectAccount = true
105
		return tok
106
	case l.ch == ';' || l.ch == '#' || l.ch == '%':
107
		l.mode = ModeComment
108
		return l.lexSingle(token.SEMICOLON) // todo: ??
109
	case l.ch == '*': // * at col 0 == comment
110
		l.mode = ModeComment
111
		return l.lexSingle(token.STAR)
112
	case l.ch == '~':
113
		l.mode = ModePeriodic
114
		return l.lexSingle(token.TILDE)
115
	case l.ch == '=':
116
		l.mode = ModeAutomated
117
		return l.lexSingle(token.EQ)
118
	case l.ch == '+':
119
		return l.lexSingle(token.PLUS)
120
	case l.ch == '-':
121
		return l.lexSingle(token.MINUS)
122
	case l.ch == '.':
123
		return l.lexSingle(token.TEXT)
124
	case l.isAlpha():
125
		return l.lexKeyword()
126
	case l.isDigit():
127
		if !l.isDate() {
128
			s := l.save()
129
			for l.isDigit() || l.ch == '-' || l.ch == '/' || l.ch == '.' {
130
				l.advance()
131
			}
132
			return token.Token{Type: token.ILLEGAL, Literal: string(l.input[s.offset:l.pos]), Span: l.span(s)}
133
		}
134
		tok := l.lexDate()
135
		l.mode = ModeTransaction
136
		return tok
137
	default:
138
		s := l.save()
139
		l.advance()
140
		return token.Token{Type: token.ILLEGAL, Literal: string(l.input[s.offset:l.pos]), Span: l.span(s)}
141
	}
142
}
143
144
// lexComment lexes the body of a comment. Leading spaces and tabs are
// skipped without producing a token; the rest of the line is returned as one
// TEXT token. When nothing but the end of the line (or EOF) remains, the mode
// goes back to ModeDefault and a NEWLINE token is returned instead.
func (l *Lexer) lexComment() token.Token {
	for l.ch == ' ' || l.ch == '\t' {
		l.advance()
	}

	if l.ch == '\n' || l.ch == 0 {
		l.mode = ModeDefault
		return l.lexNewline()
	}

	start := l.save()
	for {
		if l.ch == '\n' || l.ch == 0 {
			break
		}
		l.advance()
	}
	text := string(l.input[start.offset:l.pos])
	return token.Token{Type: token.TEXT, Literal: text, Span: l.span(start)}
}
165
166
func (l *Lexer) lexTransaction() token.Token {
167
	switch l.ch {
168
	case 0:
169
		return l.token(token.EOF, "")
170
	case '\n':
171
		l.mode = ModeDefault
172
		return l.lexNewline()
173
	case '\r':
174
		l.col = 0
175
		l.advance()
176
		return l.lexNewline()
177
	case ';':
178
		l.mode = ModeComment
179
		return l.lexSingle(token.SEMICOLON)
180
	case ' ', '\t':
181
		return l.lexWhitespace()
182
	case '*': // * after date = status
183
		return l.lexSingle(token.STAR)
184
	case '!':
185
		return l.lexSingle(token.BANG)
186
	case '|':
187
		return l.lexSingle(token.PIPE)
188
	case '+':
189
		return l.lexSingle(token.PLUS)
190
	case '-':
191
		return l.lexSingle(token.MINUS)
192
	case '=':
193
		return l.lexEquals()
194
	default: // description / payee
195
		if l.isDate() { // secondsry date after =
196
			return l.lexDate()
197
		}
198
		return l.lexText()
199
	}
200
}
201
202
func (l *Lexer) lexPeriodic() token.Token {
203
	switch l.ch {
204
	case 0:
205
		return l.token(token.EOF, "")
206
	case '\n':
207
		l.mode = ModeDefault
208
		return l.lexNewline()
209
	case '\r':
210
		l.col = 0
211
		l.advance()
212
		return l.lexNewline()
213
	case ';', '%', '#':
214
		l.mode = ModeComment
215
		return l.lexSingle(token.SEMICOLON) // todo: ??
216
	case ' ', '\t':
217
		return l.lexWhitespace()
218
	default:
219
		return l.lexText()
220
	}
221
}
222
223
func (l *Lexer) lexAutomated() token.Token {
224
	switch l.ch {
225
	case 0:
226
		return l.token(token.EOF, "")
227
	case '\n':
228
		l.mode = ModeDefault
229
		return l.lexNewline()
230
	case '\r':
231
		l.col = 0
232
		l.advance()
233
		return l.lexNewline()
234
	case ' ', '\t':
235
		return l.lexWhitespace()
236
	case ';', '%', '#':
237
		l.mode = ModeComment
238
		return l.lexSingle(token.SEMICOLON) // todo: ??
239
	default:
240
		return l.lexText()
241
	}
242
}
243
244
func (l *Lexer) lexPosting() token.Token {
245
	switch {
246
	case l.ch == 0:
247
		l.postingExpectAccount = false
248
		return l.token(token.EOF, "")
249
	case l.ch == '\n':
250
		l.postingExpectAccount = false
251
		l.mode = ModeDefault
252
		return l.lexNewline()
253
	case l.ch == ';':
254
		l.postingExpectAccount = false
255
		l.mode = ModeComment
256
		return l.lexSingle(token.SEMICOLON)
257
	case l.ch == ' ' || l.ch == '\t':
258
		return l.lexWhitespace()
259
	case l.postingExpectAccount && l.ch == '*':
260
		return l.lexSingle(token.STAR)
261
	case l.postingExpectAccount && l.ch == '!':
262
		return l.lexSingle(token.BANG)
263
	case l.ch == '=':
264
		return l.lexEquals()
265
	case l.ch == '@':
266
		return l.lexAt()
267
	case l.ch == '{':
268
		return l.lexLBrace()
269
	case l.ch == '}':
270
		return l.lexRBrace()
271
	case l.ch == '(':
272
		if !l.postingExpectAccount {
273
			return l.lexParenExpr()
274
		}
275
		return l.lexSingle(token.LPAREN)
276
	case l.ch == ')':
277
		return l.lexSingle(token.RPAREN)
278
	case l.ch == '[':
279
		return l.lexSingle(token.LBRACKET)
280
	case l.ch == ']':
281
		return l.lexSingle(token.RBRACKET)
282
	case l.postingExpectAccount && l.ch != '*' && l.ch != '!' && l.ch != '(' && l.ch != '[':
283
		l.postingExpectAccount = false
284
		return l.lexAccountText()
285
	case l.ch == '*': // after account name
286
		return l.lexSingle(token.STAR)
287
	case l.isDigit(), l.ch == '.':
288
		return l.lexNumber()
289
	case l.ch == '-':
290
		return l.lexSingle(token.MINUS)
291
	case l.ch == '+':
292
		return l.lexSingle(token.PLUS)
293
	case l.isCommodityStart():
294
		return l.lexCommodityMark()
295
	case l.ch >= 'a' && l.ch <= 'z':
296
		return l.lexCommodityMark()
297
	default:
298
		return l.lexAccountText()
299
	}
300
}
301
302
func (l *Lexer) lexDirective() token.Token {
303
	switch l.ch {
304
	case '\n', 0:
305
		l.mode = ModeDefault
306
		return l.lexNewline()
307
	case ';':
308
		l.mode = ModeComment
309
		return l.lexSingle(token.SEMICOLON)
310
	case ' ', '\t':
311
		return l.lexWhitespace()
312
	case '=':
313
		return l.lexSingle(token.EQ)
314
	case '+':
315
		return l.lexSingle(token.PLUS)
316
	case '-':
317
		return l.lexSingle(token.MINUS)
318
	case '.':
319
		return l.lexSingle(token.TEXT)
320
	default:
321
		if l.isCommodityStart() {
322
			return l.lexCommodityMark()
323
		}
324
		if l.isTime() {
325
			return l.lexTime()
326
		}
327
		if l.isDate() {
328
			return l.lexDate()
329
		}
330
		if l.isDigit() {
331
			return l.lexNumber()
332
		}
333
		return l.lexText()
334
	}
335
	// case l.ch == '/': // regex in 'alias'
336
	// 	return l.lexSingle(token.SLASH)
337
}
338
339
func (l *Lexer) lexSingle(kind token.Type) token.Token {
340
	s := l.save()
341
	l.advance()
342
	return token.Token{
343
		Type:    kind,
344
		Literal: string(l.input[s.offset:l.pos]),
345
		Span:    l.span(s),
346
	}
347
}
348
349
// lexNewline consumes the current rune, resets the mode to ModeDefault and
// returns a NEWLINE token. The literal is always "\n", whatever rune was
// actually consumed.
func (l *Lexer) lexNewline() token.Token {
	start := l.save()
	l.mode = ModeDefault
	l.advance()
	return token.Token{
		Type:    token.NEWLINE,
		Literal: "\n",
		Span:    l.span(start),
	}
}
355
356
// lexWhitespace consumes a run of spaces and tabs and returns it as a single
// WHITESPACE token.
func (l *Lexer) lexWhitespace() token.Token {
	start := l.save()
	for {
		if l.ch != ' ' && l.ch != '\t' {
			break
		}
		l.advance()
	}
	return token.Token{
		Type:    token.WHITESPACE,
		Literal: string(l.input[start.offset:l.pos]),
		Span:    l.span(start),
	}
}
364
365
// lexIndent consumes a run of spaces and tabs and returns it as a single
// INDENT token.
func (l *Lexer) lexIndent() token.Token {
	start := l.save()
scan:
	for {
		switch l.ch {
		case ' ', '\t':
			l.advance()
		default:
			break scan
		}
	}
	indent := string(l.input[start.offset:l.pos])
	return token.Token{Type: token.INDENT, Literal: indent, Span: l.span(start)}
}
373
374
func (l *Lexer) lexEquals() token.Token {
375
	s := l.save()
376
	l.advance()
377
	if l.ch == '=' {
378
		l.advance()
379
		switch l.ch {
380
		case '=':
381
			l.advance()
382
			return token.Token{Type: token.EQEQEQ, Literal: "===", Span: l.span(s)}
383
		case '*':
384
			l.advance()
385
			return token.Token{Type: token.EQEQEQ, Literal: "==*", Span: l.span(s)}
386
		default:
387
			return token.Token{Type: token.EQEQ, Literal: "==", Span: l.span(s)}
388
		}
389
	}
390
	return token.Token{Type: token.EQ, Literal: "=", Span: l.span(s)}
391
}
392
393
// lexAt lexes "@" or, when a second '@' follows immediately, "@@".
func (l *Lexer) lexAt() token.Token {
	start := l.save()
	l.advance()
	if l.ch != '@' {
		return token.Token{Type: token.AT, Literal: "@", Span: l.span(start)}
	}
	l.advance()
	return token.Token{Type: token.ATAT, Literal: "@@", Span: l.span(start)}
}
402
403
// lexText lexes one word of free text. The current rune is always consumed;
// after that the word runs up to the next space, tab, ';', newline or EOF.
func (l *Lexer) lexText() token.Token {
	start := l.save()
	l.advance()
	for {
		switch l.ch {
		case 0, '\n', ';', ' ', '\t':
			word := string(l.input[start.offset:l.pos])
			return token.Token{Type: token.TEXT, Literal: word, Span: l.span(start)}
		}
		l.advance()
	}
}
412
413
// lexAccountText lexes an account name as one TEXT token. The name may
// contain single spaces; it ends at two consecutive spaces, at ';', ')' or
// ']', at a newline or at EOF.
func (l *Lexer) lexAccountText() token.Token {
	start := l.save()
scan:
	for !l.isTwoSpaces() {
		switch l.ch {
		case 0, '\n', ';', ')', ']':
			break scan
		}
		l.advance()
	}
	name := string(l.input[start.offset:l.pos])
	return token.Token{Type: token.TEXT, Literal: name, Span: l.span(start)}
}
425
426
// lexParenExpr lexes a parenthesised expression as one PARENEXPR token. It
// keeps track of nesting and stops right after the parenthesis that brings
// the nesting depth back to zero; an unterminated expression ends at the
// newline or at EOF.
func (l *Lexer) lexParenExpr() token.Token {
	start := l.save()
	open := 0
scan:
	for l.ch != '\n' && l.ch != 0 {
		switch l.ch {
		case '(':
			open++
		case ')':
			open--
			if open == 0 {
				// Include the closing parenthesis in the token.
				l.advance()
				break scan
			}
		}
		l.advance()
	}
	expr := string(l.input[start.offset:l.pos])
	return token.Token{Type: token.PARENEXPR, Literal: expr, Span: l.span(start)}
}
444
445
// lexNumber lexes a run of digits, '.', ',' and '_'. The token is DECIMAL
// when the literal contains a '.' or a ',', and INT otherwise.
func (l *Lexer) lexNumber() token.Token {
	start := l.save()
scan:
	for {
		switch {
		case l.isDigit(), l.ch == '.', l.ch == ',', l.ch == '_':
			l.advance()
		default:
			break scan
		}
	}
	lit := string(l.input[start.offset:l.pos])
	if strings.IndexAny(lit, ".,") >= 0 {
		return token.Token{Type: token.DECIMAL, Literal: lit, Span: l.span(start)}
	}
	return token.Token{Type: token.INT, Literal: lit, Span: l.span(start)}
}
457
458
// lexKeyword lexes a word at the start of a line, ending at whitespace, ';',
// a line break or EOF. A word that is a directive keyword is returned with
// its keyword type and switches the lexer to ModeDirective; any other word is
// returned as TEXT and leaves the mode untouched.
func (l *Lexer) lexKeyword() token.Token {
	start := l.save()
scan:
	for {
		switch l.ch {
		case 0, '\n', '\r', ' ', '\t', ';':
			break scan
		}
		l.advance()
	}
	word := string(l.input[start.offset:l.pos])

	kind := l.keyword(word)
	if kind != token.ILLEGAL {
		l.mode = ModeDirective
		return token.Token{Type: kind, Literal: word, Span: l.span(start)}
	}
	// Unknown words are not reported as errors here (open question upstream).
	return token.Token{Type: token.TEXT, Literal: word, Span: l.span(start)}
}
472
473
// lexDate lexes a date as one DATE token: digits, plus any date separator
// that is directly followed by a digit.
func (l *Lexer) lexDate() token.Token {
	start := l.save()
	for {
		partOfDate := l.isDigit() || (l.isDateSep() && l.peekIsDigit())
		if !partOfDate {
			break
		}
		l.advance()
	}
	lit := string(l.input[start.offset:l.pos])
	return token.Token{Type: token.DATE, Literal: lit, Span: l.span(start)}
}
480
481
// lexCommodityMark lexes a commodity symbol as one COMMODITYMARK token. Three
// shapes are recognised:
//   - a double-quoted symbol, running to the closing quote (included) or to
//     the end of the line or input if the quote is never closed;
//   - a run of letters and digits that starts with a letter;
//   - any other single rune.
func (l *Lexer) lexCommodityMark() token.Token {
	start := l.save()

	switch {
	case l.ch == '"':
		l.advance()
		for l.ch != '"' && l.ch != '\n' && l.ch != 0 {
			l.advance()
		}
		if l.ch == '"' {
			l.advance()
		}
	case unicode.IsLetter(l.ch):
		for unicode.IsLetter(l.ch) || unicode.IsDigit(l.ch) {
			l.advance()
		}
	default:
		l.advance()
	}

	mark := string(l.input[start.offset:l.pos])
	return token.Token{Type: token.COMMODITYMARK, Literal: mark, Span: l.span(start)}
}
505
506
// lexLBrace lexes "{" or, when a second '{' follows immediately, "{{".
func (l *Lexer) lexLBrace() token.Token {
	start := l.save()
	l.advance()
	if l.ch != '{' {
		return token.Token{Type: token.LBRACE, Literal: "{", Span: l.span(start)}
	}
	l.advance()
	return token.Token{Type: token.LBRACELBRACE, Literal: "{{", Span: l.span(start)}
}
515
516
// lexRBrace lexes "}" or, when a second '}' follows immediately, "}}".
func (l *Lexer) lexRBrace() token.Token {
	start := l.save()
	l.advance()
	if l.ch != '}' {
		return token.Token{Type: token.RBRACE, Literal: "}", Span: l.span(start)}
	}
	l.advance()
	return token.Token{Type: token.RBRACERBRACE, Literal: "}}", Span: l.span(start)}
}
525
526
// advance moves the cursor to the next rune of the input. At the end of the
// input ch becomes 0 and chSize 0, so further calls stay at EOF.
//
// The line counter is incremented and the column reset when the cursor
// arrives on a line break. A "\r\n" pair counts as one line break: the '\r'
// only resets the column and the '\n' that follows does the counting. A '\r'
// that is not followed by '\n' still counts as a line break of its own.
func (l *Lexer) advance() {
	if l.rpos >= len(l.input) {
		l.ch = 0
		l.chSize = 0
	} else {
		l.ch, l.chSize = utf8.DecodeRune(l.input[l.rpos:])
	}
	l.pos = l.rpos
	l.rpos += l.chSize

	switch l.ch {
	case '\n':
		l.line++
		l.col = 0
	case '\r':
		// rpos now points at the byte after the '\r'.
		if l.rpos >= len(l.input) || l.input[l.rpos] != '\n' {
			l.line++
		}
		l.col = 0
	default:
		l.col++
	}
}
544
545
// peek returns the rune that follows the current one without moving the
// cursor. At the end of the input it returns 0, the same EOF sentinel that
// advance stores in ch, instead of the utf8.RuneError that decoding an empty
// slice would yield.
func (l *Lexer) peek() rune {
	if l.rpos >= len(l.input) {
		return 0
	}
	r, _ := utf8.DecodeRune(l.input[l.rpos:])
	return r
}
549
550
// peekN returns the byte n bytes after the current position, or 0 when that
// offset lies at or beyond the end of the input. It works on raw bytes, not
// on runes.
func (l *Lexer) peekN(n int) byte {
	if at := l.pos + n; at < len(l.input) {
		return l.input[at]
	}
	return 0
}
556
557
// isDigit reports whether the current rune is an ASCII digit.
func (l *Lexer) isDigit() bool {
	return '0' <= l.ch && l.ch <= '9'
}
558
// isAlpha reports whether the current rune is an ASCII letter.
func (l *Lexer) isAlpha() bool {
	switch {
	case 'a' <= l.ch && l.ch <= 'z':
		return true
	case 'A' <= l.ch && l.ch <= 'Z':
		return true
	default:
		return false
	}
}
562
563
// isTwoSpaces reports whether the current rune and the one after it are both
// spaces.
func (l *Lexer) isTwoSpaces() bool {
	if l.ch != ' ' {
		return false
	}
	return l.peek() == ' '
}
564
565
// isDateSep reports whether the current rune is one of the date separators
// '-', '/' or '.'.
func (l *Lexer) isDateSep() bool {
	switch l.ch {
	case '-', '/', '.':
		return true
	}
	return false
}
566
567
// peekIsDigit reports whether the rune after the current one is an ASCII
// digit.
func (l *Lexer) peekIsDigit() bool {
	if next := l.peek(); next < '0' || next > '9' {
		return false
	}
	return true
}
571
572
// isCommodityStart reports whether the current rune can begin a commodity
// symbol: '$', an ASCII upper-case letter, or a non-ASCII rune that is a
// currency symbol or a letter. All other ASCII runes are rejected.
func (l *Lexer) isCommodityStart() bool {
	switch {
	case l.ch == '$', 'A' <= l.ch && l.ch <= 'Z':
		return true
	case l.ch < utf8.RuneSelf:
		return false
	default:
		return unicode.In(l.ch, unicode.Sc) || unicode.IsLetter(l.ch)
	}
}
581
582
func (l *Lexer) isDate() bool {
583
	if !l.isDigit() {
584
		return false
585
	}
586
	// YYYY/M/D or YYYY/MM/DD
587
	if l.peekN(1) >= '0' && l.peekN(1) <= '9' &&
588
		l.peekN(2) >= '0' && l.peekN(2) <= '9' &&
589
		l.peekN(3) >= '0' && l.peekN(3) <= '9' {
590
		sep := l.peekN(4)
591
		if sep == '/' || sep == '-' || sep == '.' {
592
			if l.peekN(5) >= '0' && l.peekN(5) <= '9' {
593
				if l.peekN(6) == sep {
594
					return l.peekN(7) >= '0' && l.peekN(7) <= '9'
595
				}
596
				if l.peekN(7) == sep {
597
					return l.peekN(8) >= '0' && l.peekN(8) <= '9'
598
				}
599
			}
600
		}
601
		return false
602
	}
603
	// M/D or MM/DD(year inferred, only / and - separators; . is ambiguous with decimal numbers like 1.01)
604
	if (l.peekN(1) == '/' || l.peekN(1) == '-') &&
605
		l.peekN(2) >= '0' && l.peekN(2) <= '9' &&
606
		l.ch >= '1' && l.ch <= '9' {
607
		return validDay(l.peekN(2), l.peekN(3))
608
	}
609
	if (l.peekN(2) == '/' || l.peekN(2) == '-') &&
610
		l.peekN(3) >= '0' && l.peekN(3) <= '9' {
611
		m := int(l.ch-'0')*10 + int(l.peekN(1)-'0')
612
		return m >= 1 && m <= 12 && validDay(l.peekN(3), l.peekN(4))
613
	}
614
	return false
615
}
616
617
// validDay reports whether the one- or two-digit day that starts with the
// byte first lies in 1..31. second is taken as the second digit only when it
// is an ASCII digit; any other byte (a separator, a space, 0 at the end of
// the input) means the day has a single digit.
//
// first itself must be an ASCII digit. Without that check a byte such as ':'
// would be read as the value 10 and accepted as a day.
func validDay(first, second byte) bool {
	if first < '0' || first > '9' {
		return false
	}
	d := int(first - '0')
	if second >= '0' && second <= '9' {
		d = d*10 + int(second-'0')
	}
	return d >= 1 && d <= 31
}
624
625
// isTime reports whether a time starts at the current position, recognised
// as two ASCII digits followed by ':'. It only looks ahead; nothing is
// consumed.
//
// The second byte is checked to be a digit as well, so input such as "1x:30"
// is not treated as a time.
func (l *Lexer) isTime() bool {
	if !l.isDigit() {
		return false
	}
	second := l.peekN(1)
	return second >= '0' && second <= '9' && l.peekN(2) == ':'
}
631
632
// lexTime consumes a run of digits and ':' and returns it as one TIME token.
func (l *Lexer) lexTime() token.Token {
	start := l.save()
	for {
		if !l.isDigit() && l.ch != ':' {
			break
		}
		l.advance()
	}
	lit := string(l.input[start.offset:l.pos])
	return token.Token{Type: token.TIME, Literal: lit, Span: l.span(start)}
}
639
640
// savedPos is a snapshot of the lexer position (byte offset, line and
// column), taken where a token starts.
type savedPos struct {
	offset int
	line   int
	col    int
}
641
642
// save captures the current byte offset, line and column.
func (l *Lexer) save() savedPos {
	return savedPos{
		offset: l.pos,
		line:   l.line,
		col:    l.col,
	}
}
645
646
func (l *Lexer) span(s savedPos) token.Span {
647
	return token.Span{
648
		Start: token.Pos{File: l.file, Offset: s.offset, Line: s.line, Col: s.col},
649
		End:   token.Pos{File: l.file, Offset: l.pos, Line: l.line, Col: l.col},
650
	}
651
}
652
653
// token builds a zero-width token of the given kind and literal at the
// current position, without consuming any input.
func (l *Lexer) token(kind token.Type, literal string) token.Token {
	return token.Token{
		Type:    kind,
		Literal: literal,
		Span:    l.span(l.save()),
	}
}
657
658
func (l *Lexer) keyword(s string) token.Type {
659
	switch s {
660
	case "comment":
661
		return token.COMMENTKW
662
	case "account":
663
		return token.ACCOUNT
664
	case "commodity":
665
		return token.COMMODITY
666
	case "include":
667
		return token.INCLUDE
668
	case "alias":
669
		return token.ALIAS
670
	case "payee":
671
		return token.PAYEE
672
	case "tag":
673
		return token.TAG
674
	case "apply":
675
		return token.APPLY
676
	case "end":
677
		return token.END
678
	case "Y", "year":
679
		return token.YEAR
680
	case "decimal-mark":
681
		return token.DECIMALMARK
682
	case "D":
683
		return token.D
684
	case "P":
685
		return token.P
686
	case "N":
687
		return token.N
688
	default:
689
		return token.ILLEGAL
690
	}
691
}