rss-tools: vendor/golang.org/x/text/internal/language/parse.go (a5ac52722b131734c74504b6e6f4d9900536cac7)

1

// Copyright 2013 The Go Authors. All rights reserved.

2

// Use of this source code is governed by a BSD-style

3

// license that can be found in the LICENSE file.

4

5

package language

6

7

import (

8

	"bytes"

9

	"errors"

10

	"fmt"

11

	"sort"

12

13

	"golang.org/x/text/internal/tag"

14

15

16

// isAlpha returns true if the byte is not a digit.

17

// b must be an ASCII letter or digit.

18

func isAlpha(b byte) bool {

19

	return b > '9'

20

21

22

// isAlphaNum returns true if the string contains only ASCII letters or digits.

23

func isAlphaNum(s []byte) bool {

24

	for _, c := range s {

25

		if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') {

26

			return false

27

28

29

	return true

30

31

32

// ErrSyntax is returned by any of the parsing functions when the

33

// input is not well-formed, according to BCP 47.

34

// TODO: return the position at which the syntax error occurred?

35

var ErrSyntax = errors.New("language: tag is not well-formed")

36

37

// ErrDuplicateKey is returned when a tag contains the same key twice with

38

// different values in the -u section.

39

var ErrDuplicateKey = errors.New("language: different values for same key in -u extension")

40

41

// ValueError is returned by any of the parsing functions when the

42

// input is well-formed but the respective subtag is not recognized

43

// as a valid value.

44

type ValueError struct {

45

	v [8]byte

46

47

48

// NewValueError creates a new ValueError.

49

func NewValueError(tag []byte) ValueError {

50

	var e ValueError

51

	copy(e.v[:], tag)

52

	return e

53

54

55

func (e ValueError) tag() []byte {

56

	n := bytes.IndexByte(e.v[:], 0)

57

	if n == -1 {

58

		n = 8

59

60

	return e.v[:n]

61

62

63

// Error implements the error interface.

64

func (e ValueError) Error() string {

65

	return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag())

66

67

68

// Subtag returns the subtag for which the error occurred.

69

func (e ValueError) Subtag() string {

70

	return string(e.tag())

71

72

73

// scanner is used to scan BCP 47 tokens, which are separated by _ or -.

74

type scanner struct {

75

	b     []byte

76

	bytes [max99thPercentileSize]byte

77

	token []byte

78

	start int // start position of the current token

79

	end   int // end position of the current token

80

	next  int // next point for scan

81

	err   error

82

	done  bool

83

84

85

func makeScannerString(s string) scanner {

86

	scan := scanner{}

87

	if len(s) <= len(scan.bytes) {

88

		scan.b = scan.bytes[:copy(scan.bytes[:], s)]

89

	} else {

90

		scan.b = []byte(s)

91

92

	scan.init()

93

	return scan

94

95

96

// makeScanner returns a scanner using b as the input buffer.

97

// b is not copied and may be modified by the scanner routines.

98

func makeScanner(b []byte) scanner {

99

	scan := scanner{b: b}

100

	scan.init()

101

	return scan

102

103

104

func (s *scanner) init() {

105

	for i, c := range s.b {

106

		if c == '_' {

107

			s.b[i] = '-'

108

109

110

	s.scan()

111

112

113

// toLower converts the string between start and end to lower case.

114

func (s *scanner) toLower(start, end int) {

115

	for i := start; i < end; i++ {

116

		c := s.b[i]

117

		if 'A' <= c && c <= 'Z' {

118

			s.b[i] += 'a' - 'A'

119

120

121

122

123

func (s *scanner) setError(e error) {

124

	if s.err == nil || (e == ErrSyntax && s.err != ErrSyntax) {

125

		s.err = e

126

127

128

129

// resizeRange shrinks or grows the array at position oldStart such that

130

// a new string of size newSize can fit between oldStart and oldEnd.

131

// Sets the scan point to after the resized range.

132

func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {

133

	s.start = oldStart

134

	if end := oldStart + newSize; end != oldEnd {

135

		diff := end - oldEnd

136

		var b []byte

137

		if n := len(s.b) + diff; n > cap(s.b) {

138

			b = make([]byte, n)

139

			copy(b, s.b[:oldStart])

140

		} else {

141

			b = s.b[:n]

142

143

		copy(b[end:], s.b[oldEnd:])

144

		s.b = b

145

		s.next = end + (s.next - s.end)

146

		s.end = end

147

148

149

150

// replace replaces the current token with repl.

151

func (s *scanner) replace(repl string) {

152

	s.resizeRange(s.start, s.end, len(repl))

153

	copy(s.b[s.start:], repl)

154

155

156

// gobble removes the current token from the input.

157

// Caller must call scan after calling gobble.

158

func (s *scanner) gobble(e error) {

159

	s.setError(e)

160

	if s.start == 0 {

161

		s.b = s.b[:+copy(s.b, s.b[s.next:])]

162

		s.end = 0

163

	} else {

164

		s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]

165

		s.end = s.start - 1

166

167

	s.next = s.start

168

169

170

// deleteRange removes the given range from s.b before the current token.

171

func (s *scanner) deleteRange(start, end int) {

172

	s.b = s.b[:start+copy(s.b[start:], s.b[end:])]

173

	diff := end - start

174

	s.next -= diff

175

	s.start -= diff

176

	s.end -= diff

177

178

179

// scan parses the next token of a BCP 47 string.  Tokens that are larger

180

// than 8 characters or include non-alphanumeric characters result in an error

181

// and are gobbled and removed from the output.

182

// It returns the end position of the last token consumed.

183

func (s *scanner) scan() (end int) {

184

	end = s.end

185

	s.token = nil

186

	for s.start = s.next; s.next < len(s.b); {

187

		i := bytes.IndexByte(s.b[s.next:], '-')

188

		if i == -1 {

189

			s.end = len(s.b)

190

			s.next = len(s.b)

191

			i = s.end - s.start

192

		} else {

193

			s.end = s.next + i

194

			s.next = s.end + 1

195

196

		token := s.b[s.start:s.end]

197

		if i < 1 || i > 8 || !isAlphaNum(token) {

198

			s.gobble(ErrSyntax)

199

			continue

200

201

		s.token = token

202

		return end

203

204

	if n := len(s.b); n > 0 && s.b[n-1] == '-' {

205

		s.setError(ErrSyntax)

206

		s.b = s.b[:len(s.b)-1]

207

208

	s.done = true

209

	return end

210

211

212

// acceptMinSize parses multiple tokens of the given size or greater.

213

// It returns the end position of the last token consumed.

214

func (s *scanner) acceptMinSize(min int) (end int) {

215

	end = s.end

216

	s.scan()

217

	for ; len(s.token) >= min; s.scan() {

218

		end = s.end

219

220

	return end

221

222

223

// Parse parses the given BCP 47 string and returns a valid Tag. If parsing

224

// failed it returns an error and any part of the tag that could be parsed.

225

// If parsing succeeded but an unknown value was found, it returns

226

// ValueError. The Tag returned in this case is just stripped of the unknown

227

// value. All other values are preserved. It accepts tags in the BCP 47 format

228

// and extensions to this standard defined in

229

// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.

230

func Parse(s string) (t Tag, err error) {

231

	// TODO: consider supporting old-style locale key-value pairs.

232

	if s == "" {

233

		return Und, ErrSyntax

234

235

	defer func() {

236

		if recover() != nil {

237

			t = Und

238

			err = ErrSyntax

239

			return

240

241

}()

242

	if len(s) <= maxAltTaglen {

243

		b := [maxAltTaglen]byte{}

244

		for i, c := range s {

245

			// Generating invalid UTF-8 is okay as it won't match.

246

			if 'A' <= c && c <= 'Z' {

247

				c += 'a' - 'A'

248

			} else if c == '_' {

249

				c = '-'

250

251

			b[i] = byte(c)

252

253

		if t, ok := grandfathered(b); ok {

254

			return t, nil

255

256

257

	scan := makeScannerString(s)

258

	return parse(&scan, s)

259

260

261

func parse(scan *scanner, s string) (t Tag, err error) {

262

	t = Und

263

	var end int

264

	if n := len(scan.token); n <= 1 {

265

		scan.toLower(0, len(scan.b))

266

		if n == 0 || scan.token[0] != 'x' {

267

			return t, ErrSyntax

268

269

		end = parseExtensions(scan)

270

	} else if n >= 4 {

271

		return Und, ErrSyntax

272

	} else { // the usual case

273

		t, end = parseTag(scan, true)

274

		if n := len(scan.token); n == 1 {

275

			t.pExt = uint16(end)

276

			end = parseExtensions(scan)

277

		} else if end < len(scan.b) {

278

			scan.setError(ErrSyntax)

279

			scan.b = scan.b[:end]

280

281

282

	if int(t.pVariant) < len(scan.b) {

283

		if end < len(s) {

284

			s = s[:end]

285

286

		if len(s) > 0 && tag.Compare(s, scan.b) == 0 {

287

			t.str = s

288

		} else {

289

			t.str = string(scan.b)

290

291

	} else {

292

		t.pVariant, t.pExt = 0, 0

293

294

	return t, scan.err

295

296

297

// parseTag parses language, script, region and variants.

298

// It returns a Tag and the end position in the input that was parsed.

299

// If doNorm is true, then <lang>-<extlang> will be normalized to <extlang>.

300

func parseTag(scan *scanner, doNorm bool) (t Tag, end int) {

301

	var e error

302

	// TODO: set an error if an unknown lang, script or region is encountered.

303

	t.LangID, e = getLangID(scan.token)

304

	scan.setError(e)

305

	scan.replace(t.LangID.String())

306

	langStart := scan.start

307

	end = scan.scan()

308

	for len(scan.token) == 3 && isAlpha(scan.token[0]) {

309

		// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent

310

		// to a tag of the form <extlang>.

311

		if doNorm {

312

			lang, e := getLangID(scan.token)

313

			if lang != 0 {

314

				t.LangID = lang

315

				langStr := lang.String()

316

				copy(scan.b[langStart:], langStr)

317

				scan.b[langStart+len(langStr)] = '-'

318

				scan.start = langStart + len(langStr) + 1

319

320

			scan.gobble(e)

321

322

		end = scan.scan()

323

324

	if len(scan.token) == 4 && isAlpha(scan.token[0]) {

325

		t.ScriptID, e = getScriptID(script, scan.token)

326

		if t.ScriptID == 0 {

327

			scan.gobble(e)

328

329

		end = scan.scan()

330

331

	if n := len(scan.token); n >= 2 && n <= 3 {

332

		t.RegionID, e = getRegionID(scan.token)

333

		if t.RegionID == 0 {

334

			scan.gobble(e)

335

		} else {

336

			scan.replace(t.RegionID.String())

337

338

		end = scan.scan()

339

340

	scan.toLower(scan.start, len(scan.b))

341

	t.pVariant = byte(end)

342

	end = parseVariants(scan, end, t)

343

	t.pExt = uint16(end)

344

	return t, end

345

346

347

var separator = []byte{'-'}

348

349

// parseVariants scans tokens as long as each token is a valid variant string.

350

// Duplicate variants are removed.

351

func parseVariants(scan *scanner, end int, t Tag) int {

352

	start := scan.start

353

	varIDBuf := [4]uint8{}

354

	variantBuf := [4][]byte{}

355

	varID := varIDBuf[:0]

356

	variant := variantBuf[:0]

357

	last := -1

358

	needSort := false

359

	for ; len(scan.token) >= 4; scan.scan() {

360

		// TODO: measure the impact of needing this conversion and redesign

361

		// the data structure if there is an issue.

362

		v, ok := variantIndex[string(scan.token)]

363

		if !ok {

364

			// unknown variant

365

			// TODO: allow user-defined variants?

366

			scan.gobble(NewValueError(scan.token))

367

			continue

368

369

		varID = append(varID, v)

370

		variant = append(variant, scan.token)

371

		if !needSort {

372

			if last < int(v) {

373

				last = int(v)

374

			} else {

375

				needSort = true

376

				// There are no legal combinations of more than 7 variants

377

				// (and this is by no means a useful sequence).

378

				const maxVariants = 8

379

				if len(varID) > maxVariants {

380

					break

381

382

383

384

		end = scan.end

385

386

	if needSort {

387

		sort.Sort(variantsSort{varID, variant})

388

		k, l := 0, -1

389

		for i, v := range varID {

390

			w := int(v)

391

			if l == w {

392

				// Remove duplicates.

393

				continue

394

395

			varID[k] = varID[i]

396

			variant[k] = variant[i]

397

k++

398

			l = w

399

400

		if str := bytes.Join(variant[:k], separator); len(str) == 0 {

401

			end = start - 1

402

		} else {

403

			scan.resizeRange(start, end, len(str))

404

			copy(scan.b[scan.start:], str)

405

			end = scan.end

406

407

408

	return end

409

410

411

type variantsSort struct {

412

	i []uint8

413

	v [][]byte

414

415

416

func (s variantsSort) Len() int {

417

	return len(s.i)

418

419

420

func (s variantsSort) Swap(i, j int) {

421

	s.i[i], s.i[j] = s.i[j], s.i[i]

422

	s.v[i], s.v[j] = s.v[j], s.v[i]

423

424

425

func (s variantsSort) Less(i, j int) bool {

426

	return s.i[i] < s.i[j]

427

428

429

type bytesSort struct {

430

	b [][]byte

431

	n int // first n bytes to compare

432

433

434

func (b bytesSort) Len() int {

435

	return len(b.b)

436

437

438

func (b bytesSort) Swap(i, j int) {

439

	b.b[i], b.b[j] = b.b[j], b.b[i]

440

441

442

func (b bytesSort) Less(i, j int) bool {

443

	for k := 0; k < b.n; k++ {

444

		if b.b[i][k] == b.b[j][k] {

445

			continue

446

447

		return b.b[i][k] < b.b[j][k]

448

449

	return false

450

451

452

// parseExtensions parses and normalizes the extensions in the buffer.

453

// It returns the last position of scan.b that is part of any extension.

454

// It also trims scan.b to remove excess parts accordingly.

455

func parseExtensions(scan *scanner) int {

456

	start := scan.start

457

	exts := [][]byte{}

458

	private := []byte{}

459

	end := scan.end

460

	for len(scan.token) == 1 {

461

		extStart := scan.start

462

		ext := scan.token[0]

463

		end = parseExtension(scan)

464

		extension := scan.b[extStart:end]

465

		if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {

466

			scan.setError(ErrSyntax)

467

			end = extStart

468

			continue

469

		} else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {

470

			scan.b = scan.b[:end]

471

			return end

472

		} else if ext == 'x' {

473

			private = extension

474

			break

475

476

		exts = append(exts, extension)

477

478

	sort.Sort(bytesSort{exts, 1})

479

	if len(private) > 0 {

480

		exts = append(exts, private)

481

482

	scan.b = scan.b[:start]

483

	if len(exts) > 0 {

484

		scan.b = append(scan.b, bytes.Join(exts, separator)...)

485

	} else if start > 0 {

486

		// Strip trailing '-'.

487

		scan.b = scan.b[:start-1]

488

489

	return end

490

491

492

// parseExtension parses a single extension and returns the position of

493

// the extension end.

494

func parseExtension(scan *scanner) int {

495

	start, end := scan.start, scan.end

496

	switch scan.token[0] {

497

	case 'u': // https://www.ietf.org/rfc/rfc6067.txt

498

		attrStart := end

499

		scan.scan()

500

		for last := []byte{}; len(scan.token) > 2; scan.scan() {

501

			if bytes.Compare(scan.token, last) != -1 {

502

				// Attributes are unsorted. Start over from scratch.

503

				p := attrStart + 1

504

				scan.next = p

505

				attrs := [][]byte{}

506

				for scan.scan(); len(scan.token) > 2; scan.scan() {

507

					attrs = append(attrs, scan.token)

508

					end = scan.end

509

510

				sort.Sort(bytesSort{attrs, 3})

511

				copy(scan.b[p:], bytes.Join(attrs, separator))

512

				break

513

514

			last = scan.token

515

			end = scan.end

516

517

		// Scan key-type sequences. A key is of length 2 and may be followed

518

		// by 0 or more "type" subtags from 3 to the maximum of 8 letters.

519

		var last, key []byte

520

		for attrEnd := end; len(scan.token) == 2; last = key {

521

			key = scan.token

522

			end = scan.end

523

			for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {

524

				end = scan.end

525

526

			// TODO: check key value validity

527

			if bytes.Compare(key, last) != 1 || scan.err != nil {

528

				// We have an invalid key or the keys are not sorted.

529

				// Start scanning keys from scratch and reorder.

530

				p := attrEnd + 1

531

				scan.next = p

532

				keys := [][]byte{}

533

				for scan.scan(); len(scan.token) == 2; {

534

					keyStart := scan.start

535

					end = scan.end

536

					for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {

537

						end = scan.end

538

539

					keys = append(keys, scan.b[keyStart:end])

540

541

				sort.Stable(bytesSort{keys, 2})

542

				if n := len(keys); n > 0 {

543

					k := 0

544

					for i := 1; i < n; i++ {

545

						if !bytes.Equal(keys[k][:2], keys[i][:2]) {

546

k++

547

							keys[k] = keys[i]

548

						} else if !bytes.Equal(keys[k], keys[i]) {

549

							scan.setError(ErrDuplicateKey)

550

551

552

					keys = keys[:k+1]

553

554

				reordered := bytes.Join(keys, separator)

555

				if e := p + len(reordered); e < end {

556

					scan.deleteRange(e, end)

557

					end = e

558

559

				copy(scan.b[p:], reordered)

560

				break

561

562

563

	case 't': // https://www.ietf.org/rfc/rfc6497.txt

564

		scan.scan()

565

		if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {

566

			_, end = parseTag(scan, false)

567

			scan.toLower(start, end)

568

569

		for len(scan.token) == 2 && !isAlpha(scan.token[1]) {

570

			end = scan.acceptMinSize(3)

571

572

	case 'x':

573

		end = scan.acceptMinSize(1)

574

	default:

575

		end = scan.acceptMinSize(2)

576

577

	return end

578

579

580

// getExtension returns the end position and the text of the extension at position p.

581

func getExtension(s string, p int) (end int, ext string) {

582

	if s[p] == '-' {

583

p++

584

585

	if s[p] == 'x' {

586

		return len(s), s[p:]

587

588

	end = nextExtension(s, p)

589

	return end, s[p:end]

590

591

592

// nextExtension finds the next extension within the string, searching

593

// for the -<char>- pattern from position p.

594

// In the vast majority of cases, language tags will have at most

595

// one extension and extensions tend to be small.

596

func nextExtension(s string, p int) int {

597

	for n := len(s) - 3; p < n; {

598

		if s[p] == '-' {

599

			if s[p+2] == '-' {

600

				return p

601

602

			p += 3

603

		} else {

604

p++

605

606

607

	return len(s)

608

1	// Copyright 2013 The Go Authors. All rights reserved.
2	// Use of this source code is governed by a BSD-style
3	// license that can be found in the LICENSE file.
4
5	package language
6
7	import (
8	"bytes"
9	"errors"
10	"fmt"
11	"sort"
12
13	"golang.org/x/text/internal/tag"
14	)
15
// isAlpha reports whether b is a letter rather than a digit.
// The caller must guarantee that b is an ASCII letter or digit; under that
// precondition every byte greater than '9' is a letter.
func isAlpha(b byte) bool {
	const lastDigit = '9'
	return lastDigit < b
}
21
// isAlphaNum reports whether every byte of s is an ASCII letter or digit.
// An empty slice yields true.
func isAlphaNum(s []byte) bool {
	for i := 0; i < len(s); i++ {
		switch c := s[i]; {
		case 'a' <= c && c <= 'z':
		case 'A' <= c && c <= 'Z':
		case '0' <= c && c <= '9':
		default:
			return false
		}
	}
	return true
}
31
var (
	// ErrSyntax is the error reported by the parsing functions for input
	// that is not well-formed according to BCP 47.
	// TODO: return the position at which the syntax error occurred?
	ErrSyntax = errors.New("language: tag is not well-formed")

	// ErrDuplicateKey is the error reported when the -u section of a tag
	// contains the same key twice with different values.
	ErrDuplicateKey = errors.New("language: different values for same key in -u extension")
)
40
// ValueError is the error reported by the parsing functions when the input
// is well-formed but a subtag is not recognized as a valid value.
type ValueError struct {
	// v holds the offending subtag, zero-padded to 8 bytes.
	v [8]byte
}

// NewValueError creates a new ValueError for the given subtag.
// At most the first 8 bytes of tag are retained.
func NewValueError(tag []byte) ValueError {
	e := ValueError{}
	copy(e.v[:], tag)
	return e
}

// tag returns the stored subtag up to, but not including, the first zero
// byte. If there is no zero byte, all 8 bytes are returned.
func (e ValueError) tag() []byte {
	if n := bytes.IndexByte(e.v[:], 0); n >= 0 {
		return e.v[:n]
	}
	return e.v[:]
}

// Error implements the error interface.
func (e ValueError) Error() string {
	subtag := e.tag()
	return fmt.Sprintf("language: subtag %q is well-formed but unknown", subtag)
}

// Subtag returns the subtag for which the error occurred.
func (e ValueError) Subtag() string {
	subtag := e.tag()
	return string(subtag)
}
72
73	// scanner is used to scan BCP 47 tokens, which are separated by _ or -.
74	type scanner struct {
75	b []byte
76	bytes [max99thPercentileSize]byte
77	token []byte
78	start int // start position of the current token
79	end int // end position of the current token
80	next int // next point for scan
81	err error
82	done bool
83	}
84
85	func makeScannerString(s string) scanner {
86	scan := scanner{}
87	if len(s) <= len(scan.bytes) {
88	scan.b = scan.bytes[:copy(scan.bytes[:], s)]
89	} else {
90	scan.b = []byte(s)
91	}
92	scan.init()
93	return scan
94	}
95
96	// makeScanner returns a scanner using b as the input buffer.
97	// b is not copied and may be modified by the scanner routines.
98	func makeScanner(b []byte) scanner {
99	scan := scanner{b: b}
100	scan.init()
101	return scan
102	}
103
104	func (s *scanner) init() {
105	for i, c := range s.b {
106	if c == '_' {
107	s.b[i] = '-'
108	}
109	}
110	s.scan()
111	}
112
113	// restToLower converts the string between start and end to lower case.
114	func (s *scanner) toLower(start, end int) {
115	for i := start; i < end; i++ {
116	c := s.b[i]
117	if 'A' <= c && c <= 'Z' {
118	s.b[i] += 'a' - 'A'
119	}
120	}
121	}
122
123	func (s *scanner) setError(e error) {
124	if s.err == nil \|\| (e == ErrSyntax && s.err != ErrSyntax) {
125	s.err = e
126	}
127	}
128
129	// resizeRange shrinks or grows the array at position oldStart such that
130	// a new string of size newSize can fit between oldStart and oldEnd.
131	// Sets the scan point to after the resized range.
132	func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
133	s.start = oldStart
134	if end := oldStart + newSize; end != oldEnd {
135	diff := end - oldEnd
136	var b []byte
137	if n := len(s.b) + diff; n > cap(s.b) {
138	b = make([]byte, n)
139	copy(b, s.b[:oldStart])
140	} else {
141	b = s.b[:n]
142	}
143	copy(b[end:], s.b[oldEnd:])
144	s.b = b
145	s.next = end + (s.next - s.end)
146	s.end = end
147	}
148	}
149
150	// replace replaces the current token with repl.
151	func (s *scanner) replace(repl string) {
152	s.resizeRange(s.start, s.end, len(repl))
153	copy(s.b[s.start:], repl)
154	}
155
156	// gobble removes the current token from the input.
157	// Caller must call scan after calling gobble.
158	func (s *scanner) gobble(e error) {
159	s.setError(e)
160	if s.start == 0 {
161	s.b = s.b[:+copy(s.b, s.b[s.next:])]
162	s.end = 0
163	} else {
164	s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]
165	s.end = s.start - 1
166	}
167	s.next = s.start
168	}
169
170	// deleteRange removes the given range from s.b before the current token.
171	func (s *scanner) deleteRange(start, end int) {
172	s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
173	diff := end - start
174	s.next -= diff
175	s.start -= diff
176	s.end -= diff
177	}
178
179	// scan parses the next token of a BCP 47 string. Tokens that are larger
180	// than 8 characters or include non-alphanumeric characters result in an error
181	// and are gobbled and removed from the output.
182	// It returns the end position of the last token consumed.
183	func (s *scanner) scan() (end int) {
184	end = s.end
185	s.token = nil
186	for s.start = s.next; s.next < len(s.b); {
187	i := bytes.IndexByte(s.b[s.next:], '-')
188	if i == -1 {
189	s.end = len(s.b)
190	s.next = len(s.b)
191	i = s.end - s.start
192	} else {
193	s.end = s.next + i
194	s.next = s.end + 1
195	}
196	token := s.b[s.start:s.end]
197	if i < 1 \|\| i > 8 \|\| !isAlphaNum(token) {
198	s.gobble(ErrSyntax)
199	continue
200	}
201	s.token = token
202	return end
203	}
204	if n := len(s.b); n > 0 && s.b[n-1] == '-' {
205	s.setError(ErrSyntax)
206	s.b = s.b[:len(s.b)-1]
207	}
208	s.done = true
209	return end
210	}
211
212	// acceptMinSize parses multiple tokens of the given size or greater.
213	// It returns the end position of the last token consumed.
214	func (s *scanner) acceptMinSize(min int) (end int) {
215	end = s.end
216	s.scan()
217	for ; len(s.token) >= min; s.scan() {
218	end = s.end
219	}
220	return end
221	}
222
223	// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
224	// failed it returns an error and any part of the tag that could be parsed.
225	// If parsing succeeded but an unknown value was found, it returns
226	// ValueError. The Tag returned in this case is just stripped of the unknown
227	// value. All other values are preserved. It accepts tags in the BCP 47 format
228	// and extensions to this standard defined in
229	// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
230	func Parse(s string) (t Tag, err error) {
231	// TODO: consider supporting old-style locale key-value pairs.
232	if s == "" {
233	return Und, ErrSyntax
234	}
235	defer func() {
236	if recover() != nil {
237	t = Und
238	err = ErrSyntax
239	return
240	}
241	}()
242	if len(s) <= maxAltTaglen {
243	b := [maxAltTaglen]byte{}
244	for i, c := range s {
245	// Generating invalid UTF-8 is okay as it won't match.
246	if 'A' <= c && c <= 'Z' {
247	c += 'a' - 'A'
248	} else if c == '_' {
249	c = '-'
250	}
251	b[i] = byte(c)
252	}
253	if t, ok := grandfathered(b); ok {
254	return t, nil
255	}
256	}
257	scan := makeScannerString(s)
258	return parse(&scan, s)
259	}
260
261	func parse(scan *scanner, s string) (t Tag, err error) {
262	t = Und
263	var end int
264	if n := len(scan.token); n <= 1 {
265	scan.toLower(0, len(scan.b))
266	if n == 0 \|\| scan.token[0] != 'x' {
267	return t, ErrSyntax
268	}
269	end = parseExtensions(scan)
270	} else if n >= 4 {
271	return Und, ErrSyntax
272	} else { // the usual case
273	t, end = parseTag(scan, true)
274	if n := len(scan.token); n == 1 {
275	t.pExt = uint16(end)
276	end = parseExtensions(scan)
277	} else if end < len(scan.b) {
278	scan.setError(ErrSyntax)
279	scan.b = scan.b[:end]
280	}
281	}
282	if int(t.pVariant) < len(scan.b) {
283	if end < len(s) {
284	s = s[:end]
285	}
286	if len(s) > 0 && tag.Compare(s, scan.b) == 0 {
287	t.str = s
288	} else {
289	t.str = string(scan.b)
290	}
291	} else {
292	t.pVariant, t.pExt = 0, 0
293	}
294	return t, scan.err
295	}
296
297	// parseTag parses language, script, region and variants.
298	// It returns a Tag and the end position in the input that was parsed.
299	// If doNorm is true, then <lang>-<extlang> will be normalized to <extlang>.
300	func parseTag(scan *scanner, doNorm bool) (t Tag, end int) {
301	var e error
302	// TODO: set an error if an unknown lang, script or region is encountered.
303	t.LangID, e = getLangID(scan.token)
304	scan.setError(e)
305	scan.replace(t.LangID.String())
306	langStart := scan.start
307	end = scan.scan()
308	for len(scan.token) == 3 && isAlpha(scan.token[0]) {
309	// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
310	// to a tag of the form <extlang>.
311	if doNorm {
312	lang, e := getLangID(scan.token)
313	if lang != 0 {
314	t.LangID = lang
315	langStr := lang.String()
316	copy(scan.b[langStart:], langStr)
317	scan.b[langStart+len(langStr)] = '-'
318	scan.start = langStart + len(langStr) + 1
319	}
320	scan.gobble(e)
321	}
322	end = scan.scan()
323	}
324	if len(scan.token) == 4 && isAlpha(scan.token[0]) {
325	t.ScriptID, e = getScriptID(script, scan.token)
326	if t.ScriptID == 0 {
327	scan.gobble(e)
328	}
329	end = scan.scan()
330	}
331	if n := len(scan.token); n >= 2 && n <= 3 {
332	t.RegionID, e = getRegionID(scan.token)
333	if t.RegionID == 0 {
334	scan.gobble(e)
335	} else {
336	scan.replace(t.RegionID.String())
337	}
338	end = scan.scan()
339	}
340	scan.toLower(scan.start, len(scan.b))
341	t.pVariant = byte(end)
342	end = parseVariants(scan, end, t)
343	t.pExt = uint16(end)
344	return t, end
345	}
346
// separator is the subtag separator, in the form expected by bytes.Join.
var separator = []byte("-")
348
349	// parseVariants scans tokens as long as each token is a valid variant string.
350	// Duplicate variants are removed.
351	func parseVariants(scan *scanner, end int, t Tag) int {
352	start := scan.start
353	varIDBuf := [4]uint8{}
354	variantBuf := [4][]byte{}
355	varID := varIDBuf[:0]
356	variant := variantBuf[:0]
357	last := -1
358	needSort := false
359	for ; len(scan.token) >= 4; scan.scan() {
360	// TODO: measure the impact of needing this conversion and redesign
361	// the data structure if there is an issue.
362	v, ok := variantIndex[string(scan.token)]
363	if !ok {
364	// unknown variant
365	// TODO: allow user-defined variants?
366	scan.gobble(NewValueError(scan.token))
367	continue
368	}
369	varID = append(varID, v)
370	variant = append(variant, scan.token)
371	if !needSort {
372	if last < int(v) {
373	last = int(v)
374	} else {
375	needSort = true
376	// There is no legal combinations of more than 7 variants
377	// (and this is by no means a useful sequence).
378	const maxVariants = 8
379	if len(varID) > maxVariants {
380	break
381	}
382	}
383	}
384	end = scan.end
385	}
386	if needSort {
387	sort.Sort(variantsSort{varID, variant})
388	k, l := 0, -1
389	for i, v := range varID {
390	w := int(v)
391	if l == w {
392	// Remove duplicates.
393	continue
394	}
395	varID[k] = varID[i]
396	variant[k] = variant[i]
397	k++
398	l = w
399	}
400	if str := bytes.Join(variant[:k], separator); len(str) == 0 {
401	end = start - 1
402	} else {
403	scan.resizeRange(start, end, len(str))
404	copy(scan.b[scan.start:], str)
405	end = scan.end
406	}
407	}
408	return end
409	}
410
// variantsSort implements sort.Interface over two parallel slices: the
// elements of v are ordered by the corresponding values in i.
type variantsSort struct {
	i []uint8  // sort keys
	v [][]byte // values, kept in step with i
}

// Len returns the number of elements.
func (s variantsSort) Len() int {
	return len(s.i)
}

// Swap exchanges the elements at positions i and j in both slices.
func (s variantsSort) Swap(i, j int) {
	ki, kj := s.i[i], s.i[j]
	s.i[i], s.i[j] = kj, ki
	vi, vj := s.v[i], s.v[j]
	s.v[i], s.v[j] = vj, vi
}

// Less reports whether the key at position i is smaller than the key at
// position j.
func (s variantsSort) Less(i, j int) bool {
	return s.i[j] > s.i[i]
}
428
// bytesSort implements sort.Interface for a slice of byte slices, ordering
// the elements by their first n bytes only. Every element that takes part in
// a comparison must provide the bytes that are inspected.
type bytesSort struct {
	b [][]byte
	n int // first n bytes to compare
}

// Len returns the number of elements.
func (b bytesSort) Len() int {
	return len(b.b)
}

// Swap exchanges the elements at positions i and j.
func (b bytesSort) Swap(i, j int) {
	x, y := b.b[i], b.b[j]
	b.b[i], b.b[j] = y, x
}

// Less reports whether the first n bytes of element i sort before the first
// n bytes of element j. Elements that agree on those bytes are not less than
// each other.
func (b bytesSort) Less(i, j int) bool {
	for k := 0; k < b.n; k++ {
		x, y := b.b[i][k], b.b[j][k]
		if x != y {
			return x < y
		}
	}
	return false
}
451
452	// parseExtensions parses and normalizes the extensions in the buffer.
453	// It returns the last position of scan.b that is part of any extension.
454	// It also trims scan.b to remove excess parts accordingly.
455	func parseExtensions(scan *scanner) int {
456	start := scan.start
457	exts := [][]byte{}
458	private := []byte{}
459	end := scan.end
460	for len(scan.token) == 1 {
461	extStart := scan.start
462	ext := scan.token[0]
463	end = parseExtension(scan)
464	extension := scan.b[extStart:end]
465	if len(extension) < 3 \|\| (ext != 'x' && len(extension) < 4) {
466	scan.setError(ErrSyntax)
467	end = extStart
468	continue
469	} else if start == extStart && (ext == 'x' \|\| scan.start == len(scan.b)) {
470	scan.b = scan.b[:end]
471	return end
472	} else if ext == 'x' {
473	private = extension
474	break
475	}
476	exts = append(exts, extension)
477	}
478	sort.Sort(bytesSort{exts, 1})
479	if len(private) > 0 {
480	exts = append(exts, private)
481	}
482	scan.b = scan.b[:start]
483	if len(exts) > 0 {
484	scan.b = append(scan.b, bytes.Join(exts, separator)...)
485	} else if start > 0 {
486	// Strip trailing '-'.
487	scan.b = scan.b[:start-1]
488	}
489	return end
490	}
491
492	// parseExtension parses a single extension and returns the position of
493	// the extension end.
494	func parseExtension(scan *scanner) int {
495	start, end := scan.start, scan.end
496	switch scan.token[0] {
497	case 'u': // https://www.ietf.org/rfc/rfc6067.txt
498	attrStart := end
499	scan.scan()
500	for last := []byte{}; len(scan.token) > 2; scan.scan() {
501	if bytes.Compare(scan.token, last) != -1 {
502	// Attributes are unsorted. Start over from scratch.
503	p := attrStart + 1
504	scan.next = p
505	attrs := [][]byte{}
506	for scan.scan(); len(scan.token) > 2; scan.scan() {
507	attrs = append(attrs, scan.token)
508	end = scan.end
509	}
510	sort.Sort(bytesSort{attrs, 3})
511	copy(scan.b[p:], bytes.Join(attrs, separator))
512	break
513	}
514	last = scan.token
515	end = scan.end
516	}
517	// Scan key-type sequences. A key is of length 2 and may be followed
518	// by 0 or more "type" subtags from 3 to the maximum of 8 letters.
519	var last, key []byte
520	for attrEnd := end; len(scan.token) == 2; last = key {
521	key = scan.token
522	end = scan.end
523	for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
524	end = scan.end
525	}
526	// TODO: check key value validity
527	if bytes.Compare(key, last) != 1 \|\| scan.err != nil {
528	// We have an invalid key or the keys are not sorted.
529	// Start scanning keys from scratch and reorder.
530	p := attrEnd + 1
531	scan.next = p
532	keys := [][]byte{}
533	for scan.scan(); len(scan.token) == 2; {
534	keyStart := scan.start
535	end = scan.end
536	for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
537	end = scan.end
538	}
539	keys = append(keys, scan.b[keyStart:end])
540	}
541	sort.Stable(bytesSort{keys, 2})
542	if n := len(keys); n > 0 {
543	k := 0
544	for i := 1; i < n; i++ {
545	if !bytes.Equal(keys[k][:2], keys[i][:2]) {
546	k++
547	keys[k] = keys[i]
548	} else if !bytes.Equal(keys[k], keys[i]) {
549	scan.setError(ErrDuplicateKey)
550	}
551	}
552	keys = keys[:k+1]
553	}
554	reordered := bytes.Join(keys, separator)
555	if e := p + len(reordered); e < end {
556	scan.deleteRange(e, end)
557	end = e
558	}
559	copy(scan.b[p:], reordered)
560	break
561	}
562	}
563	case 't': // https://www.ietf.org/rfc/rfc6497.txt
564	scan.scan()
565	if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
566	_, end = parseTag(scan, false)
567	scan.toLower(start, end)
568	}
569	for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
570	end = scan.acceptMinSize(3)
571	}
572	case 'x':
573	end = scan.acceptMinSize(1)
574	default:
575	end = scan.acceptMinSize(2)
576	}
577	return end
578	}
579
580	// getExtension returns the name, body and end position of the extension.
581	func getExtension(s string, p int) (end int, ext string) {
582	if s[p] == '-' {
583	p++
584	}
585	if s[p] == 'x' {
586	return len(s), s[p:]
587	}
588	end = nextExtension(s, p)
589	return end, s[p:end]
590	}
591
// nextExtension finds the next extension within the string, searching
// for the -<char>- pattern from position p. It returns the position of the
// leading '-' of that pattern, or len(s) if there is none.
// In the vast majority of cases, language tags will have at most
// one extension and extensions tend to be small.
func nextExtension(s string, p int) int {
	limit := len(s) - 3
	for p < limit {
		if s[p] != '-' {
			p++
			continue
		}
		if s[p+2] == '-' {
			return p
		}
		p += 3
	}
	return len(s)
}