rss-tools: vendor/golang.org/x/text/encoding/unicode/unicode.go (master)

1

// Copyright 2013 The Go Authors. All rights reserved.

2

// Use of this source code is governed by a BSD-style

3

// license that can be found in the LICENSE file.

4

5

// Package unicode provides Unicode encodings such as UTF-16.

6

package unicode // import "golang.org/x/text/encoding/unicode"

7

8

import (

9

	"bytes"

10

	"errors"

11

	"unicode/utf16"

12

	"unicode/utf8"

13

14

	"golang.org/x/text/encoding"

15

	"golang.org/x/text/encoding/internal"

16

	"golang.org/x/text/encoding/internal/identifier"

17

	"golang.org/x/text/internal/utf8internal"

18

	"golang.org/x/text/runes"

19

	"golang.org/x/text/transform"

20

21

22

// TODO: I think the Transformers really should return errors on unmatched

23

// surrogate pairs and odd numbers of bytes. This is not required by RFC 2781,

24

// which leaves it open, but is suggested by WhatWG. It will allow for all error

25

// modes as defined by WhatWG: fatal, HTML and Replacement. This would require

26

// the introduction of some kind of error type for conveying the erroneous code

27

// point.

28

29

// UTF8 is the UTF-8 encoding. It neither removes nor adds byte order marks.

30

var UTF8 encoding.Encoding = utf8enc

31

32

// UTF8BOM is a UTF-8 encoding where the decoder strips a leading byte order

33

// mark while the encoder adds one.

34

//

35

// Some editors add a byte order mark as a signature to UTF-8 files. Although

36

// the byte order mark is not useful for detecting byte order in UTF-8, it is

37

// sometimes used as a convention to mark UTF-8-encoded files. This relies on

38

// the observation that the UTF-8 byte order mark is either an illegal or at

39

// least very unlikely sequence in any other character encoding.

40

var UTF8BOM encoding.Encoding = utf8bomEncoding{}

41

42

type utf8bomEncoding struct{}

43

44

func (utf8bomEncoding) String() string {

45

	return "UTF-8-BOM"

46

47

48

func (utf8bomEncoding) ID() (identifier.MIB, string) {

49

	return identifier.Unofficial, "x-utf8bom"

50

51

52

func (utf8bomEncoding) NewEncoder() *encoding.Encoder {

53

	return &encoding.Encoder{

54

		Transformer: &utf8bomEncoder{t: runes.ReplaceIllFormed()},

55

56

57

58

func (utf8bomEncoding) NewDecoder() *encoding.Decoder {

59

	return &encoding.Decoder{Transformer: &utf8bomDecoder{}}

60

61

62

var utf8enc = &internal.Encoding{

63

	Encoding: &internal.SimpleEncoding{Decoder: utf8Decoder{}, Encoder: runes.ReplaceIllFormed()},

64

	Name:     "UTF-8",

65

	MIB:      identifier.UTF8,

66

67

68

type utf8bomDecoder struct {

69

	checked bool

70

71

72

func (t *utf8bomDecoder) Reset() {

73

	t.checked = false

74

75

76

func (t *utf8bomDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {

77

	if !t.checked {

78

		if !atEOF && len(src) < len(utf8BOM) {

79

			if len(src) == 0 {

80

				return 0, 0, nil

81

82

			return 0, 0, transform.ErrShortSrc

83

84

		if bytes.HasPrefix(src, []byte(utf8BOM)) {

85

			nSrc += len(utf8BOM)

86

			src = src[len(utf8BOM):]

87

88

		t.checked = true

89

90

	nDst, n, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF)

91

	nSrc += n

92

	return nDst, nSrc, err

93

94

95

type utf8bomEncoder struct {

96

	written bool

97

	t       transform.Transformer

98

99

100

func (t *utf8bomEncoder) Reset() {

101

	t.written = false

102

	t.t.Reset()

103

104

105

func (t *utf8bomEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {

106

	if !t.written {

107

		if len(dst) < len(utf8BOM) {

108

			return nDst, 0, transform.ErrShortDst

109

110

		nDst = copy(dst, utf8BOM)

111

		t.written = true

112

113

	n, nSrc, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF)

114

	nDst += n

115

	return nDst, nSrc, err

116

117

118

type utf8Decoder struct{ transform.NopResetter }

119

120

func (utf8Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {

121

	var pSrc int // point from which to start copy in src

122

	var accept utf8internal.AcceptRange

123

124

	// The decoder can only make the input larger, not smaller.

125

	n := len(src)

126

	if len(dst) < n {

127

		err = transform.ErrShortDst

128

		n = len(dst)

129

		atEOF = false

130

131

	for nSrc < n {

132

		c := src[nSrc]

133

		if c < utf8.RuneSelf {

134

			nSrc++

135

			continue

136

137

		first := utf8internal.First[c]

138

		size := int(first & utf8internal.SizeMask)

139

		if first == utf8internal.FirstInvalid {

140

			goto handleInvalid // invalid starter byte

141

142

		accept = utf8internal.AcceptRanges[first>>utf8internal.AcceptShift]

143

		if nSrc+size > n {

144

			if !atEOF {

145

				// We may stop earlier than necessary here if the short sequence

146

				// has invalid bytes. Not checking for this simplifies the code

147

				// and may avoid duplicate computations in certain conditions.

148

				if err == nil {

149

					err = transform.ErrShortSrc

150

151

				break

152

153

			// Determine the maximal subpart of an ill-formed subsequence.

154

			switch {

155

			case nSrc+1 >= n || src[nSrc+1] < accept.Lo || accept.Hi < src[nSrc+1]:

156

				size = 1

157

			case nSrc+2 >= n || src[nSrc+2] < utf8internal.LoCB || utf8internal.HiCB < src[nSrc+2]:

158

				size = 2

159

			default:

160

				size = 3 // As we are short, the maximum is 3.

161

162

			goto handleInvalid

163

164

		if c = src[nSrc+1]; c < accept.Lo || accept.Hi < c {

165

			size = 1

166

			goto handleInvalid // invalid continuation byte

167

		} else if size == 2 {

168

		} else if c = src[nSrc+2]; c < utf8internal.LoCB || utf8internal.HiCB < c {

169

			size = 2

170

			goto handleInvalid // invalid continuation byte

171

		} else if size == 3 {

172

		} else if c = src[nSrc+3]; c < utf8internal.LoCB || utf8internal.HiCB < c {

173

			size = 3

174

			goto handleInvalid // invalid continuation byte

175

176

		nSrc += size

177

		continue

178

179

	handleInvalid:

180

		// Copy the scanned input so far.

181

		nDst += copy(dst[nDst:], src[pSrc:nSrc])

182

183

		// Append RuneError to the destination.

184

		const runeError = "\ufffd"

185

		if nDst+len(runeError) > len(dst) {

186

			return nDst, nSrc, transform.ErrShortDst

187

188

		nDst += copy(dst[nDst:], runeError)

189

190

		// Skip the maximal subpart of an ill-formed subsequence according to

191

		// the W3C standard way instead of the Go way. This Transform is

192

		// probably the only place in the text repo where it is warranted.

193

		nSrc += size

194

		pSrc = nSrc

195

196

		// Recompute the maximum source length.

197

		if sz := len(dst) - nDst; sz < len(src)-nSrc {

198

			err = transform.ErrShortDst

199

			n = nSrc + sz

200

			atEOF = false

201

202

203

	return nDst + copy(dst[nDst:], src[pSrc:nSrc]), nSrc, err

204

205

206

// UTF16 returns a UTF-16 Encoding for the given default endianness and byte

207

// order mark (BOM) policy.

208

//

209

// When decoding from UTF-16 to UTF-8, if the BOMPolicy is IgnoreBOM then

210

// neither BOMs U+FEFF nor noncharacters U+FFFE in the input stream will affect

211

// the endianness used for decoding, and will instead be output as their

212

// standard UTF-8 encodings: "\xef\xbb\xbf" and "\xef\xbf\xbe". If the BOMPolicy

213

// is UseBOM or ExpectBOM a starting BOM is not written to the UTF-8 output.

214

// Instead, it overrides the default endianness e for the remainder of the

215

// transformation. Any subsequent BOMs U+FEFF or noncharacters U+FFFE will not

216

// affect the endianness used, and will instead be output as their standard

217

// UTF-8 encodings. For UseBOM, if there is no starting BOM, it will proceed

218

// with the default Endianness. For ExpectBOM, in that case, the transformation

219

// will return early with an ErrMissingBOM error.

220

//

221

// When encoding from UTF-8 to UTF-16, a BOM will be inserted at the start of

222

// the output if the BOMPolicy is UseBOM or ExpectBOM. Otherwise, a BOM will not

223

// be inserted. The UTF-8 input does not need to contain a BOM.

224

//

225

// There is no concept of a 'native' endianness. If the UTF-16 data is produced

226

// and consumed in a greater context that implies a certain endianness, use

227

// IgnoreBOM. Otherwise, use ExpectBOM and always produce and consume a BOM.

228

//

229

// In the language of https://www.unicode.org/faq/utf_bom.html#bom10, IgnoreBOM

230

// corresponds to "Where the precise type of the data stream is known... the

231

// BOM should not be used" and ExpectBOM corresponds to "A particular

232

// protocol... may require use of the BOM".

233

func UTF16(e Endianness, b BOMPolicy) encoding.Encoding {

234

	return utf16Encoding{config{e, b}, mibValue[e][b&bomMask]}

235

236

237

// mibValue maps Endianness and BOMPolicy settings to MIB constants. Note that

238

// some configurations map to the same MIB identifier. RFC 2781 has requirements

239

// and recommendations. Some of the "configurations" are merely recommendations,

240

// so multiple configurations could match.

241

var mibValue = map[Endianness][numBOMValues]identifier.MIB{

242

	BigEndian: [numBOMValues]identifier.MIB{

243

		IgnoreBOM: identifier.UTF16BE,

244

		UseBOM:    identifier.UTF16, // BigEnding default is preferred by RFC 2781.

245

		// TODO: acceptBOM | strictBOM would map to UTF16BE as well.

246

},

247

	LittleEndian: [numBOMValues]identifier.MIB{

248

		IgnoreBOM: identifier.UTF16LE,

249

		UseBOM:    identifier.UTF16, // LittleEndian default is allowed and preferred on Windows.

250

		// TODO: acceptBOM | strictBOM would map to UTF16LE as well.

251

},

252

	// ExpectBOM is not widely used and has no valid MIB identifier.

253

254

255

// All lists a configuration for each IANA-defined UTF-16 variant.

256

var All = []encoding.Encoding{

257

	UTF8,

258

	UTF16(BigEndian, UseBOM),

259

	UTF16(BigEndian, IgnoreBOM),

260

	UTF16(LittleEndian, IgnoreBOM),

261

262

263

// BOMPolicy is a UTF-16 encoding's byte order mark policy.

264

type BOMPolicy uint8

265

266

const (

267

	writeBOM   BOMPolicy = 0x01

268

	acceptBOM  BOMPolicy = 0x02

269

	requireBOM BOMPolicy = 0x04

270

	bomMask    BOMPolicy = 0x07

271

272

	// HACK: numBOMValues == 8 triggers a bug in the 1.4 compiler (cannot have a

273

	// map of an array of length 8 of a type that is also used as a key or value

274

	// in another map). See golang.org/issue/11354.

275

	// TODO: consider changing this value back to 8 if the use of 1.4.* has

276

	// been minimized.

277

	numBOMValues = 8 + 1

278

279

	// IgnoreBOM means to ignore any byte order marks.

280

	IgnoreBOM BOMPolicy = 0

281

	// Common and RFC 2781-compliant interpretation for UTF-16BE/LE.

282

283

	// UseBOM means that the UTF-16 form may start with a byte order mark, which

284

	// will be used to override the default encoding.

285

	UseBOM BOMPolicy = writeBOM | acceptBOM

286

	// Common and RFC 2781-compliant interpretation for UTF-16.

287

288

	// ExpectBOM means that the UTF-16 form must start with a byte order mark,

289

	// which will be used to override the default encoding.

290

	ExpectBOM BOMPolicy = writeBOM | acceptBOM | requireBOM

291

	// Used in Java as Unicode (not to be confused with Java's UTF-16) and

292

	// ICU's UTF-16,version=1. Not compliant with RFC 2781.

293

294

	// TODO (maybe): strictBOM: BOM must match Endianness. This would allow:

295

	// - UTF-16(B|L)E,version=1: writeBOM | acceptBOM | requireBOM | strictBOM

296

	//    (UnicodeBig and UnicodeLittle in Java)

297

	// - RFC 2781-compliant, but less common interpretation for UTF-16(B|L)E:

298

	//    acceptBOM | strictBOM (e.g. assigned to CheckBOM).

299

	// This addition would be consistent with supporting ExpectBOM.

300

301

302

// Endianness is a UTF-16 encoding's default endianness.

303

type Endianness bool

304

305

const (

306

	// BigEndian is UTF-16BE.

307

	BigEndian Endianness = false

308

	// LittleEndian is UTF-16LE.

309

	LittleEndian Endianness = true

310

311

312

// ErrMissingBOM means that decoding UTF-16 input with ExpectBOM did not find a

313

// starting byte order mark.

314

var ErrMissingBOM = errors.New("encoding: missing byte order mark")

315

316

type utf16Encoding struct {

317

	config

318

	mib identifier.MIB

319

320

321

type config struct {

322

	endianness Endianness

323

	bomPolicy  BOMPolicy

324

325

326

func (u utf16Encoding) NewDecoder() *encoding.Decoder {

327

	return &encoding.Decoder{Transformer: &utf16Decoder{

328

		initial: u.config,

329

		current: u.config,

330

}}

331

332

333

func (u utf16Encoding) NewEncoder() *encoding.Encoder {

334

	return &encoding.Encoder{Transformer: &utf16Encoder{

335

		endianness:       u.endianness,

336

		initialBOMPolicy: u.bomPolicy,

337

		currentBOMPolicy: u.bomPolicy,

338

}}

339

340

341

func (u utf16Encoding) ID() (mib identifier.MIB, other string) {

342

	return u.mib, ""

343

344

345

func (u utf16Encoding) String() string {

346

	e, b := "B", ""

347

	if u.endianness == LittleEndian {

348

		e = "L"

349

350

	switch u.bomPolicy {

351

	case ExpectBOM:

352

		b = "Expect"

353

	case UseBOM:

354

		b = "Use"

355

	case IgnoreBOM:

356

		b = "Ignore"

357

358

	return "UTF-16" + e + "E (" + b + " BOM)"

359

360

361

type utf16Decoder struct {

362

	initial config

363

	current config

364

365

366

func (u *utf16Decoder) Reset() {

367

	u.current = u.initial

368

369

370

func (u *utf16Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {

371

	if len(src) < 2 && atEOF && u.current.bomPolicy&requireBOM != 0 {

372

		return 0, 0, ErrMissingBOM

373

374

	if len(src) == 0 {

375

		return 0, 0, nil

376

377

	if len(src) >= 2 && u.current.bomPolicy&acceptBOM != 0 {

378

		switch {

379

		case src[0] == 0xfe && src[1] == 0xff:

380

			u.current.endianness = BigEndian

381

			nSrc = 2

382

		case src[0] == 0xff && src[1] == 0xfe:

383

			u.current.endianness = LittleEndian

384

			nSrc = 2

385

		default:

386

			if u.current.bomPolicy&requireBOM != 0 {

387

				return 0, 0, ErrMissingBOM

388

389

390

		u.current.bomPolicy = IgnoreBOM

391

392

393

	var r rune

394

	var dSize, sSize int

395

	for nSrc < len(src) {

396

		if nSrc+1 < len(src) {

397

			x := uint16(src[nSrc+0])<<8 | uint16(src[nSrc+1])

398

			if u.current.endianness == LittleEndian {

399

				x = x>>8 | x<<8

400

401

			r, sSize = rune(x), 2

402

			if utf16.IsSurrogate(r) {

403

				if nSrc+3 < len(src) {

404

					x = uint16(src[nSrc+2])<<8 | uint16(src[nSrc+3])

405

					if u.current.endianness == LittleEndian {

406

						x = x>>8 | x<<8

407

408

					// Save for next iteration if it is not a high surrogate.

409

					if isHighSurrogate(rune(x)) {

410

						r, sSize = utf16.DecodeRune(r, rune(x)), 4

411

412

				} else if !atEOF {

413

					err = transform.ErrShortSrc

414

					break

415

416

417

			if dSize = utf8.RuneLen(r); dSize < 0 {

418

				r, dSize = utf8.RuneError, 3

419

420

		} else if atEOF {

421

			// Single trailing byte.

422

			r, dSize, sSize = utf8.RuneError, 3, 1

423

		} else {

424

			err = transform.ErrShortSrc

425

			break

426

427

		if nDst+dSize > len(dst) {

428

			err = transform.ErrShortDst

429

			break

430

431

		nDst += utf8.EncodeRune(dst[nDst:], r)

432

		nSrc += sSize

433

434

	return nDst, nSrc, err

435

436

437

func isHighSurrogate(r rune) bool {

438

	return 0xDC00 <= r && r <= 0xDFFF

439

440

441

type utf16Encoder struct {

442

	endianness       Endianness

443

	initialBOMPolicy BOMPolicy

444

	currentBOMPolicy BOMPolicy

445

446

447

func (u *utf16Encoder) Reset() {

448

	u.currentBOMPolicy = u.initialBOMPolicy

449

450

451

func (u *utf16Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {

452

	if u.currentBOMPolicy&writeBOM != 0 {

453

		if len(dst) < 2 {

454

			return 0, 0, transform.ErrShortDst

455

456

		dst[0], dst[1] = 0xfe, 0xff

457

		u.currentBOMPolicy = IgnoreBOM

458

		nDst = 2

459

460

461

	r, size := rune(0), 0

462

	for nSrc < len(src) {

463

		r = rune(src[nSrc])

464

465

		// Decode a 1-byte rune.

466

		if r < utf8.RuneSelf {

467

			size = 1

468

469

		} else {

470

			// Decode a multi-byte rune.

471

			r, size = utf8.DecodeRune(src[nSrc:])

472

			if size == 1 {

473

				// All valid runes of size 1 (those below utf8.RuneSelf) were

474

				// handled above. We have invalid UTF-8 or we haven't seen the

475

				// full character yet.

476

				if !atEOF && !utf8.FullRune(src[nSrc:]) {

477

					err = transform.ErrShortSrc

478

					break

479

480

481

482

483

		if r <= 0xffff {

484

			if nDst+2 > len(dst) {

485

				err = transform.ErrShortDst

486

				break

487

488

			dst[nDst+0] = uint8(r >> 8)

489

			dst[nDst+1] = uint8(r)

490

			nDst += 2

491

		} else {

492

			if nDst+4 > len(dst) {

493

				err = transform.ErrShortDst

494

				break

495

496

			r1, r2 := utf16.EncodeRune(r)

497

			dst[nDst+0] = uint8(r1 >> 8)

498

			dst[nDst+1] = uint8(r1)

499

			dst[nDst+2] = uint8(r2 >> 8)

500

			dst[nDst+3] = uint8(r2)

501

			nDst += 4

502

503

		nSrc += size

504

505

506

	if u.endianness == LittleEndian {

507

		for i := 0; i < nDst; i += 2 {

508

			dst[i], dst[i+1] = dst[i+1], dst[i]

509

510

511

	return nDst, nSrc, err

512

1	// Copyright 2013 The Go Authors. All rights reserved.
2	// Use of this source code is governed by a BSD-style
3	// license that can be found in the LICENSE file.
4
5	// Package unicode provides Unicode encodings such as UTF-16.
6	package unicode // import "golang.org/x/text/encoding/unicode"
7
8	import (
9	"bytes"
10	"errors"
11	"unicode/utf16"
12	"unicode/utf8"
13
14	"golang.org/x/text/encoding"
15	"golang.org/x/text/encoding/internal"
16	"golang.org/x/text/encoding/internal/identifier"
17	"golang.org/x/text/internal/utf8internal"
18	"golang.org/x/text/runes"
19	"golang.org/x/text/transform"
20	)
21
22	// TODO: I think the Transformers really should return errors on unmatched
23	// surrogate pairs and odd numbers of bytes. This is not required by RFC 2781,
24	// which leaves it open, but is suggested by WhatWG. It will allow for all error
25	// modes as defined by WhatWG: fatal, HTML and Replacement. This would require
26	// the introduction of some kind of error type for conveying the erroneous code
27	// point.
28
29	// UTF8 is the UTF-8 encoding. It neither removes nor adds byte order marks.
30	var UTF8 encoding.Encoding = utf8enc
31
32	// UTF8BOM is an UTF-8 encoding where the decoder strips a leading byte order
33	// mark while the encoder adds one.
34	//
35	// Some editors add a byte order mark as a signature to UTF-8 files. Although
36	// the byte order mark is not useful for detecting byte order in UTF-8, it is
37	// sometimes used as a convention to mark UTF-8-encoded files. This relies on
38	// the observation that the UTF-8 byte order mark is either an illegal or at
39	// least very unlikely sequence in any other character encoding.
40	var UTF8BOM encoding.Encoding = utf8bomEncoding{}
41
42	type utf8bomEncoding struct{}
43
44	func (utf8bomEncoding) String() string {
45	return "UTF-8-BOM"
46	}
47
48	func (utf8bomEncoding) ID() (identifier.MIB, string) {
49	return identifier.Unofficial, "x-utf8bom"
50	}
51
52	func (utf8bomEncoding) NewEncoder() *encoding.Encoder {
53	return &encoding.Encoder{
54	Transformer: &utf8bomEncoder{t: runes.ReplaceIllFormed()},
55	}
56	}
57
58	func (utf8bomEncoding) NewDecoder() *encoding.Decoder {
59	return &encoding.Decoder{Transformer: &utf8bomDecoder{}}
60	}
61
62	var utf8enc = &internal.Encoding{
63	Encoding: &internal.SimpleEncoding{Decoder: utf8Decoder{}, Encoder: runes.ReplaceIllFormed()},
64	Name: "UTF-8",
65	MIB: identifier.UTF8,
66	}
67
// utf8bomDecoder is the Transformer used for decoding UTF8BOM input.
type utf8bomDecoder struct {
	// checked is set once the start of the stream has been examined for a
	// byte order mark, so the check runs only once per stream.
	checked bool
}

// Reset clears the checked flag so that the next Transform call looks for a
// leading byte order mark again.
func (t *utf8bomDecoder) Reset() { t.checked = false }
75
76	func (t *utf8bomDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
77	if !t.checked {
78	if !atEOF && len(src) < len(utf8BOM) {
79	if len(src) == 0 {
80	return 0, 0, nil
81	}
82	return 0, 0, transform.ErrShortSrc
83	}
84	if bytes.HasPrefix(src, []byte(utf8BOM)) {
85	nSrc += len(utf8BOM)
86	src = src[len(utf8BOM):]
87	}
88	t.checked = true
89	}
90	nDst, n, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF)
91	nSrc += n
92	return nDst, nSrc, err
93	}
94
95	type utf8bomEncoder struct {
96	written bool
97	t transform.Transformer
98	}
99
100	func (t *utf8bomEncoder) Reset() {
101	t.written = false
102	t.t.Reset()
103	}
104
105	func (t *utf8bomEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
106	if !t.written {
107	if len(dst) < len(utf8BOM) {
108	return nDst, 0, transform.ErrShortDst
109	}
110	nDst = copy(dst, utf8BOM)
111	t.written = true
112	}
113	n, nSrc, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF)
114	nDst += n
115	return nDst, nSrc, err
116	}
117
118	type utf8Decoder struct{ transform.NopResetter }
119
120	func (utf8Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
121	var pSrc int // point from which to start copy in src
122	var accept utf8internal.AcceptRange
123
124	// The decoder can only make the input larger, not smaller.
125	n := len(src)
126	if len(dst) < n {
127	err = transform.ErrShortDst
128	n = len(dst)
129	atEOF = false
130	}
131	for nSrc < n {
132	c := src[nSrc]
133	if c < utf8.RuneSelf {
134	nSrc++
135	continue
136	}
137	first := utf8internal.First[c]
138	size := int(first & utf8internal.SizeMask)
139	if first == utf8internal.FirstInvalid {
140	goto handleInvalid // invalid starter byte
141	}
142	accept = utf8internal.AcceptRanges[first>>utf8internal.AcceptShift]
143	if nSrc+size > n {
144	if !atEOF {
145	// We may stop earlier than necessary here if the short sequence
146	// has invalid bytes. Not checking for this simplifies the code
147	// and may avoid duplicate computations in certain conditions.
148	if err == nil {
149	err = transform.ErrShortSrc
150	}
151	break
152	}
153	// Determine the maximal subpart of an ill-formed subsequence.
154	switch {
155	case nSrc+1 >= n \|\| src[nSrc+1] < accept.Lo \|\| accept.Hi < src[nSrc+1]:
156	size = 1
157	case nSrc+2 >= n \|\| src[nSrc+2] < utf8internal.LoCB \|\| utf8internal.HiCB < src[nSrc+2]:
158	size = 2
159	default:
160	size = 3 // As we are short, the maximum is 3.
161	}
162	goto handleInvalid
163	}
164	if c = src[nSrc+1]; c < accept.Lo \|\| accept.Hi < c {
165	size = 1
166	goto handleInvalid // invalid continuation byte
167	} else if size == 2 {
168	} else if c = src[nSrc+2]; c < utf8internal.LoCB \|\| utf8internal.HiCB < c {
169	size = 2
170	goto handleInvalid // invalid continuation byte
171	} else if size == 3 {
172	} else if c = src[nSrc+3]; c < utf8internal.LoCB \|\| utf8internal.HiCB < c {
173	size = 3
174	goto handleInvalid // invalid continuation byte
175	}
176	nSrc += size
177	continue
178
179	handleInvalid:
180	// Copy the scanned input so far.
181	nDst += copy(dst[nDst:], src[pSrc:nSrc])
182
183	// Append RuneError to the destination.
184	const runeError = "\ufffd"
185	if nDst+len(runeError) > len(dst) {
186	return nDst, nSrc, transform.ErrShortDst
187	}
188	nDst += copy(dst[nDst:], runeError)
189
190	// Skip the maximal subpart of an ill-formed subsequence according to
191	// the W3C standard way instead of the Go way. This Transform is
192	// probably the only place in the text repo where it is warranted.
193	nSrc += size
194	pSrc = nSrc
195
196	// Recompute the maximum source length.
197	if sz := len(dst) - nDst; sz < len(src)-nSrc {
198	err = transform.ErrShortDst
199	n = nSrc + sz
200	atEOF = false
201	}
202	}
203	return nDst + copy(dst[nDst:], src[pSrc:nSrc]), nSrc, err
204	}
205
206	// UTF16 returns a UTF-16 Encoding for the given default endianness and byte
207	// order mark (BOM) policy.
208	//
209	// When decoding from UTF-16 to UTF-8, if the BOMPolicy is IgnoreBOM then
210	// neither BOMs U+FEFF nor noncharacters U+FFFE in the input stream will affect
211	// the endianness used for decoding, and will instead be output as their
212	// standard UTF-8 encodings: "\xef\xbb\xbf" and "\xef\xbf\xbe". If the BOMPolicy
213	// is UseBOM or ExpectBOM a staring BOM is not written to the UTF-8 output.
214	// Instead, it overrides the default endianness e for the remainder of the
215	// transformation. Any subsequent BOMs U+FEFF or noncharacters U+FFFE will not
216	// affect the endianness used, and will instead be output as their standard
217	// UTF-8 encodings. For UseBOM, if there is no starting BOM, it will proceed
218	// with the default Endianness. For ExpectBOM, in that case, the transformation
219	// will return early with an ErrMissingBOM error.
220	//
221	// When encoding from UTF-8 to UTF-16, a BOM will be inserted at the start of
222	// the output if the BOMPolicy is UseBOM or ExpectBOM. Otherwise, a BOM will not
223	// be inserted. The UTF-8 input does not need to contain a BOM.
224	//
225	// There is no concept of a 'native' endianness. If the UTF-16 data is produced
226	// and consumed in a greater context that implies a certain endianness, use
227	// IgnoreBOM. Otherwise, use ExpectBOM and always produce and consume a BOM.
228	//
229	// In the language of https://www.unicode.org/faq/utf_bom.html#bom10, IgnoreBOM
230	// corresponds to "Where the precise type of the data stream is known... the
231	// BOM should not be used" and ExpectBOM corresponds to "A particular
232	// protocol... may require use of the BOM".
233	func UTF16(e Endianness, b BOMPolicy) encoding.Encoding {
234	return utf16Encoding{config{e, b}, mibValue[e][b&bomMask]}
235	}
236
237	// mibValue maps Endianness and BOMPolicy settings to MIB constants. Note that
238	// some configurations map to the same MIB identifier. RFC 2781 has requirements
239	// and recommendations. Some of the "configurations" are merely recommendations,
240	// so multiple configurations could match.
241	var mibValue = map[Endianness][numBOMValues]identifier.MIB{
242	BigEndian: [numBOMValues]identifier.MIB{
243	IgnoreBOM: identifier.UTF16BE,
244	UseBOM: identifier.UTF16, // BigEnding default is preferred by RFC 2781.
245	// TODO: acceptBOM \| strictBOM would map to UTF16BE as well.
246	},
247	LittleEndian: [numBOMValues]identifier.MIB{
248	IgnoreBOM: identifier.UTF16LE,
249	UseBOM: identifier.UTF16, // LittleEndian default is allowed and preferred on Windows.
250	// TODO: acceptBOM \| strictBOM would map to UTF16LE as well.
251	},
252	// ExpectBOM is not widely used and has no valid MIB identifier.
253	}
254
255	// All lists a configuration for each IANA-defined UTF-16 variant.
256	var All = []encoding.Encoding{
257	UTF8,
258	UTF16(BigEndian, UseBOM),
259	UTF16(BigEndian, IgnoreBOM),
260	UTF16(LittleEndian, IgnoreBOM),
261	}
262
// BOMPolicy is a UTF-16 encoding's byte order mark policy.
type BOMPolicy uint8

const (
	// Policy bits. The exported policies below are combinations of these.
	writeBOM   BOMPolicy = 1 << iota // 0x01
	acceptBOM                        // 0x02
	requireBOM                       // 0x04

	// bomMask covers all policy bits (0x07).
	bomMask = writeBOM | acceptBOM | requireBOM

	// HACK: numBOMValues == 8 triggers a bug in the 1.4 compiler (cannot have
	// a map of an array of length 8 of a type that is also used as a key or
	// value in another map). See golang.org/issue/11354.
	// TODO: consider changing this value back to 8 if the use of 1.4.* has
	// been minimized.
	numBOMValues = 8 + 1

	// IgnoreBOM means to ignore any byte order marks.
	//
	// This is the common, RFC 2781-compliant interpretation for UTF-16BE/LE.
	IgnoreBOM BOMPolicy = 0

	// UseBOM means that the UTF-16 form may start with a byte order mark,
	// which will be used to override the default encoding.
	//
	// This is the common, RFC 2781-compliant interpretation for UTF-16.
	UseBOM BOMPolicy = writeBOM | acceptBOM

	// ExpectBOM means that the UTF-16 form must start with a byte order mark,
	// which will be used to override the default encoding.
	//
	// Used in Java as Unicode (not to be confused with Java's UTF-16) and
	// ICU's UTF-16,version=1. Not compliant with RFC 2781.
	ExpectBOM BOMPolicy = writeBOM | acceptBOM | requireBOM

	// TODO (maybe): strictBOM: BOM must match Endianness. This would allow:
	// - UTF-16(B|L)E,version=1: writeBOM | acceptBOM | requireBOM | strictBOM
	//    (UnicodeBig and UnicodeLittle in Java)
	// - RFC 2781-compliant, but less common interpretation for UTF-16(B|L)E:
	//    acceptBOM | strictBOM (e.g. assigned to CheckBOM).
	// This addition would be consistent with supporting ExpectBOM.
)
301
// Endianness is a UTF-16 encoding's default endianness.
type Endianness bool

// The two byte orders.
const (
	// BigEndian is UTF-16BE.
	BigEndian = Endianness(false)
	// LittleEndian is UTF-16LE.
	LittleEndian = Endianness(true)
)
311
// ErrMissingBOM is returned when UTF-16 input decoded with ExpectBOM does not
// begin with a byte order mark.
var ErrMissingBOM = errors.New("encoding: missing byte order mark")
315
316	type utf16Encoding struct {
317	config
318	mib identifier.MIB
319	}
320
321	type config struct {
322	endianness Endianness
323	bomPolicy BOMPolicy
324	}
325
326	func (u utf16Encoding) NewDecoder() *encoding.Decoder {
327	return &encoding.Decoder{Transformer: &utf16Decoder{
328	initial: u.config,
329	current: u.config,
330	}}
331	}
332
333	func (u utf16Encoding) NewEncoder() *encoding.Encoder {
334	return &encoding.Encoder{Transformer: &utf16Encoder{
335	endianness: u.endianness,
336	initialBOMPolicy: u.bomPolicy,
337	currentBOMPolicy: u.bomPolicy,
338	}}
339	}
340
341	func (u utf16Encoding) ID() (mib identifier.MIB, other string) {
342	return u.mib, ""
343	}
344
345	func (u utf16Encoding) String() string {
346	e, b := "B", ""
347	if u.endianness == LittleEndian {
348	e = "L"
349	}
350	switch u.bomPolicy {
351	case ExpectBOM:
352	b = "Expect"
353	case UseBOM:
354	b = "Use"
355	case IgnoreBOM:
356	b = "Ignore"
357	}
358	return "UTF-16" + e + "E (" + b + " BOM)"
359	}
360
361	type utf16Decoder struct {
362	initial config
363	current config
364	}
365
366	func (u *utf16Decoder) Reset() {
367	u.current = u.initial
368	}
369
370	func (u *utf16Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
371	if len(src) < 2 && atEOF && u.current.bomPolicy&requireBOM != 0 {
372	return 0, 0, ErrMissingBOM
373	}
374	if len(src) == 0 {
375	return 0, 0, nil
376	}
377	if len(src) >= 2 && u.current.bomPolicy&acceptBOM != 0 {
378	switch {
379	case src[0] == 0xfe && src[1] == 0xff:
380	u.current.endianness = BigEndian
381	nSrc = 2
382	case src[0] == 0xff && src[1] == 0xfe:
383	u.current.endianness = LittleEndian
384	nSrc = 2
385	default:
386	if u.current.bomPolicy&requireBOM != 0 {
387	return 0, 0, ErrMissingBOM
388	}
389	}
390	u.current.bomPolicy = IgnoreBOM
391	}
392
393	var r rune
394	var dSize, sSize int
395	for nSrc < len(src) {
396	if nSrc+1 < len(src) {
397	x := uint16(src[nSrc+0])<<8 \| uint16(src[nSrc+1])
398	if u.current.endianness == LittleEndian {
399	x = x>>8 \| x<<8
400	}
401	r, sSize = rune(x), 2
402	if utf16.IsSurrogate(r) {
403	if nSrc+3 < len(src) {
404	x = uint16(src[nSrc+2])<<8 \| uint16(src[nSrc+3])
405	if u.current.endianness == LittleEndian {
406	x = x>>8 \| x<<8
407	}
408	// Save for next iteration if it is not a high surrogate.
409	if isHighSurrogate(rune(x)) {
410	r, sSize = utf16.DecodeRune(r, rune(x)), 4
411	}
412	} else if !atEOF {
413	err = transform.ErrShortSrc
414	break
415	}
416	}
417	if dSize = utf8.RuneLen(r); dSize < 0 {
418	r, dSize = utf8.RuneError, 3
419	}
420	} else if atEOF {
421	// Single trailing byte.
422	r, dSize, sSize = utf8.RuneError, 3, 1
423	} else {
424	err = transform.ErrShortSrc
425	break
426	}
427	if nDst+dSize > len(dst) {
428	err = transform.ErrShortDst
429	break
430	}
431	nDst += utf8.EncodeRune(dst[nDst:], r)
432	nSrc += sSize
433	}
434	return nDst, nSrc, err
435	}
436
// isHighSurrogate reports whether r lies in the range 0xDC00-0xDFFF. Despite
// the function's name, Unicode calls this the low (trailing) surrogate range,
// i.e. the second code unit of a surrogate pair.
func isHighSurrogate(r rune) bool {
	const first, last = 0xDC00, 0xDFFF
	return r >= first && r <= last
}
440
441	type utf16Encoder struct {
442	endianness Endianness
443	initialBOMPolicy BOMPolicy
444	currentBOMPolicy BOMPolicy
445	}
446
447	func (u *utf16Encoder) Reset() {
448	u.currentBOMPolicy = u.initialBOMPolicy
449	}
450
451	func (u *utf16Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
452	if u.currentBOMPolicy&writeBOM != 0 {
453	if len(dst) < 2 {
454	return 0, 0, transform.ErrShortDst
455	}
456	dst[0], dst[1] = 0xfe, 0xff
457	u.currentBOMPolicy = IgnoreBOM
458	nDst = 2
459	}
460
461	r, size := rune(0), 0
462	for nSrc < len(src) {
463	r = rune(src[nSrc])
464
465	// Decode a 1-byte rune.
466	if r < utf8.RuneSelf {
467	size = 1
468
469	} else {
470	// Decode a multi-byte rune.
471	r, size = utf8.DecodeRune(src[nSrc:])
472	if size == 1 {
473	// All valid runes of size 1 (those below utf8.RuneSelf) were
474	// handled above. We have invalid UTF-8 or we haven't seen the
475	// full character yet.
476	if !atEOF && !utf8.FullRune(src[nSrc:]) {
477	err = transform.ErrShortSrc
478	break
479	}
480	}
481	}
482
483	if r <= 0xffff {
484	if nDst+2 > len(dst) {
485	err = transform.ErrShortDst
486	break
487	}
488	dst[nDst+0] = uint8(r >> 8)
489	dst[nDst+1] = uint8(r)
490	nDst += 2
491	} else {
492	if nDst+4 > len(dst) {
493	err = transform.ErrShortDst
494	break
495	}
496	r1, r2 := utf16.EncodeRune(r)
497	dst[nDst+0] = uint8(r1 >> 8)
498	dst[nDst+1] = uint8(r1)
499	dst[nDst+2] = uint8(r2 >> 8)
500	dst[nDst+3] = uint8(r2)
501	nDst += 4
502	}
503	nSrc += size
504	}
505
506	if u.endianness == LittleEndian {
507	for i := 0; i < nDst; i += 2 {
508	dst[i], dst[i+1] = dst[i+1], dst[i]
509	}
510	}
511	return nDst, nSrc, err
512	}