rss-tools: vendor/golang.org/x/text/encoding/encoding.go (master)

1

// Copyright 2013 The Go Authors. All rights reserved.

2

// Use of this source code is governed by a BSD-style

3

// license that can be found in the LICENSE file.

4

5

// Package encoding defines an interface for character encodings, such as Shift

6

// JIS and Windows 1252, that can convert to and from UTF-8.

7

//

8

// Encoding implementations are provided in other packages, such as

9

// golang.org/x/text/encoding/charmap and

10

// golang.org/x/text/encoding/japanese.

11

package encoding // import "golang.org/x/text/encoding"

12

13

import (

14

	"errors"

15

	"io"

16

	"strconv"

17

	"unicode/utf8"

18

19

	"golang.org/x/text/encoding/internal/identifier"

20

	"golang.org/x/text/transform"

21

22

23

// TODO:

24

// - There seems to be some inconsistency in when decoders return errors

25

//   and when not. Also documentation seems to suggest they shouldn't return

26

//   errors at all (except for UTF-16).

27

// - Encoders seem to rely on or at least benefit from the input being in NFC

28

//   normal form. Perhaps add an example how users could prepare their output.

29

30

// Encoding is a character set encoding that can be transformed to and from

31

// UTF-8.

32

type Encoding interface {

33

	// NewDecoder returns a Decoder.

34

	NewDecoder() *Decoder

35

36

	// NewEncoder returns an Encoder.

37

	NewEncoder() *Encoder

38

39

40

// A Decoder converts bytes to UTF-8. It implements transform.Transformer.

41

//

42

// Transforming source bytes that are not of that encoding will not result in an

43

// error per se. Each byte that cannot be transcoded will be represented in the

44

// output by the UTF-8 encoding of '\uFFFD', the replacement rune.

45

type Decoder struct {

46

	transform.Transformer

47

48

	// This forces external creators of Decoders to use names in struct

49

	// initializers, allowing for future extendibility without having to break

50

	// code.

51

	_ struct{}

52

53

54

// Bytes converts the given encoded bytes to UTF-8. It returns the converted

55

// bytes or nil, err if any error occurred.

56

func (d *Decoder) Bytes(b []byte) ([]byte, error) {

57

	b, _, err := transform.Bytes(d, b)

58

	if err != nil {

59

		return nil, err

60

61

	return b, nil

62

63

64

// String converts the given encoded string to UTF-8. It returns the converted

65

// string or "", err if any error occurred.

66

func (d *Decoder) String(s string) (string, error) {

67

	s, _, err := transform.String(d, s)

68

	if err != nil {

69

		return "", err

70

71

	return s, nil

72

73

74

// Reader wraps another Reader to decode its bytes.

75

//

76

// The Decoder may not be used for any other operation as long as the returned

77

// Reader is in use.

78

func (d *Decoder) Reader(r io.Reader) io.Reader {

79

	return transform.NewReader(r, d)

80

81

82

// An Encoder converts bytes from UTF-8. It implements transform.Transformer.

83

//

84

// Each rune that cannot be transcoded will result in an error. In this case,

85

// the transform will consume all source byte up to, not including the offending

86

// rune. Transforming source bytes that are not valid UTF-8 will be replaced by

87

// `\uFFFD`. To return early with an error instead, use transform.Chain to

88

// preprocess the data with a UTF8Validator.

89

type Encoder struct {

90

	transform.Transformer

91

92

	// This forces external creators of Encoders to use names in struct

93

	// initializers, allowing for future extendibility without having to break

94

	// code.

95

	_ struct{}

96

97

98

// Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if

99

// any error occurred.

100

func (e *Encoder) Bytes(b []byte) ([]byte, error) {

101

	b, _, err := transform.Bytes(e, b)

102

	if err != nil {

103

		return nil, err

104

105

	return b, nil

106

107

108

// String converts a string from UTF-8. It returns the converted string or

109

// "", err if any error occurred.

110

func (e *Encoder) String(s string) (string, error) {

111

	s, _, err := transform.String(e, s)

112

	if err != nil {

113

		return "", err

114

115

	return s, nil

116

117

118

// Writer wraps another Writer to encode its UTF-8 output.

119

//

120

// The Encoder may not be used for any other operation as long as the returned

121

// Writer is in use.

122

func (e *Encoder) Writer(w io.Writer) io.Writer {

123

	return transform.NewWriter(w, e)

124

125

126

// ASCIISub is the ASCII substitute character (SUB, 0x1A), as recommended by
// https://unicode.org/reports/tr36/#Text_Comparison
const ASCIISub = '\u001A'

129

130

// Nop is the nop encoding. Its transformed bytes are the same as the source

131

// bytes; it does not replace invalid UTF-8 sequences.

132

var Nop Encoding = nop{}

133

134

type nop struct{}

135

136

func (nop) NewDecoder() *Decoder {

137

	return &Decoder{Transformer: transform.Nop}

138

139

func (nop) NewEncoder() *Encoder {

140

	return &Encoder{Transformer: transform.Nop}

141

142

143

// Replacement is the replacement encoding. Decoding from the replacement

144

// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to

145

// the replacement encoding yields the same as the source bytes except that

146

// invalid UTF-8 is converted to '\uFFFD'.

147

//

148

// It is defined at http://encoding.spec.whatwg.org/#replacement

149

var Replacement Encoding = replacement{}

150

151

type replacement struct{}

152

153

func (replacement) NewDecoder() *Decoder {

154

	return &Decoder{Transformer: replacementDecoder{}}

155

156

157

func (replacement) NewEncoder() *Encoder {

158

	return &Encoder{Transformer: replacementEncoder{}}

159

160

161

func (replacement) ID() (mib identifier.MIB, other string) {

162

	return identifier.Replacement, ""

163

164

165

type replacementDecoder struct{ transform.NopResetter }

166

167

func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {

168

	if len(dst) < 3 {

169

		return 0, 0, transform.ErrShortDst

170

171

	if atEOF {

172

		const fffd = "\ufffd"

173

		dst[0] = fffd[0]

174

		dst[1] = fffd[1]

175

		dst[2] = fffd[2]

176

		nDst = 3

177

178

	return nDst, len(src), nil

179

180

181

type replacementEncoder struct{ transform.NopResetter }

182

183

func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {

184

	r, size := rune(0), 0

185

186

	for ; nSrc < len(src); nSrc += size {

187

		r = rune(src[nSrc])

188

189

		// Decode a 1-byte rune.

190

		if r < utf8.RuneSelf {

191

			size = 1

192

193

		} else {

194

			// Decode a multi-byte rune.

195

			r, size = utf8.DecodeRune(src[nSrc:])

196

			if size == 1 {

197

				// All valid runes of size 1 (those below utf8.RuneSelf) were

198

				// handled above. We have invalid UTF-8 or we haven't seen the

199

				// full character yet.

200

				if !atEOF && !utf8.FullRune(src[nSrc:]) {

201

					err = transform.ErrShortSrc

202

					break

203

204

				r = '\ufffd'

205

206

207

208

		if nDst+utf8.RuneLen(r) > len(dst) {

209

			err = transform.ErrShortDst

210

			break

211

212

		nDst += utf8.EncodeRune(dst[nDst:], r)

213

214

	return nDst, nSrc, err

215

216

217

// HTMLEscapeUnsupported wraps encoders to replace source runes outside the

218

// repertoire of the destination encoding with HTML escape sequences.

219

//

220

// This wrapper exists to comply to URL and HTML forms requiring a

221

// non-terminating legacy encoder. The produced sequences may lead to data

222

// loss as they are indistinguishable from legitimate input. To avoid this

223

// issue, use UTF-8 encodings whenever possible.

224

func HTMLEscapeUnsupported(e *Encoder) *Encoder {

225

	return &Encoder{Transformer: &errorHandler{e, errorToHTML}}

226

227

228

// ReplaceUnsupported wraps encoders to replace source runes outside the

229

// repertoire of the destination encoding with an encoding-specific

230

// replacement.

231

//

232

// This wrapper is only provided for backwards compatibility and legacy

233

// handling. Its use is strongly discouraged. Use UTF-8 whenever possible.

234

func ReplaceUnsupported(e *Encoder) *Encoder {

235

	return &Encoder{Transformer: &errorHandler{e, errorToReplacement}}

236

237

238

type errorHandler struct {

239

	*Encoder

240

	handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)

241

242

243

// repertoireError is the interface errorHandler looks for on errors returned
// by the wrapped Encoder. Replacement supplies the byte that
// errorToReplacement writes in place of the rejected rune.
//
// TODO: consider making this error public in some form.
type repertoireError interface {
	Replacement() byte
}

247

248

func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {

249

	nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF)

250

	for err != nil {

251

		rerr, ok := err.(repertoireError)

252

		if !ok {

253

			return nDst, nSrc, err

254

255

		r, sz := utf8.DecodeRune(src[nSrc:])

256

		n, ok := h.handler(dst[nDst:], r, rerr)

257

		if !ok {

258

			return nDst, nSrc, transform.ErrShortDst

259

260

		err = nil

261

		nDst += n

262

		if nSrc += sz; nSrc < len(src) {

263

			var dn, sn int

264

			dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF)

265

			nDst += dn

266

			nSrc += sn

267

268

269

	return nDst, nSrc, err

270

271

272

func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) {

273

	buf := [8]byte{}

274

	b := strconv.AppendUint(buf[:0], uint64(r), 10)

275

	if n = len(b) + len("&#;"); n >= len(dst) {

276

		return 0, false

277

278

	dst[0] = '&'

279

	dst[1] = '#'

280

	dst[copy(dst[2:], b)+2] = ';'

281

	return n, true

282

283

284

func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) {

285

	if len(dst) == 0 {

286

		return 0, false

287

288

	dst[0] = err.Replacement()

289

	return 1, true

290

291

292

// ErrInvalidUTF8 is returned by a transformer that encountered invalid UTF-8.
var ErrInvalidUTF8 error = errors.New("encoding: invalid UTF-8")

294

295

// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first

296

// input byte that is not valid UTF-8.

297

var UTF8Validator transform.Transformer = utf8Validator{}

298

299

type utf8Validator struct{ transform.NopResetter }

300

301

func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {

302

	n := len(src)

303

	if n > len(dst) {

304

		n = len(dst)

305

306

	for i := 0; i < n; {

307

		if c := src[i]; c < utf8.RuneSelf {

308

			dst[i] = c

309

i++

310

			continue

311

312

		_, size := utf8.DecodeRune(src[i:])

313

		if size == 1 {

314

			// All valid runes of size 1 (those below utf8.RuneSelf) were

315

			// handled above. We have invalid UTF-8 or we haven't seen the

316

			// full character yet.

317

			err = ErrInvalidUTF8

318

			if !atEOF && !utf8.FullRune(src[i:]) {

319

				err = transform.ErrShortSrc

320

321

			return i, i, err

322

323

		if i+size > len(dst) {

324

			return i, i, transform.ErrShortDst

325

326

		for ; size > 0; size-- {

327

			dst[i] = src[i]

328

i++

329

330

331

	if len(src) > len(dst) {

332

		err = transform.ErrShortDst

333

334

	return n, n, err

335

1	// Copyright 2013 The Go Authors. All rights reserved.
2	// Use of this source code is governed by a BSD-style
3	// license that can be found in the LICENSE file.
4
5	// Package encoding defines an interface for character encodings, such as Shift
6	// JIS and Windows 1252, that can convert to and from UTF-8.
7	//
8	// Encoding implementations are provided in other packages, such as
9	// golang.org/x/text/encoding/charmap and
10	// golang.org/x/text/encoding/japanese.
11	package encoding // import "golang.org/x/text/encoding"
12
13	import (
14	"errors"
15	"io"
16	"strconv"
17	"unicode/utf8"
18
19	"golang.org/x/text/encoding/internal/identifier"
20	"golang.org/x/text/transform"
21	)
22
23	// TODO:
24	// - There seems to be some inconsistency in when decoders return errors
25	// and when not. Also documentation seems to suggest they shouldn't return
26	// errors at all (except for UTF-16).
27	// - Encoders seem to rely on or at least benefit from the input being in NFC
28	// normal form. Perhaps add an example how users could prepare their output.
29
30	// Encoding is a character set encoding that can be transformed to and from
31	// UTF-8.
32	type Encoding interface {
33	// NewDecoder returns a Decoder.
34	NewDecoder() *Decoder
35
36	// NewEncoder returns an Encoder.
37	NewEncoder() *Encoder
38	}
39
40	// A Decoder converts bytes to UTF-8. It implements transform.Transformer.
41	//
42	// Transforming source bytes that are not of that encoding will not result in an
43	// error per se. Each byte that cannot be transcoded will be represented in the
44	// output by the UTF-8 encoding of '\uFFFD', the replacement rune.
45	type Decoder struct {
46	transform.Transformer
47
48	// This forces external creators of Decoders to use names in struct
49	// initializers, allowing for future extendibility without having to break
50	// code.
51	_ struct{}
52	}
53
54	// Bytes converts the given encoded bytes to UTF-8. It returns the converted
55	// bytes or nil, err if any error occurred.
56	func (d *Decoder) Bytes(b []byte) ([]byte, error) {
57	b, _, err := transform.Bytes(d, b)
58	if err != nil {
59	return nil, err
60	}
61	return b, nil
62	}
63
64	// String converts the given encoded string to UTF-8. It returns the converted
65	// string or "", err if any error occurred.
66	func (d *Decoder) String(s string) (string, error) {
67	s, _, err := transform.String(d, s)
68	if err != nil {
69	return "", err
70	}
71	return s, nil
72	}
73
74	// Reader wraps another Reader to decode its bytes.
75	//
76	// The Decoder may not be used for any other operation as long as the returned
77	// Reader is in use.
78	func (d *Decoder) Reader(r io.Reader) io.Reader {
79	return transform.NewReader(r, d)
80	}
81
82	// An Encoder converts bytes from UTF-8. It implements transform.Transformer.
83	//
84	// Each rune that cannot be transcoded will result in an error. In this case,
85	// the transform will consume all source byte up to, not including the offending
86	// rune. Transforming source bytes that are not valid UTF-8 will be replaced by
87	// `\uFFFD`. To return early with an error instead, use transform.Chain to
88	// preprocess the data with a UTF8Validator.
89	type Encoder struct {
90	transform.Transformer
91
92	// This forces external creators of Encoders to use names in struct
93	// initializers, allowing for future extendibility without having to break
94	// code.
95	_ struct{}
96	}
97
98	// Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if
99	// any error occurred.
100	func (e *Encoder) Bytes(b []byte) ([]byte, error) {
101	b, _, err := transform.Bytes(e, b)
102	if err != nil {
103	return nil, err
104	}
105	return b, nil
106	}
107
108	// String converts a string from UTF-8. It returns the converted string or
109	// "", err if any error occurred.
110	func (e *Encoder) String(s string) (string, error) {
111	s, _, err := transform.String(e, s)
112	if err != nil {
113	return "", err
114	}
115	return s, nil
116	}
117
118	// Writer wraps another Writer to encode its UTF-8 output.
119	//
120	// The Encoder may not be used for any other operation as long as the returned
121	// Writer is in use.
122	func (e *Encoder) Writer(w io.Writer) io.Writer {
123	return transform.NewWriter(w, e)
124	}
125
// ASCIISub is the ASCII substitute character (SUB, 0x1A), as recommended by
// https://unicode.org/reports/tr36/#Text_Comparison
const ASCIISub = '\u001A'
129
130	// Nop is the nop encoding. Its transformed bytes are the same as the source
131	// bytes; it does not replace invalid UTF-8 sequences.
132	var Nop Encoding = nop{}
133
134	type nop struct{}
135
136	func (nop) NewDecoder() *Decoder {
137	return &Decoder{Transformer: transform.Nop}
138	}
139	func (nop) NewEncoder() *Encoder {
140	return &Encoder{Transformer: transform.Nop}
141	}
142
143	// Replacement is the replacement encoding. Decoding from the replacement
144	// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
145	// the replacement encoding yields the same as the source bytes except that
146	// invalid UTF-8 is converted to '\uFFFD'.
147	//
148	// It is defined at http://encoding.spec.whatwg.org/#replacement
149	var Replacement Encoding = replacement{}
150
151	type replacement struct{}
152
153	func (replacement) NewDecoder() *Decoder {
154	return &Decoder{Transformer: replacementDecoder{}}
155	}
156
157	func (replacement) NewEncoder() *Encoder {
158	return &Encoder{Transformer: replacementEncoder{}}
159	}
160
161	func (replacement) ID() (mib identifier.MIB, other string) {
162	return identifier.Replacement, ""
163	}
164
165	type replacementDecoder struct{ transform.NopResetter }
166
167	func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
168	if len(dst) < 3 {
169	return 0, 0, transform.ErrShortDst
170	}
171	if atEOF {
172	const fffd = "\ufffd"
173	dst[0] = fffd[0]
174	dst[1] = fffd[1]
175	dst[2] = fffd[2]
176	nDst = 3
177	}
178	return nDst, len(src), nil
179	}
180
181	type replacementEncoder struct{ transform.NopResetter }
182
183	func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
184	r, size := rune(0), 0
185
186	for ; nSrc < len(src); nSrc += size {
187	r = rune(src[nSrc])
188
189	// Decode a 1-byte rune.
190	if r < utf8.RuneSelf {
191	size = 1
192
193	} else {
194	// Decode a multi-byte rune.
195	r, size = utf8.DecodeRune(src[nSrc:])
196	if size == 1 {
197	// All valid runes of size 1 (those below utf8.RuneSelf) were
198	// handled above. We have invalid UTF-8 or we haven't seen the
199	// full character yet.
200	if !atEOF && !utf8.FullRune(src[nSrc:]) {
201	err = transform.ErrShortSrc
202	break
203	}
204	r = '\ufffd'
205	}
206	}
207
208	if nDst+utf8.RuneLen(r) > len(dst) {
209	err = transform.ErrShortDst
210	break
211	}
212	nDst += utf8.EncodeRune(dst[nDst:], r)
213	}
214	return nDst, nSrc, err
215	}
216
217	// HTMLEscapeUnsupported wraps encoders to replace source runes outside the
218	// repertoire of the destination encoding with HTML escape sequences.
219	//
220	// This wrapper exists to comply to URL and HTML forms requiring a
221	// non-terminating legacy encoder. The produced sequences may lead to data
222	// loss as they are indistinguishable from legitimate input. To avoid this
223	// issue, use UTF-8 encodings whenever possible.
224	func HTMLEscapeUnsupported(e Encoder) Encoder {
225	return &Encoder{Transformer: &errorHandler{e, errorToHTML}}
226	}
227
228	// ReplaceUnsupported wraps encoders to replace source runes outside the
229	// repertoire of the destination encoding with an encoding-specific
230	// replacement.
231	//
232	// This wrapper is only provided for backwards compatibility and legacy
233	// handling. Its use is strongly discouraged. Use UTF-8 whenever possible.
234	func ReplaceUnsupported(e Encoder) Encoder {
235	return &Encoder{Transformer: &errorHandler{e, errorToReplacement}}
236	}
237
238	type errorHandler struct {
239	*Encoder
240	handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)
241	}
242
// repertoireError is the interface errorHandler looks for on errors returned
// by the wrapped Encoder. Replacement supplies the byte that
// errorToReplacement writes in place of the rejected rune.
//
// TODO: consider making this error public in some form.
type repertoireError interface {
	Replacement() byte
}
247
248	func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
249	nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF)
250	for err != nil {
251	rerr, ok := err.(repertoireError)
252	if !ok {
253	return nDst, nSrc, err
254	}
255	r, sz := utf8.DecodeRune(src[nSrc:])
256	n, ok := h.handler(dst[nDst:], r, rerr)
257	if !ok {
258	return nDst, nSrc, transform.ErrShortDst
259	}
260	err = nil
261	nDst += n
262	if nSrc += sz; nSrc < len(src) {
263	var dn, sn int
264	dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF)
265	nDst += dn
266	nSrc += sn
267	}
268	}
269	return nDst, nSrc, err
270	}
271
272	func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) {
273	buf := [8]byte{}
274	b := strconv.AppendUint(buf[:0], uint64(r), 10)
275	if n = len(b) + len("&#;"); n >= len(dst) {
276	return 0, false
277	}
278	dst[0] = '&'
279	dst[1] = '#'
280	dst[copy(dst[2:], b)+2] = ';'
281	return n, true
282	}
283
284	func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) {
285	if len(dst) == 0 {
286	return 0, false
287	}
288	dst[0] = err.Replacement()
289	return 1, true
290	}
291
// ErrInvalidUTF8 is returned by a transformer that encountered invalid UTF-8.
var ErrInvalidUTF8 error = errors.New("encoding: invalid UTF-8")
294
295	// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
296	// input byte that is not valid UTF-8.
297	var UTF8Validator transform.Transformer = utf8Validator{}
298
299	type utf8Validator struct{ transform.NopResetter }
300
301	func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
302	n := len(src)
303	if n > len(dst) {
304	n = len(dst)
305	}
306	for i := 0; i < n; {
307	if c := src[i]; c < utf8.RuneSelf {
308	dst[i] = c
309	i++
310	continue
311	}
312	_, size := utf8.DecodeRune(src[i:])
313	if size == 1 {
314	// All valid runes of size 1 (those below utf8.RuneSelf) were
315	// handled above. We have invalid UTF-8 or we haven't seen the
316	// full character yet.
317	err = ErrInvalidUTF8
318	if !atEOF && !utf8.FullRune(src[i:]) {
319	err = transform.ErrShortSrc
320	}
321	return i, i, err
322	}
323	if i+size > len(dst) {
324	return i, i, transform.ErrShortDst
325	}
326	for ; size > 0; size-- {
327	dst[i] = src[i]
328	i++
329	}
330	}
331	if len(src) > len(dst) {
332	err = transform.ErrShortDst
333	}
334	return n, n, err
335	}