all repos

rss-tools @ master

Get RSS feeds from sources that (I need and) don't provide one.

rss-tools/vendor/golang.org/x/text/encoding/encoding.go (view raw)

Oleksandr Smirnov Oleksandr Smirnov
olexsmir@gmail.com
we're vendoring now, 7 days ago
1
// Copyright 2013 The Go Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
// Package encoding defines an interface for character encodings, such as Shift
6
// JIS and Windows 1252, that can convert to and from UTF-8.
7
//
8
// Encoding implementations are provided in other packages, such as
9
// golang.org/x/text/encoding/charmap and
10
// golang.org/x/text/encoding/japanese.
11
package encoding // import "golang.org/x/text/encoding"
12
13
import (
14
	"errors"
15
	"io"
16
	"strconv"
17
	"unicode/utf8"
18
19
	"golang.org/x/text/encoding/internal/identifier"
20
	"golang.org/x/text/transform"
21
)
22
23
// TODO:
// - There seems to be some inconsistency in when decoders return errors
//   and when not. Also, the documentation seems to suggest they shouldn't
//   return errors at all (except for UTF-16).
// - Encoders seem to rely on or at least benefit from the input being in NFC
//   normal form. Perhaps add an example of how users could prepare their
//   output.
29
30
// Encoding is a character set encoding that can be transformed to and from
31
// UTF-8.
32
type Encoding interface {
33
	// NewDecoder returns a Decoder.
34
	NewDecoder() *Decoder
35
36
	// NewEncoder returns an Encoder.
37
	NewEncoder() *Encoder
38
}
39
40
// A Decoder converts bytes to UTF-8. It implements transform.Transformer.
41
//
42
// Transforming source bytes that are not of that encoding will not result in an
43
// error per se. Each byte that cannot be transcoded will be represented in the
44
// output by the UTF-8 encoding of '\uFFFD', the replacement rune.
45
type Decoder struct {
46
	transform.Transformer
47
48
	// This forces external creators of Decoders to use names in struct
49
	// initializers, allowing for future extendibility without having to break
50
	// code.
51
	_ struct{}
52
}
53
54
// Bytes converts the given encoded bytes to UTF-8. It returns the converted
55
// bytes or nil, err if any error occurred.
56
func (d *Decoder) Bytes(b []byte) ([]byte, error) {
57
	b, _, err := transform.Bytes(d, b)
58
	if err != nil {
59
		return nil, err
60
	}
61
	return b, nil
62
}
63
64
// String converts the given encoded string to UTF-8. It returns the converted
65
// string or "", err if any error occurred.
66
func (d *Decoder) String(s string) (string, error) {
67
	s, _, err := transform.String(d, s)
68
	if err != nil {
69
		return "", err
70
	}
71
	return s, nil
72
}
73
74
// Reader wraps another Reader to decode its bytes.
75
//
76
// The Decoder may not be used for any other operation as long as the returned
77
// Reader is in use.
78
func (d *Decoder) Reader(r io.Reader) io.Reader {
79
	return transform.NewReader(r, d)
80
}
81
82
// An Encoder converts bytes from UTF-8. It implements transform.Transformer.
83
//
84
// Each rune that cannot be transcoded will result in an error. In this case,
85
// the transform will consume all source byte up to, not including the offending
86
// rune. Transforming source bytes that are not valid UTF-8 will be replaced by
87
// `\uFFFD`. To return early with an error instead, use transform.Chain to
88
// preprocess the data with a UTF8Validator.
89
type Encoder struct {
90
	transform.Transformer
91
92
	// This forces external creators of Encoders to use names in struct
93
	// initializers, allowing for future extendibility without having to break
94
	// code.
95
	_ struct{}
96
}
97
98
// Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if
99
// any error occurred.
100
func (e *Encoder) Bytes(b []byte) ([]byte, error) {
101
	b, _, err := transform.Bytes(e, b)
102
	if err != nil {
103
		return nil, err
104
	}
105
	return b, nil
106
}
107
108
// String converts a string from UTF-8. It returns the converted string or
109
// "", err if any error occurred.
110
func (e *Encoder) String(s string) (string, error) {
111
	s, _, err := transform.String(e, s)
112
	if err != nil {
113
		return "", err
114
	}
115
	return s, nil
116
}
117
118
// Writer wraps another Writer to encode its UTF-8 output.
119
//
120
// The Encoder may not be used for any other operation as long as the returned
121
// Writer is in use.
122
func (e *Encoder) Writer(w io.Writer) io.Writer {
123
	return transform.NewWriter(w, e)
124
}
125
126
// ASCIISub is the ASCII substitute character (0x1A), as recommended by
// https://unicode.org/reports/tr36/#Text_Comparison
const ASCIISub = '\x1a'
129
130
// Nop is the nop encoding. Its transformed bytes are the same as the source
131
// bytes; it does not replace invalid UTF-8 sequences.
132
var Nop Encoding = nop{}
133
134
type nop struct{}
135
136
func (nop) NewDecoder() *Decoder {
137
	return &Decoder{Transformer: transform.Nop}
138
}
139
func (nop) NewEncoder() *Encoder {
140
	return &Encoder{Transformer: transform.Nop}
141
}
142
143
// Replacement is the replacement encoding. Decoding from the replacement
144
// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
145
// the replacement encoding yields the same as the source bytes except that
146
// invalid UTF-8 is converted to '\uFFFD'.
147
//
148
// It is defined at http://encoding.spec.whatwg.org/#replacement
149
var Replacement Encoding = replacement{}
150
151
type replacement struct{}
152
153
func (replacement) NewDecoder() *Decoder {
154
	return &Decoder{Transformer: replacementDecoder{}}
155
}
156
157
func (replacement) NewEncoder() *Encoder {
158
	return &Encoder{Transformer: replacementEncoder{}}
159
}
160
161
func (replacement) ID() (mib identifier.MIB, other string) {
162
	return identifier.Replacement, ""
163
}
164
165
type replacementDecoder struct{ transform.NopResetter }
166
167
func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
168
	if len(dst) < 3 {
169
		return 0, 0, transform.ErrShortDst
170
	}
171
	if atEOF {
172
		const fffd = "\ufffd"
173
		dst[0] = fffd[0]
174
		dst[1] = fffd[1]
175
		dst[2] = fffd[2]
176
		nDst = 3
177
	}
178
	return nDst, len(src), nil
179
}
180
181
type replacementEncoder struct{ transform.NopResetter }
182
183
func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
184
	r, size := rune(0), 0
185
186
	for ; nSrc < len(src); nSrc += size {
187
		r = rune(src[nSrc])
188
189
		// Decode a 1-byte rune.
190
		if r < utf8.RuneSelf {
191
			size = 1
192
193
		} else {
194
			// Decode a multi-byte rune.
195
			r, size = utf8.DecodeRune(src[nSrc:])
196
			if size == 1 {
197
				// All valid runes of size 1 (those below utf8.RuneSelf) were
198
				// handled above. We have invalid UTF-8 or we haven't seen the
199
				// full character yet.
200
				if !atEOF && !utf8.FullRune(src[nSrc:]) {
201
					err = transform.ErrShortSrc
202
					break
203
				}
204
				r = '\ufffd'
205
			}
206
		}
207
208
		if nDst+utf8.RuneLen(r) > len(dst) {
209
			err = transform.ErrShortDst
210
			break
211
		}
212
		nDst += utf8.EncodeRune(dst[nDst:], r)
213
	}
214
	return nDst, nSrc, err
215
}
216
217
// HTMLEscapeUnsupported wraps encoders to replace source runes outside the
218
// repertoire of the destination encoding with HTML escape sequences.
219
//
220
// This wrapper exists to comply to URL and HTML forms requiring a
221
// non-terminating legacy encoder. The produced sequences may lead to data
222
// loss as they are indistinguishable from legitimate input. To avoid this
223
// issue, use UTF-8 encodings whenever possible.
224
func HTMLEscapeUnsupported(e *Encoder) *Encoder {
225
	return &Encoder{Transformer: &errorHandler{e, errorToHTML}}
226
}
227
228
// ReplaceUnsupported wraps encoders to replace source runes outside the
229
// repertoire of the destination encoding with an encoding-specific
230
// replacement.
231
//
232
// This wrapper is only provided for backwards compatibility and legacy
233
// handling. Its use is strongly discouraged. Use UTF-8 whenever possible.
234
func ReplaceUnsupported(e *Encoder) *Encoder {
235
	return &Encoder{Transformer: &errorHandler{e, errorToReplacement}}
236
}
237
238
type errorHandler struct {
239
	*Encoder
240
	handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)
241
}
242
243
// repertoireError is the error an encoder reports for a rune it cannot
// represent. Replacement returns the byte that errorToReplacement writes in
// place of such a rune.
//
// TODO: consider making this error public in some form.
type repertoireError interface {
	Replacement() byte
}
247
248
// Transform runs the wrapped Encoder's Transform. Each time that stops on an
// error implementing repertoireError, it lets h.handler write a substitute for
// the rejected rune and then resumes with the rest of src.
//
// Any other error is returned unchanged. If the handler reports that dst is
// too small, Transform returns transform.ErrShortDst with nSrc still pointing
// at the rejected rune.
func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF)
	for err != nil {
		rerr, ok := err.(repertoireError)
		if !ok {
			return nDst, nSrc, err
		}
		// The rejected rune is the one at the position where the wrapped
		// transformer stopped consuming.
		r, sz := utf8.DecodeRune(src[nSrc:])
		n, ok := h.handler(dst[nDst:], r, rerr)
		if !ok {
			return nDst, nSrc, transform.ErrShortDst
		}
		err = nil
		nDst += n
		// Skip the rejected rune and continue only if input remains.
		if nSrc += sz; nSrc < len(src) {
			var dn, sn int
			dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF)
			nDst += dn
			nSrc += sn
		}
	}
	return nDst, nSrc, err
}
271
272
// errorToHTML writes r to dst as a decimal HTML numeric character reference
// of the form "&#N;". The err argument is not used.
//
// It returns the number of bytes written and true, or 0 and false when dst is
// too small to hold the whole reference; in that case dst is left untouched.
func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) {
	buf := [8]byte{}
	b := strconv.AppendUint(buf[:0], uint64(r), 10)
	// A destination of exactly n bytes is large enough: the last byte
	// written is dst[n-1].
	if n = len(b) + len("&#;"); n > len(dst) {
		return 0, false
	}
	dst[0] = '&'
	dst[1] = '#'
	dst[copy(dst[2:], b)+2] = ';'
	return n, true
}
283
284
// errorToReplacement writes the single byte returned by err.Replacement() to
// dst. The rune r is not used.
//
// It returns 1 and true, or 0 and false when dst is empty.
func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) {
	if len(dst) == 0 {
		return 0, false
	}
	dst[0] = err.Replacement()
	return 1, true
}
291
292
// ErrInvalidUTF8 means that a transformer encountered invalid UTF-8.
var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")
294
295
// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
296
// input byte that is not valid UTF-8.
297
var UTF8Validator transform.Transformer = utf8Validator{}
298
299
type utf8Validator struct{ transform.NopResetter }
300
301
// Transform copies src to dst unchanged for as long as src holds valid UTF-8.
//
// It returns ErrInvalidUTF8 at the first invalid byte, transform.ErrShortSrc
// when src ends in an incomplete rune and atEOF is false, and
// transform.ErrShortDst when dst cannot hold all of src. nDst and nSrc are
// always equal and count the bytes copied so far.
func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	// Only runes that start within the first min(len(src), len(dst)) bytes
	// are considered.
	n := len(src)
	if n > len(dst) {
		n = len(dst)
	}
	for i := 0; i < n; {
		if c := src[i]; c < utf8.RuneSelf {
			dst[i] = c
			i++
			continue
		}
		_, size := utf8.DecodeRune(src[i:])
		if size == 1 {
			// All valid runes of size 1 (those below utf8.RuneSelf) were
			// handled above. We have invalid UTF-8 or we haven't seen the
			// full character yet.
			err = ErrInvalidUTF8
			if !atEOF && !utf8.FullRune(src[i:]) {
				err = transform.ErrShortSrc
			}
			return i, i, err
		}
		// Never split a multi-byte rune across the end of dst.
		if i+size > len(dst) {
			return i, i, transform.ErrShortDst
		}
		i += copy(dst[i:], src[i:i+size])
	}
	if len(src) > len(dst) {
		err = transform.ErrShortDst
	}
	return n, n, err
}