all repos

rss-tools @ a5ac527

Get RSS feeds from sources that (I need and) don't provide one

rss-tools/vendor/golang.org/x/text/encoding/simplifiedchinese/gbk.go (view raw)

Oleksandr Smirnov Oleksandr Smirnov
olexsmir@gmail.com
we're vendoring now, 7 days ago
1
// Copyright 2013 The Go Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
package simplifiedchinese
6
7
import (
8
	"unicode/utf8"
9
10
	"golang.org/x/text/encoding"
11
	"golang.org/x/text/encoding/internal"
12
	"golang.org/x/text/encoding/internal/identifier"
13
	"golang.org/x/text/transform"
14
)
15
16
var (
17
	// GB18030 is the GB18030 encoding.
18
	GB18030 encoding.Encoding = &gbk18030
19
	// GBK is the GBK encoding. It encodes an extension of the GB2312 character set
20
	// and is also known as Code Page 936.
21
	GBK encoding.Encoding = &gbk
22
)
23
24
var gbk = internal.Encoding{
25
	Encoding: &internal.SimpleEncoding{
26
		Decoder: gbkDecoder{gb18030: false},
27
		Encoder: gbkEncoder{gb18030: false},
28
	},
29
	Name: "GBK",
30
	MIB:  identifier.GBK,
31
}
32
33
var gbk18030 = internal.Encoding{
34
	Encoding: &internal.SimpleEncoding{
35
		Decoder: gbkDecoder{gb18030: true},
36
		Encoder: gbkEncoder{gb18030: true},
37
	},
38
	Name: "GB18030",
39
	MIB:  identifier.GB18030,
40
}
41
42
type gbkDecoder struct {
43
	transform.NopResetter
44
	gb18030 bool
45
}
46
47
func (d gbkDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
48
	r, size := rune(0), 0
49
loop:
50
	for ; nSrc < len(src); nSrc += size {
51
		switch c0 := src[nSrc]; {
52
		case c0 < utf8.RuneSelf:
53
			r, size = rune(c0), 1
54
55
		// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
56
		// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
57
		// says to treat "gbk" as Code Page 936.
58
		// GBK’s decoder is gb18030’s decoder. https://encoding.spec.whatwg.org/#gbk-decoder
59
		// If byte is 0x80, return code point U+20AC. https://encoding.spec.whatwg.org/#gb18030-decoder
60
		case c0 == 0x80:
61
			r, size = '€', 1
62
63
		case c0 < 0xff:
64
			if nSrc+1 >= len(src) {
65
				if !atEOF {
66
					err = transform.ErrShortSrc
67
					break loop
68
				}
69
				r, size = utf8.RuneError, 1
70
				goto write
71
			}
72
			c1 := src[nSrc+1]
73
			switch {
74
			case 0x40 <= c1 && c1 < 0x7f:
75
				c1 -= 0x40
76
			case 0x80 <= c1 && c1 < 0xff:
77
				c1 -= 0x41
78
			case d.gb18030 && 0x30 <= c1 && c1 < 0x40:
79
				if nSrc+3 >= len(src) {
80
					if !atEOF {
81
						err = transform.ErrShortSrc
82
						break loop
83
					}
84
					// The second byte here is always ASCII, so we can set size
85
					// to 1 in all cases.
86
					r, size = utf8.RuneError, 1
87
					goto write
88
				}
89
				c2 := src[nSrc+2]
90
				if c2 < 0x81 || 0xff <= c2 {
91
					r, size = utf8.RuneError, 1
92
					goto write
93
				}
94
				c3 := src[nSrc+3]
95
				if c3 < 0x30 || 0x3a <= c3 {
96
					r, size = utf8.RuneError, 1
97
					goto write
98
				}
99
				size = 4
100
				r = ((rune(c0-0x81)*10+rune(c1-0x30))*126+rune(c2-0x81))*10 + rune(c3-0x30)
101
				if r < 39420 {
102
					i, j := 0, len(gb18030)
103
					for i < j {
104
						h := i + (j-i)/2
105
						if r >= rune(gb18030[h][0]) {
106
							i = h + 1
107
						} else {
108
							j = h
109
						}
110
					}
111
					dec := &gb18030[i-1]
112
					r += rune(dec[1]) - rune(dec[0])
113
					goto write
114
				}
115
				r -= 189000
116
				if 0 <= r && r < 0x100000 {
117
					r += 0x10000
118
				} else {
119
					r, size = utf8.RuneError, 1
120
				}
121
				goto write
122
			default:
123
				r, size = utf8.RuneError, 1
124
				goto write
125
			}
126
			r, size = '\ufffd', 2
127
			if i := int(c0-0x81)*190 + int(c1); i < len(decode) {
128
				r = rune(decode[i])
129
				if r == 0 {
130
					r = '\ufffd'
131
				}
132
			}
133
134
		default:
135
			r, size = utf8.RuneError, 1
136
		}
137
138
	write:
139
		if nDst+utf8.RuneLen(r) > len(dst) {
140
			err = transform.ErrShortDst
141
			break loop
142
		}
143
		nDst += utf8.EncodeRune(dst[nDst:], r)
144
	}
145
	return nDst, nSrc, err
146
}
147
148
type gbkEncoder struct {
149
	transform.NopResetter
150
	gb18030 bool
151
}
152
153
func (e gbkEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
154
	r, r2, size := rune(0), rune(0), 0
155
	for ; nSrc < len(src); nSrc += size {
156
		r = rune(src[nSrc])
157
158
		// Decode a 1-byte rune.
159
		if r < utf8.RuneSelf {
160
			size = 1
161
162
		} else {
163
			// Decode a multi-byte rune.
164
			r, size = utf8.DecodeRune(src[nSrc:])
165
			if size == 1 {
166
				// All valid runes of size 1 (those below utf8.RuneSelf) were
167
				// handled above. We have invalid UTF-8 or we haven't seen the
168
				// full character yet.
169
				if !atEOF && !utf8.FullRune(src[nSrc:]) {
170
					err = transform.ErrShortSrc
171
					break
172
				}
173
			}
174
175
			// func init checks that the switch covers all tables.
176
			switch {
177
			case encode0Low <= r && r < encode0High:
178
				if r2 = rune(encode0[r-encode0Low]); r2 != 0 {
179
					goto write2
180
				}
181
			case encode1Low <= r && r < encode1High:
182
				// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
183
				// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
184
				// says to treat "gbk" as Code Page 936.
185
				// GBK’s encoder is gb18030’s encoder with its _is GBK_ set to true. https://encoding.spec.whatwg.org/#gbk-encoder
186
				// If _is GBK_ is true and code point is U+20AC, return byte 0x80. https://encoding.spec.whatwg.org/#gb18030-encoder
187
				if !e.gb18030 && r == '€' {
188
					r = 0x80
189
					goto write1
190
				}
191
				if r2 = rune(encode1[r-encode1Low]); r2 != 0 {
192
					goto write2
193
				}
194
			case encode2Low <= r && r < encode2High:
195
				if r2 = rune(encode2[r-encode2Low]); r2 != 0 {
196
					goto write2
197
				}
198
			case encode3Low <= r && r < encode3High:
199
				if r2 = rune(encode3[r-encode3Low]); r2 != 0 {
200
					goto write2
201
				}
202
			case encode4Low <= r && r < encode4High:
203
				if r2 = rune(encode4[r-encode4Low]); r2 != 0 {
204
					goto write2
205
				}
206
			}
207
208
			if e.gb18030 {
209
				if r < 0x10000 {
210
					i, j := 0, len(gb18030)
211
					for i < j {
212
						h := i + (j-i)/2
213
						if r >= rune(gb18030[h][1]) {
214
							i = h + 1
215
						} else {
216
							j = h
217
						}
218
					}
219
					dec := &gb18030[i-1]
220
					r += rune(dec[0]) - rune(dec[1])
221
					goto write4
222
				} else if r < 0x110000 {
223
					r += 189000 - 0x10000
224
					goto write4
225
				}
226
			}
227
			err = internal.ErrASCIIReplacement
228
			break
229
		}
230
231
	write1:
232
		if nDst >= len(dst) {
233
			err = transform.ErrShortDst
234
			break
235
		}
236
		dst[nDst] = uint8(r)
237
		nDst++
238
		continue
239
240
	write2:
241
		if nDst+2 > len(dst) {
242
			err = transform.ErrShortDst
243
			break
244
		}
245
		dst[nDst+0] = uint8(r2 >> 8)
246
		dst[nDst+1] = uint8(r2)
247
		nDst += 2
248
		continue
249
250
	write4:
251
		if nDst+4 > len(dst) {
252
			err = transform.ErrShortDst
253
			break
254
		}
255
		dst[nDst+3] = uint8(r%10 + 0x30)
256
		r /= 10
257
		dst[nDst+2] = uint8(r%126 + 0x81)
258
		r /= 126
259
		dst[nDst+1] = uint8(r%10 + 0x30)
260
		r /= 10
261
		dst[nDst+0] = uint8(r + 0x81)
262
		nDst += 4
263
		continue
264
	}
265
	return nDst, nSrc, err
266
}
267
268
func init() {
269
	// Check that the hard-coded encode switch covers all tables.
270
	if numEncodeTables != 5 {
271
		panic("bad numEncodeTables")
272
	}
273
}