rss-tools: vendor/golang.org/x/net/html/charset/charset.go (master)

1

// Copyright 2013 The Go Authors. All rights reserved.

2

// Use of this source code is governed by a BSD-style

3

// license that can be found in the LICENSE file.

4

5

// Package charset provides common text encodings for HTML documents.

6

//

7

// The mapping from encoding labels to encodings is defined at

8

// https://encoding.spec.whatwg.org/.

9

package charset // import "golang.org/x/net/html/charset"

10

11

import (

12

	"bytes"

13

	"fmt"

14

	"io"

15

	"mime"

16

	"strings"

17

	"unicode/utf8"

18

19

	"golang.org/x/net/html"

20

	"golang.org/x/text/encoding"

21

	"golang.org/x/text/encoding/charmap"

22

	"golang.org/x/text/encoding/htmlindex"

23

	"golang.org/x/text/transform"

24

25

26

// Lookup returns the encoding with the specified label, and its canonical

27

// name. It returns nil and the empty string if label is not one of the

28

// standard encodings for HTML. Matching is case-insensitive and ignores

29

// leading and trailing whitespace. Encoders will use HTML escape sequences for

30

// runes that are not supported by the character set.

31

func Lookup(label string) (e encoding.Encoding, name string) {

32

	e, err := htmlindex.Get(label)

33

	if err != nil {

34

		return nil, ""

35

36

	name, _ = htmlindex.Name(e)

37

	return &htmlEncoding{e}, name

38

39

40

type htmlEncoding struct{ encoding.Encoding }

41

42

func (h *htmlEncoding) NewEncoder() *encoding.Encoder {

43

	// HTML requires a non-terminating legacy encoder. We use HTML escapes to

44

	// substitute unsupported code points.

45

	return encoding.HTMLEscapeUnsupported(h.Encoding.NewEncoder())

46

47

48

// DetermineEncoding determines the encoding of an HTML document by examining

49

// up to the first 1024 bytes of content and the declared Content-Type.

50

//

51

// See http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#determining-the-character-encoding

52

func DetermineEncoding(content []byte, contentType string) (e encoding.Encoding, name string, certain bool) {

53

	if len(content) > 1024 {

54

		content = content[:1024]

55

56

57

	for _, b := range boms {

58

		if bytes.HasPrefix(content, b.bom) {

59

			e, name = Lookup(b.enc)

60

			return e, name, true

61

62

63

64

	if _, params, err := mime.ParseMediaType(contentType); err == nil {

65

		if cs, ok := params["charset"]; ok {

66

			if e, name = Lookup(cs); e != nil {

67

				return e, name, true

68

69

70

71

72

	if len(content) > 0 {

73

		e, name = prescan(content)

74

		if e != nil {

75

			return e, name, false

76

77

78

79

	// Try to detect UTF-8.

80

	// First eliminate any partial rune at the end.

81

	for i := len(content) - 1; i >= 0 && i > len(content)-4; i-- {

82

		b := content[i]

83

		if b < 0x80 {

84

			break

85

86

		if utf8.RuneStart(b) {

87

			content = content[:i]

88

			break

89

90

91

	hasHighBit := false

92

	for _, c := range content {

93

		if c >= 0x80 {

94

			hasHighBit = true

95

			break

96

97

98

	if hasHighBit && utf8.Valid(content) {

99

		return encoding.Nop, "utf-8", false

100

101

102

	// TODO: change default depending on user's locale?

103

	return charmap.Windows1252, "windows-1252", false

104

105

106

// NewReader returns an io.Reader that converts the content of r to UTF-8.

107

// It calls DetermineEncoding to find out what r's encoding is.

108

func NewReader(r io.Reader, contentType string) (io.Reader, error) {

109

	preview := make([]byte, 1024)

110

	n, err := io.ReadFull(r, preview)

111

	switch {

112

	case err == io.ErrUnexpectedEOF:

113

		preview = preview[:n]

114

		r = bytes.NewReader(preview)

115

	case err != nil:

116

		return nil, err

117

	default:

118

		r = io.MultiReader(bytes.NewReader(preview), r)

119

120

121

	if e, _, _ := DetermineEncoding(preview, contentType); e != encoding.Nop {

122

		r = transform.NewReader(r, e.NewDecoder())

123

124

	return r, nil

125

126

127

// NewReaderLabel returns a reader that converts from the specified charset to

128

// UTF-8. It uses Lookup to find the encoding that corresponds to label, and

129

// returns an error if Lookup returns nil. It is suitable for use as

130

// encoding/xml.Decoder's CharsetReader function.

131

func NewReaderLabel(label string, input io.Reader) (io.Reader, error) {

132

	e, _ := Lookup(label)

133

	if e == nil {

134

		return nil, fmt.Errorf("unsupported charset: %q", label)

135

136

	return transform.NewReader(input, e.NewDecoder()), nil

137

138

139

func prescan(content []byte) (e encoding.Encoding, name string) {

140

	z := html.NewTokenizer(bytes.NewReader(content))

141

	for {

142

		switch z.Next() {

143

		case html.ErrorToken:

144

			return nil, ""

145

146

		case html.StartTagToken, html.SelfClosingTagToken:

147

			tagName, hasAttr := z.TagName()

148

			if !bytes.Equal(tagName, []byte("meta")) {

149

				continue

150

151

			attrList := make(map[string]bool)

152

			gotPragma := false

153

154

			const (

155

				dontKnow = iota

156

				doNeedPragma

157

				doNotNeedPragma

158

159

			needPragma := dontKnow

160

161

			name = ""

162

			e = nil

163

			for hasAttr {

164

				var key, val []byte

165

				key, val, hasAttr = z.TagAttr()

166

				ks := string(key)

167

				if attrList[ks] {

168

					continue

169

170

				attrList[ks] = true

171

				for i, c := range val {

172

					if 'A' <= c && c <= 'Z' {

173

						val[i] = c + 0x20

174

175

176

177

				switch ks {

178

				case "http-equiv":

179

					if bytes.Equal(val, []byte("content-type")) {

180

						gotPragma = true

181

182

183

				case "content":

184

					if e == nil {

185

						name = fromMetaElement(string(val))

186

						if name != "" {

187

							e, name = Lookup(name)

188

							if e != nil {

189

								needPragma = doNeedPragma

190

191

192

193

194

				case "charset":

195

					e, name = Lookup(string(val))

196

					needPragma = doNotNeedPragma

197

198

199

200

			if needPragma == dontKnow || needPragma == doNeedPragma && !gotPragma {

201

				continue

202

203

204

			if strings.HasPrefix(name, "utf-16") {

205

				name = "utf-8"

206

				e = encoding.Nop

207

208

209

			if e != nil {

210

				return e, name

211

212

213

214

215

// fromMetaElement extracts the charset label from the value of a meta
// element's content attribute, for example "text/html; charset=utf-8".
//
// It looks for the first occurrence of "charset" that is followed, after
// optional whitespace, by "=". The value after the "=" (again after optional
// whitespace) is returned: either the text between matching single or double
// quotes, or the run of characters up to the next semicolon or whitespace.
// It returns the empty string if no such value is present or if an opening
// quote is never closed. The match on "charset" is case-sensitive, so callers
// are expected to pass lower-cased input.
func fromMetaElement(s string) string {
	const space = " \t\n\f\r"

	for s != "" {
		csLoc := strings.Index(s, "charset")
		if csLoc == -1 {
			return ""
		}
		s = s[csLoc+len("charset"):]
		s = strings.TrimLeft(s, space)
		if !strings.HasPrefix(s, "=") {
			// Not a charset parameter; keep searching after this match.
			continue
		}
		s = s[1:]
		s = strings.TrimLeft(s, space)
		if s == "" {
			return ""
		}
		if q := s[0]; q == '"' || q == '\'' {
			s = s[1:]
			closeQuote := strings.IndexByte(s, q)
			if closeQuote == -1 {
				return ""
			}
			return s[:closeQuote]
		}

		end := strings.IndexAny(s, ";"+space)
		if end == -1 {
			end = len(s)
		}
		return s[:end]
	}
	return ""
}

249

// boms lists the byte order marks that DetermineEncoding checks at the start
// of the content, each paired with the label of the encoding it signals.
var boms = []struct {
	bom []byte
	enc string
}{
	{[]byte{0xfe, 0xff}, "utf-16be"},
	{[]byte{0xff, 0xfe}, "utf-16le"},
	{[]byte{0xef, 0xbb, 0xbf}, "utf-8"},
}

1	// Copyright 2013 The Go Authors. All rights reserved.
2	// Use of this source code is governed by a BSD-style
3	// license that can be found in the LICENSE file.
4
5	// Package charset provides common text encodings for HTML documents.
6	//
7	// The mapping from encoding labels to encodings is defined at
8	// https://encoding.spec.whatwg.org/.
9	package charset // import "golang.org/x/net/html/charset"
10
11	import (
12	"bytes"
13	"fmt"
14	"io"
15	"mime"
16	"strings"
17	"unicode/utf8"
18
19	"golang.org/x/net/html"
20	"golang.org/x/text/encoding"
21	"golang.org/x/text/encoding/charmap"
22	"golang.org/x/text/encoding/htmlindex"
23	"golang.org/x/text/transform"
24	)
25
26	// Lookup returns the encoding with the specified label, and its canonical
27	// name. It returns nil and the empty string if label is not one of the
28	// standard encodings for HTML. Matching is case-insensitive and ignores
29	// leading and trailing whitespace. Encoders will use HTML escape sequences for
30	// runes that are not supported by the character set.
31	func Lookup(label string) (e encoding.Encoding, name string) {
32	e, err := htmlindex.Get(label)
33	if err != nil {
34	return nil, ""
35	}
36	name, _ = htmlindex.Name(e)
37	return &htmlEncoding{e}, name
38	}
39
40	type htmlEncoding struct{ encoding.Encoding }
41
42	func (h htmlEncoding) NewEncoder() encoding.Encoder {
43	// HTML requires a non-terminating legacy encoder. We use HTML escapes to
44	// substitute unsupported code points.
45	return encoding.HTMLEscapeUnsupported(h.Encoding.NewEncoder())
46	}
47
48	// DetermineEncoding determines the encoding of an HTML document by examining
49	// up to the first 1024 bytes of content and the declared Content-Type.
50	//
51	// See http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#determining-the-character-encoding
52	func DetermineEncoding(content []byte, contentType string) (e encoding.Encoding, name string, certain bool) {
53	if len(content) > 1024 {
54	content = content[:1024]
55	}
56
57	for _, b := range boms {
58	if bytes.HasPrefix(content, b.bom) {
59	e, name = Lookup(b.enc)
60	return e, name, true
61	}
62	}
63
64	if _, params, err := mime.ParseMediaType(contentType); err == nil {
65	if cs, ok := params["charset"]; ok {
66	if e, name = Lookup(cs); e != nil {
67	return e, name, true
68	}
69	}
70	}
71
72	if len(content) > 0 {
73	e, name = prescan(content)
74	if e != nil {
75	return e, name, false
76	}
77	}
78
79	// Try to detect UTF-8.
80	// First eliminate any partial rune at the end.
81	for i := len(content) - 1; i >= 0 && i > len(content)-4; i-- {
82	b := content[i]
83	if b < 0x80 {
84	break
85	}
86	if utf8.RuneStart(b) {
87	content = content[:i]
88	break
89	}
90	}
91	hasHighBit := false
92	for _, c := range content {
93	if c >= 0x80 {
94	hasHighBit = true
95	break
96	}
97	}
98	if hasHighBit && utf8.Valid(content) {
99	return encoding.Nop, "utf-8", false
100	}
101
102	// TODO: change default depending on user's locale?
103	return charmap.Windows1252, "windows-1252", false
104	}
105
106	// NewReader returns an io.Reader that converts the content of r to UTF-8.
107	// It calls DetermineEncoding to find out what r's encoding is.
108	func NewReader(r io.Reader, contentType string) (io.Reader, error) {
109	preview := make([]byte, 1024)
110	n, err := io.ReadFull(r, preview)
111	switch {
112	case err == io.ErrUnexpectedEOF:
113	preview = preview[:n]
114	r = bytes.NewReader(preview)
115	case err != nil:
116	return nil, err
117	default:
118	r = io.MultiReader(bytes.NewReader(preview), r)
119	}
120
121	if e, _, _ := DetermineEncoding(preview, contentType); e != encoding.Nop {
122	r = transform.NewReader(r, e.NewDecoder())
123	}
124	return r, nil
125	}
126
127	// NewReaderLabel returns a reader that converts from the specified charset to
128	// UTF-8. It uses Lookup to find the encoding that corresponds to label, and
129	// returns an error if Lookup returns nil. It is suitable for use as
130	// encoding/xml.Decoder's CharsetReader function.
131	func NewReaderLabel(label string, input io.Reader) (io.Reader, error) {
132	e, _ := Lookup(label)
133	if e == nil {
134	return nil, fmt.Errorf("unsupported charset: %q", label)
135	}
136	return transform.NewReader(input, e.NewDecoder()), nil
137	}
138
139	func prescan(content []byte) (e encoding.Encoding, name string) {
140	z := html.NewTokenizer(bytes.NewReader(content))
141	for {
142	switch z.Next() {
143	case html.ErrorToken:
144	return nil, ""
145
146	case html.StartTagToken, html.SelfClosingTagToken:
147	tagName, hasAttr := z.TagName()
148	if !bytes.Equal(tagName, []byte("meta")) {
149	continue
150	}
151	attrList := make(map[string]bool)
152	gotPragma := false
153
154	const (
155	dontKnow = iota
156	doNeedPragma
157	doNotNeedPragma
158	)
159	needPragma := dontKnow
160
161	name = ""
162	e = nil
163	for hasAttr {
164	var key, val []byte
165	key, val, hasAttr = z.TagAttr()
166	ks := string(key)
167	if attrList[ks] {
168	continue
169	}
170	attrList[ks] = true
171	for i, c := range val {
172	if 'A' <= c && c <= 'Z' {
173	val[i] = c + 0x20
174	}
175	}
176
177	switch ks {
178	case "http-equiv":
179	if bytes.Equal(val, []byte("content-type")) {
180	gotPragma = true
181	}
182
183	case "content":
184	if e == nil {
185	name = fromMetaElement(string(val))
186	if name != "" {
187	e, name = Lookup(name)
188	if e != nil {
189	needPragma = doNeedPragma
190	}
191	}
192	}
193
194	case "charset":
195	e, name = Lookup(string(val))
196	needPragma = doNotNeedPragma
197	}
198	}
199
200	if needPragma == dontKnow \|\| needPragma == doNeedPragma && !gotPragma {
201	continue
202	}
203
204	if strings.HasPrefix(name, "utf-16") {
205	name = "utf-8"
206	e = encoding.Nop
207	}
208
209	if e != nil {
210	return e, name
211	}
212	}
213	}
214	}
215
// fromMetaElement extracts the charset label from the value of a meta
// element's content attribute, for example "text/html; charset=utf-8".
//
// It looks for the first occurrence of "charset" that is followed, after
// optional whitespace, by "=". The value after the "=" (again after optional
// whitespace) is returned: either the text between matching single or double
// quotes, or the run of characters up to the next semicolon or whitespace.
// It returns the empty string if no such value is present or if an opening
// quote is never closed. The match on "charset" is case-sensitive, so callers
// are expected to pass lower-cased input.
func fromMetaElement(s string) string {
	const space = " \t\n\f\r"

	for s != "" {
		csLoc := strings.Index(s, "charset")
		if csLoc == -1 {
			return ""
		}
		s = s[csLoc+len("charset"):]
		s = strings.TrimLeft(s, space)
		if !strings.HasPrefix(s, "=") {
			// Not a charset parameter; keep searching after this match.
			continue
		}
		s = s[1:]
		s = strings.TrimLeft(s, space)
		if s == "" {
			return ""
		}
		if q := s[0]; q == '"' || q == '\'' {
			s = s[1:]
			closeQuote := strings.IndexByte(s, q)
			if closeQuote == -1 {
				return ""
			}
			return s[:closeQuote]
		}

		end := strings.IndexAny(s, ";"+space)
		if end == -1 {
			end = len(s)
		}
		return s[:end]
	}
	return ""
}
249
// boms lists the byte order marks that DetermineEncoding checks at the start
// of the content, each paired with the label of the encoding it signals.
var boms = []struct {
	bom []byte
	enc string
}{
	{[]byte{0xfe, 0xff}, "utf-16be"},
	{[]byte{0xff, 0xfe}, "utf-16le"},
	{[]byte{0xef, 0xbb, 0xbf}, "utf-8"},
}