rss-tools: vendor/golang.org/x/net/html/escape.go (master)

1

// Copyright 2010 The Go Authors. All rights reserved.

2

// Use of this source code is governed by a BSD-style

3

// license that can be found in the LICENSE file.

4

5

package html

6

7

import (

8

	"bytes"

9

	"strings"

10

	"unicode/utf8"

11

12

13

// replacementTable permits compatibility with old numeric entities that
// assumed Windows-1252 encoding. Entry k holds the rune that the numeric
// character reference 0x80+k is replaced with.
// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
var replacementTable = [...]rune{
	'\u20AC', // First entry is what 0x80 should be replaced with.
	'\u0081',
	'\u201A',
	'\u0192',
	'\u201E',
	'\u2026',
	'\u2020',
	'\u2021',
	'\u02C6',
	'\u2030',
	'\u0160',
	'\u2039',
	'\u0152',
	'\u008D',
	'\u017D',
	'\u008F',
	'\u0090',
	'\u2018',
	'\u2019',
	'\u201C',
	'\u201D',
	'\u2022',
	'\u2013',
	'\u2014',
	'\u02DC',
	'\u2122',
	'\u0161',
	'\u203A',
	'\u0153',
	'\u009D',
	'\u017E',
	'\u0178', // Last entry is 0x9F.
	// 0x00->'\uFFFD' is handled programmatically.
	// 0x0D->'\u000D' is a no-op.
}

52

53

// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the

54

// corresponding "<" to b[dst:], returning the incremented dst and src cursors.

55

// Precondition: b[src] == '&' && dst <= src.

56

// attribute should be true if parsing an attribute value.

57

func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {

58

	// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference

59

60

	// i starts at 1 because we already know that s[0] == '&'.

61

	i, s := 1, b[src:]

62

63

	if len(s) <= 1 {

64

		b[dst] = b[src]

65

		return dst + 1, src + 1

66

67

68

	if s[i] == '#' {

69

		if len(s) <= 3 { // We need to have at least "&#.".

70

			b[dst] = b[src]

71

			return dst + 1, src + 1

72

73

i++

74

		c := s[i]

75

		hex := false

76

		if c == 'x' || c == 'X' {

77

			hex = true

78

i++

79

80

81

		x := '\x00'

82

		for i < len(s) {

83

			c = s[i]

84

i++

85

			if hex {

86

				if '0' <= c && c <= '9' {

87

					x = 16*x + rune(c) - '0'

88

					continue

89

				} else if 'a' <= c && c <= 'f' {

90

					x = 16*x + rune(c) - 'a' + 10

91

					continue

92

				} else if 'A' <= c && c <= 'F' {

93

					x = 16*x + rune(c) - 'A' + 10

94

					continue

95

96

			} else if '0' <= c && c <= '9' {

97

				x = 10*x + rune(c) - '0'

98

				continue

99

100

			if c != ';' {

101

i--

102

103

			break

104

105

106

		if i <= 3 { // No characters matched.

107

			b[dst] = b[src]

108

			return dst + 1, src + 1

109

110

111

		if 0x80 <= x && x <= 0x9F {

112

			// Replace characters from Windows-1252 with UTF-8 equivalents.

113

			x = replacementTable[x-0x80]

114

		} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {

115

			// Replace invalid characters with the replacement character.

116

			x = '\uFFFD'

117

118

119

		return dst + utf8.EncodeRune(b[dst:], x), src + i

120

121

122

	// Consume the maximum number of characters possible, with the

123

	// consumed characters matching one of the named references.

124

125

	for i < len(s) {

126

		c := s[i]

127

i++

128

		// Lower-cased characters are more common in entities, so we check for them first.

129

		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {

130

			continue

131

132

		if c != ';' {

133

i--

134

135

		break

136

137

138

	entityName := string(s[1:i])

139

	if entityName == "" {

140

		// No-op.

141

	} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {

142

		// No-op.

143

	} else if x := entity[entityName]; x != 0 {

144

		return dst + utf8.EncodeRune(b[dst:], x), src + i

145

	} else if x := entity2[entityName]; x[0] != 0 {

146

		dst1 := dst + utf8.EncodeRune(b[dst:], x[0])

147

		return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i

148

	} else if !attribute {

149

		maxLen := len(entityName) - 1

150

		if maxLen > longestEntityWithoutSemicolon {

151

			maxLen = longestEntityWithoutSemicolon

152

153

		for j := maxLen; j > 1; j-- {

154

			if x := entity[entityName[:j]]; x != 0 {

155

				return dst + utf8.EncodeRune(b[dst:], x), src + j + 1

156

157

158

159

160

	dst1, src1 = dst+i, src+i

161

	copy(b[dst:dst1], b[src:src1])

162

	return dst1, src1

163

164

165

// unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".

166

// attribute should be true if parsing an attribute value.

167

func unescape(b []byte, attribute bool) []byte {

168

	for i, c := range b {

169

		if c == '&' {

170

			dst, src := unescapeEntity(b, i, i, attribute)

171

			for src < len(b) {

172

				c := b[src]

173

				if c == '&' {

174

					dst, src = unescapeEntity(b, dst, src, attribute)

175

				} else {

176

					b[dst] = c

177

					dst, src = dst+1, src+1

178

179

180

			return b[0:dst]

181

182

183

	return b

184

185

186

// lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
// Every other byte is left untouched. It returns b.
func lower(b []byte) []byte {
	for i, c := range b {
		if 'A' <= c && c <= 'Z' {
			b[i] = c + 'a' - 'A'
		}
	}
	return b
}

195

196

// escapeComment is like func escape but escapes its input bytes less often.

197

// Per https://github.com/golang/go/issues/58246 some HTML comments are (1)

198

// meaningful and (2) contain angle brackets that we'd like to avoid escaping

199

// unless we have to.

200

//

201

// "We have to" includes the '&' byte, since that introduces other escapes.

202

//

203

// It also includes those bytes (not including EOF) that would otherwise end

204

// the comment. Per the summary table at the bottom of comment_test.go, this is

205

// the '>' byte that, per above, we'd like to avoid escaping unless we have to.

206

//

207

// Studying the summary table (and T actions in its '>' column) closely, we

208

// only need to escape in states 43, 44, 49, 51 and 52. State 43 is at the

209

// start of the comment data. State 52 is after a '!'. The other three states

210

// are after a '-'.

211

//

212

// Our algorithm is thus to escape every '&' and to escape '>' if and only if:

213

//   - The '>' is after a '!' or '-' (in the unescaped data) or

214

//   - The '>' is at the start of the comment data (after the opening "<!--").

215

func escapeComment(w writer, s string) error {

216

	// When modifying this function, consider manually increasing the

217

	// maxSuffixLen constant in func TestComments, from 6 to e.g. 9 or more.

218

	// That increase should only be temporary, not committed, as it

219

	// exponentially affects the test running time.

220

221

	if len(s) == 0 {

222

		return nil

223

224

225

	// Loop:

226

	//   - Grow j such that s[i:j] does not need escaping.

227

	//   - If s[j] does need escaping, output s[i:j] and an escaped s[j],

228

	//     resetting i and j to point past that s[j] byte.

229

	i := 0

230

	for j := 0; j < len(s); j++ {

231

		escaped := ""

232

		switch s[j] {

233

		case '&':

234

			escaped = "&amp;"

235

236

		case '>':

237

			if j > 0 {

238

				if prev := s[j-1]; (prev != '!') && (prev != '-') {

239

					continue

240

241

242

			escaped = "&gt;"

243

244

		default:

245

			continue

246

247

248

		if i < j {

249

			if _, err := w.WriteString(s[i:j]); err != nil {

250

				return err

251

252

253

		if _, err := w.WriteString(escaped); err != nil {

254

			return err

255

256

		i = j + 1

257

258

259

	if i < len(s) {

260

		if _, err := w.WriteString(s[i:]); err != nil {

261

			return err

262

263

264

	return nil

265

266

267

// escapeCommentString is to EscapeString as escapeComment is to escape.

268

func escapeCommentString(s string) string {

269

	if strings.IndexAny(s, "&>") == -1 {

270

		return s

271

272

	var buf bytes.Buffer

273

	escapeComment(&buf, s)

274

	return buf.String()

275

276

277

// escapedChars lists the bytes that escape replaces with character
// references: '&', '\'', '<', '>', '"' and carriage return.
const escapedChars = "&'<>\"\r"

278

279

func escape(w writer, s string) error {

280

	i := strings.IndexAny(s, escapedChars)

281

	for i != -1 {

282

		if _, err := w.WriteString(s[:i]); err != nil {

283

			return err

284

285

		var esc string

286

		switch s[i] {

287

		case '&':

288

			esc = "&amp;"

289

		case '\'':

290

			// "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.

291

			esc = "&#39;"

292

		case '<':

293

			esc = "&lt;"

294

		case '>':

295

			esc = "&gt;"

296

		case '"':

297

			// "&#34;" is shorter than "&quot;".

298

			esc = "&#34;"

299

		case '\r':

300

			esc = "&#13;"

301

		default:

302

			panic("html: unrecognized escape character")

303

304

		s = s[i+1:]

305

		if _, err := w.WriteString(esc); err != nil {

306

			return err

307

308

		i = strings.IndexAny(s, escapedChars)

309

310

	_, err := w.WriteString(s)

311

	return err

312

313

314

// EscapeString escapes special characters like "<" to become "&lt;". It

315

// escapes only five such characters: <, >, &, ' and ".

316

// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't

317

// always true.

318

func EscapeString(s string) string {

319

	if strings.IndexAny(s, escapedChars) == -1 {

320

		return s

321

322

	var buf bytes.Buffer

323

	escape(&buf, s)

324

	return buf.String()

325

326

327

// UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a

328

// larger range of entities than EscapeString escapes. For example, "&aacute;"

329

// unescapes to "á", as does "&#225;" and "&xE1;".

330

// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't

331

// always true.

332

func UnescapeString(s string) string {

333

	for _, c := range s {

334

		if c == '&' {

335

			return string(unescape([]byte(s), false))

336

337

338

	return s

339

1	// Copyright 2010 The Go Authors. All rights reserved.
2	// Use of this source code is governed by a BSD-style
3	// license that can be found in the LICENSE file.
4
5	package html
6
7	import (
8	"bytes"
9	"strings"
10	"unicode/utf8"
11	)
12
// replacementTable permits compatibility with old numeric entities that
// assumed Windows-1252 encoding. Entry k holds the rune that the numeric
// character reference 0x80+k is replaced with.
// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
var replacementTable = [...]rune{
	'\u20AC', // First entry is what 0x80 should be replaced with.
	'\u0081',
	'\u201A',
	'\u0192',
	'\u201E',
	'\u2026',
	'\u2020',
	'\u2021',
	'\u02C6',
	'\u2030',
	'\u0160',
	'\u2039',
	'\u0152',
	'\u008D',
	'\u017D',
	'\u008F',
	'\u0090',
	'\u2018',
	'\u2019',
	'\u201C',
	'\u201D',
	'\u2022',
	'\u2013',
	'\u2014',
	'\u02DC',
	'\u2122',
	'\u0161',
	'\u203A',
	'\u0153',
	'\u009D',
	'\u017E',
	'\u0178', // Last entry is 0x9F.
	// 0x00->'\uFFFD' is handled programmatically.
	// 0x0D->'\u000D' is a no-op.
}
52
53	// unescapeEntity reads an entity like "<" from b[src:] and writes the
54	// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
55	// Precondition: b[src] == '&' && dst <= src.
56	// attribute should be true if parsing an attribute value.
57	func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
58	// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
59
60	// i starts at 1 because we already know that s[0] == '&'.
61	i, s := 1, b[src:]
62
63	if len(s) <= 1 {
64	b[dst] = b[src]
65	return dst + 1, src + 1
66	}
67
68	if s[i] == '#' {
69	if len(s) <= 3 { // We need to have at least "&#.".
70	b[dst] = b[src]
71	return dst + 1, src + 1
72	}
73	i++
74	c := s[i]
75	hex := false
76	if c == 'x' \|\| c == 'X' {
77	hex = true
78	i++
79	}
80
81	x := '\x00'
82	for i < len(s) {
83	c = s[i]
84	i++
85	if hex {
86	if '0' <= c && c <= '9' {
87	x = 16*x + rune(c) - '0'
88	continue
89	} else if 'a' <= c && c <= 'f' {
90	x = 16*x + rune(c) - 'a' + 10
91	continue
92	} else if 'A' <= c && c <= 'F' {
93	x = 16*x + rune(c) - 'A' + 10
94	continue
95	}
96	} else if '0' <= c && c <= '9' {
97	x = 10*x + rune(c) - '0'
98	continue
99	}
100	if c != ';' {
101	i--
102	}
103	break
104	}
105
106	if i <= 3 { // No characters matched.
107	b[dst] = b[src]
108	return dst + 1, src + 1
109	}
110
111	if 0x80 <= x && x <= 0x9F {
112	// Replace characters from Windows-1252 with UTF-8 equivalents.
113	x = replacementTable[x-0x80]
114	} else if x == 0 \|\| (0xD800 <= x && x <= 0xDFFF) \|\| x > 0x10FFFF {
115	// Replace invalid characters with the replacement character.
116	x = '\uFFFD'
117	}
118
119	return dst + utf8.EncodeRune(b[dst:], x), src + i
120	}
121
122	// Consume the maximum number of characters possible, with the
123	// consumed characters matching one of the named references.
124
125	for i < len(s) {
126	c := s[i]
127	i++
128	// Lower-cased characters are more common in entities, so we check for them first.
129	if 'a' <= c && c <= 'z' \|\| 'A' <= c && c <= 'Z' \|\| '0' <= c && c <= '9' {
130	continue
131	}
132	if c != ';' {
133	i--
134	}
135	break
136	}
137
138	entityName := string(s[1:i])
139	if entityName == "" {
140	// No-op.
141	} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
142	// No-op.
143	} else if x := entity[entityName]; x != 0 {
144	return dst + utf8.EncodeRune(b[dst:], x), src + i
145	} else if x := entity2[entityName]; x[0] != 0 {
146	dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
147	return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
148	} else if !attribute {
149	maxLen := len(entityName) - 1
150	if maxLen > longestEntityWithoutSemicolon {
151	maxLen = longestEntityWithoutSemicolon
152	}
153	for j := maxLen; j > 1; j-- {
154	if x := entity[entityName[:j]]; x != 0 {
155	return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
156	}
157	}
158	}
159
160	dst1, src1 = dst+i, src+i
161	copy(b[dst:dst1], b[src:src1])
162	return dst1, src1
163	}
164
165	// unescape unescapes b's entities in-place, so that "a<b" becomes "a<b".
166	// attribute should be true if parsing an attribute value.
167	func unescape(b []byte, attribute bool) []byte {
168	for i, c := range b {
169	if c == '&' {
170	dst, src := unescapeEntity(b, i, i, attribute)
171	for src < len(b) {
172	c := b[src]
173	if c == '&' {
174	dst, src = unescapeEntity(b, dst, src, attribute)
175	} else {
176	b[dst] = c
177	dst, src = dst+1, src+1
178	}
179	}
180	return b[0:dst]
181	}
182	}
183	return b
184	}
185
// lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
// Every other byte is left untouched. It returns b.
func lower(b []byte) []byte {
	for i, c := range b {
		if 'A' <= c && c <= 'Z' {
			b[i] = c + 'a' - 'A'
		}
	}
	return b
}
195
196	// escapeComment is like func escape but escapes its input bytes less often.
197	// Per https://github.com/golang/go/issues/58246 some HTML comments are (1)
198	// meaningful and (2) contain angle brackets that we'd like to avoid escaping
199	// unless we have to.
200	//
201	// "We have to" includes the '&' byte, since that introduces other escapes.
202	//
203	// It also includes those bytes (not including EOF) that would otherwise end
204	// the comment. Per the summary table at the bottom of comment_test.go, this is
205	// the '>' byte that, per above, we'd like to avoid escaping unless we have to.
206	//
207	// Studying the summary table (and T actions in its '>' column) closely, we
208	// only need to escape in states 43, 44, 49, 51 and 52. State 43 is at the
209	// start of the comment data. State 52 is after a '!'. The other three states
210	// are after a '-'.
211	//
212	// Our algorithm is thus to escape every '&' and to escape '>' if and only if:
213	// - The '>' is after a '!' or '-' (in the unescaped data) or
214	// - The '>' is at the start of the comment data (after the opening "<!--").
215	func escapeComment(w writer, s string) error {
216	// When modifying this function, consider manually increasing the
217	// maxSuffixLen constant in func TestComments, from 6 to e.g. 9 or more.
218	// That increase should only be temporary, not committed, as it
219	// exponentially affects the test running time.
220
221	if len(s) == 0 {
222	return nil
223	}
224
225	// Loop:
226	// - Grow j such that s[i:j] does not need escaping.
227	// - If s[j] does need escaping, output s[i:j] and an escaped s[j],
228	// resetting i and j to point past that s[j] byte.
229	i := 0
230	for j := 0; j < len(s); j++ {
231	escaped := ""
232	switch s[j] {
233	case '&':
234	escaped = "&"
235
236	case '>':
237	if j > 0 {
238	if prev := s[j-1]; (prev != '!') && (prev != '-') {
239	continue
240	}
241	}
242	escaped = ">"
243
244	default:
245	continue
246	}
247
248	if i < j {
249	if _, err := w.WriteString(s[i:j]); err != nil {
250	return err
251	}
252	}
253	if _, err := w.WriteString(escaped); err != nil {
254	return err
255	}
256	i = j + 1
257	}
258
259	if i < len(s) {
260	if _, err := w.WriteString(s[i:]); err != nil {
261	return err
262	}
263	}
264	return nil
265	}
266
267	// escapeCommentString is to EscapeString as escapeComment is to escape.
268	func escapeCommentString(s string) string {
269	if strings.IndexAny(s, "&>") == -1 {
270	return s
271	}
272	var buf bytes.Buffer
273	escapeComment(&buf, s)
274	return buf.String()
275	}
276
// escapedChars lists the bytes that escape replaces with character
// references: '&', '\'', '<', '>', '"' and carriage return.
const escapedChars = "&'<>\"\r"
278
279	func escape(w writer, s string) error {
280	i := strings.IndexAny(s, escapedChars)
281	for i != -1 {
282	if _, err := w.WriteString(s[:i]); err != nil {
283	return err
284	}
285	var esc string
286	switch s[i] {
287	case '&':
288	esc = "&"
289	case '\'':
290	// "'" is shorter than "'" and apos was not in HTML until HTML5.
291	esc = "'"
292	case '<':
293	esc = "<"
294	case '>':
295	esc = ">"
296	case '"':
297	// """ is shorter than """.
298	esc = """
299	case '\r':
300	esc = " "
301	default:
302	panic("html: unrecognized escape character")
303	}
304	s = s[i+1:]
305	if _, err := w.WriteString(esc); err != nil {
306	return err
307	}
308	i = strings.IndexAny(s, escapedChars)
309	}
310	_, err := w.WriteString(s)
311	return err
312	}
313
314	// EscapeString escapes special characters like "<" to become "<". It
315	// escapes only five such characters: <, >, &, ' and ".
316	// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
317	// always true.
318	func EscapeString(s string) string {
319	if strings.IndexAny(s, escapedChars) == -1 {
320	return s
321	}
322	var buf bytes.Buffer
323	escape(&buf, s)
324	return buf.String()
325	}
326
327	// UnescapeString unescapes entities like "<" to become "<". It unescapes a
328	// larger range of entities than EscapeString escapes. For example, "á"
329	// unescapes to "á", as does "á" and "&xE1;".
330	// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
331	// always true.
332	func UnescapeString(s string) string {
333	for _, c := range s {
334	if c == '&' {
335	return string(unescape([]byte(s), false))
336	}
337	}
338	return s
339	}