rss-tools/vendor/golang.org/x/text/encoding/unicode/unicode.go (view raw)
| 1 | // Copyright 2013 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | // Package unicode provides Unicode encodings such as UTF-16. |
| 6 | package unicode // import "golang.org/x/text/encoding/unicode" |
| 7 | |
| 8 | import ( |
| 9 | "bytes" |
| 10 | "errors" |
| 11 | "unicode/utf16" |
| 12 | "unicode/utf8" |
| 13 | |
| 14 | "golang.org/x/text/encoding" |
| 15 | "golang.org/x/text/encoding/internal" |
| 16 | "golang.org/x/text/encoding/internal/identifier" |
| 17 | "golang.org/x/text/internal/utf8internal" |
| 18 | "golang.org/x/text/runes" |
| 19 | "golang.org/x/text/transform" |
| 20 | ) |
| 21 | |
| 22 | // TODO: I think the Transformers really should return errors on unpaired |
| 23 | // surrogates and odd numbers of bytes. This is not required by RFC 2781, |
| 24 | // which leaves it open, but is suggested by WHATWG. It will allow for all error |
| 25 | // modes as defined by WHATWG: fatal, HTML and Replacement. This would require |
| 26 | // the introduction of some kind of error type for conveying the erroneous code |
| 27 | // point. |
| 28 | |
| 29 | // UTF8 is the UTF-8 encoding. It neither removes nor adds byte order marks. |
| 30 | var UTF8 encoding.Encoding = utf8enc |
| 31 | |
| 32 | // UTF8BOM is an UTF-8 encoding where the decoder strips a leading byte order |
| 33 | // mark while the encoder adds one. |
| 34 | // |
| 35 | // Some editors add a byte order mark as a signature to UTF-8 files. Although |
| 36 | // the byte order mark is not useful for detecting byte order in UTF-8, it is |
| 37 | // sometimes used as a convention to mark UTF-8-encoded files. This relies on |
| 38 | // the observation that the UTF-8 byte order mark is either an illegal or at |
| 39 | // least very unlikely sequence in any other character encoding. |
| 40 | var UTF8BOM encoding.Encoding = utf8bomEncoding{} |
| 41 | |
| 42 | type utf8bomEncoding struct{} |
| 43 | |
| 44 | func (utf8bomEncoding) String() string { |
| 45 | return "UTF-8-BOM" |
| 46 | } |
| 47 | |
| 48 | func (utf8bomEncoding) ID() (identifier.MIB, string) { |
| 49 | return identifier.Unofficial, "x-utf8bom" |
| 50 | } |
| 51 | |
| 52 | func (utf8bomEncoding) NewEncoder() *encoding.Encoder { |
| 53 | return &encoding.Encoder{ |
| 54 | Transformer: &utf8bomEncoder{t: runes.ReplaceIllFormed()}, |
| 55 | } |
| 56 | } |
| 57 | |
| 58 | func (utf8bomEncoding) NewDecoder() *encoding.Decoder { |
| 59 | return &encoding.Decoder{Transformer: &utf8bomDecoder{}} |
| 60 | } |
| 61 | |
| 62 | var utf8enc = &internal.Encoding{ |
| 63 | Encoding: &internal.SimpleEncoding{Decoder: utf8Decoder{}, Encoder: runes.ReplaceIllFormed()}, |
| 64 | Name: "UTF-8", |
| 65 | MIB: identifier.UTF8, |
| 66 | } |
| 67 | |
// utf8bomDecoder is the decoding transformer used by UTF8BOM.
type utf8bomDecoder struct {
	// checked is set once the start of the input has been inspected for a
	// byte order mark.
	checked bool
}

// Reset makes the decoder inspect the start of the next input for a byte
// order mark again.
func (t *utf8bomDecoder) Reset() { t.checked = false }
| 75 | |
| 76 | func (t *utf8bomDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
| 77 | if !t.checked { |
| 78 | if !atEOF && len(src) < len(utf8BOM) { |
| 79 | if len(src) == 0 { |
| 80 | return 0, 0, nil |
| 81 | } |
| 82 | return 0, 0, transform.ErrShortSrc |
| 83 | } |
| 84 | if bytes.HasPrefix(src, []byte(utf8BOM)) { |
| 85 | nSrc += len(utf8BOM) |
| 86 | src = src[len(utf8BOM):] |
| 87 | } |
| 88 | t.checked = true |
| 89 | } |
| 90 | nDst, n, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF) |
| 91 | nSrc += n |
| 92 | return nDst, nSrc, err |
| 93 | } |
| 94 | |
| 95 | type utf8bomEncoder struct { |
| 96 | written bool |
| 97 | t transform.Transformer |
| 98 | } |
| 99 | |
| 100 | func (t *utf8bomEncoder) Reset() { |
| 101 | t.written = false |
| 102 | t.t.Reset() |
| 103 | } |
| 104 | |
| 105 | func (t *utf8bomEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
| 106 | if !t.written { |
| 107 | if len(dst) < len(utf8BOM) { |
| 108 | return nDst, 0, transform.ErrShortDst |
| 109 | } |
| 110 | nDst = copy(dst, utf8BOM) |
| 111 | t.written = true |
| 112 | } |
| 113 | n, nSrc, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF) |
| 114 | nDst += n |
| 115 | return nDst, nSrc, err |
| 116 | } |
| 117 | |
| 118 | type utf8Decoder struct{ transform.NopResetter } |
| 119 | |
| 120 | func (utf8Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
| 121 | var pSrc int // point from which to start copy in src |
| 122 | var accept utf8internal.AcceptRange |
| 123 | |
| 124 | // The decoder can only make the input larger, not smaller. |
| 125 | n := len(src) |
| 126 | if len(dst) < n { |
| 127 | err = transform.ErrShortDst |
| 128 | n = len(dst) |
| 129 | atEOF = false |
| 130 | } |
| 131 | for nSrc < n { |
| 132 | c := src[nSrc] |
| 133 | if c < utf8.RuneSelf { |
| 134 | nSrc++ |
| 135 | continue |
| 136 | } |
| 137 | first := utf8internal.First[c] |
| 138 | size := int(first & utf8internal.SizeMask) |
| 139 | if first == utf8internal.FirstInvalid { |
| 140 | goto handleInvalid // invalid starter byte |
| 141 | } |
| 142 | accept = utf8internal.AcceptRanges[first>>utf8internal.AcceptShift] |
| 143 | if nSrc+size > n { |
| 144 | if !atEOF { |
| 145 | // We may stop earlier than necessary here if the short sequence |
| 146 | // has invalid bytes. Not checking for this simplifies the code |
| 147 | // and may avoid duplicate computations in certain conditions. |
| 148 | if err == nil { |
| 149 | err = transform.ErrShortSrc |
| 150 | } |
| 151 | break |
| 152 | } |
| 153 | // Determine the maximal subpart of an ill-formed subsequence. |
| 154 | switch { |
| 155 | case nSrc+1 >= n || src[nSrc+1] < accept.Lo || accept.Hi < src[nSrc+1]: |
| 156 | size = 1 |
| 157 | case nSrc+2 >= n || src[nSrc+2] < utf8internal.LoCB || utf8internal.HiCB < src[nSrc+2]: |
| 158 | size = 2 |
| 159 | default: |
| 160 | size = 3 // As we are short, the maximum is 3. |
| 161 | } |
| 162 | goto handleInvalid |
| 163 | } |
| 164 | if c = src[nSrc+1]; c < accept.Lo || accept.Hi < c { |
| 165 | size = 1 |
| 166 | goto handleInvalid // invalid continuation byte |
| 167 | } else if size == 2 { |
| 168 | } else if c = src[nSrc+2]; c < utf8internal.LoCB || utf8internal.HiCB < c { |
| 169 | size = 2 |
| 170 | goto handleInvalid // invalid continuation byte |
| 171 | } else if size == 3 { |
| 172 | } else if c = src[nSrc+3]; c < utf8internal.LoCB || utf8internal.HiCB < c { |
| 173 | size = 3 |
| 174 | goto handleInvalid // invalid continuation byte |
| 175 | } |
| 176 | nSrc += size |
| 177 | continue |
| 178 | |
| 179 | handleInvalid: |
| 180 | // Copy the scanned input so far. |
| 181 | nDst += copy(dst[nDst:], src[pSrc:nSrc]) |
| 182 | |
| 183 | // Append RuneError to the destination. |
| 184 | const runeError = "\ufffd" |
| 185 | if nDst+len(runeError) > len(dst) { |
| 186 | return nDst, nSrc, transform.ErrShortDst |
| 187 | } |
| 188 | nDst += copy(dst[nDst:], runeError) |
| 189 | |
| 190 | // Skip the maximal subpart of an ill-formed subsequence according to |
| 191 | // the W3C standard way instead of the Go way. This Transform is |
| 192 | // probably the only place in the text repo where it is warranted. |
| 193 | nSrc += size |
| 194 | pSrc = nSrc |
| 195 | |
| 196 | // Recompute the maximum source length. |
| 197 | if sz := len(dst) - nDst; sz < len(src)-nSrc { |
| 198 | err = transform.ErrShortDst |
| 199 | n = nSrc + sz |
| 200 | atEOF = false |
| 201 | } |
| 202 | } |
| 203 | return nDst + copy(dst[nDst:], src[pSrc:nSrc]), nSrc, err |
| 204 | } |
| 205 | |
| 206 | // UTF16 returns a UTF-16 Encoding for the given default endianness and byte |
| 207 | // order mark (BOM) policy. |
| 208 | // |
| 209 | // When decoding from UTF-16 to UTF-8, if the BOMPolicy is IgnoreBOM then |
| 210 | // neither BOMs U+FEFF nor noncharacters U+FFFE in the input stream will affect |
| 211 | // the endianness used for decoding, and will instead be output as their |
| 212 | // standard UTF-8 encodings: "\xef\xbb\xbf" and "\xef\xbf\xbe". If the BOMPolicy |
| 213 | // is UseBOM or ExpectBOM a staring BOM is not written to the UTF-8 output. |
| 214 | // Instead, it overrides the default endianness e for the remainder of the |
| 215 | // transformation. Any subsequent BOMs U+FEFF or noncharacters U+FFFE will not |
| 216 | // affect the endianness used, and will instead be output as their standard |
| 217 | // UTF-8 encodings. For UseBOM, if there is no starting BOM, it will proceed |
| 218 | // with the default Endianness. For ExpectBOM, in that case, the transformation |
| 219 | // will return early with an ErrMissingBOM error. |
| 220 | // |
| 221 | // When encoding from UTF-8 to UTF-16, a BOM will be inserted at the start of |
| 222 | // the output if the BOMPolicy is UseBOM or ExpectBOM. Otherwise, a BOM will not |
| 223 | // be inserted. The UTF-8 input does not need to contain a BOM. |
| 224 | // |
| 225 | // There is no concept of a 'native' endianness. If the UTF-16 data is produced |
| 226 | // and consumed in a greater context that implies a certain endianness, use |
| 227 | // IgnoreBOM. Otherwise, use ExpectBOM and always produce and consume a BOM. |
| 228 | // |
| 229 | // In the language of https://www.unicode.org/faq/utf_bom.html#bom10, IgnoreBOM |
| 230 | // corresponds to "Where the precise type of the data stream is known... the |
| 231 | // BOM should not be used" and ExpectBOM corresponds to "A particular |
| 232 | // protocol... may require use of the BOM". |
| 233 | func UTF16(e Endianness, b BOMPolicy) encoding.Encoding { |
| 234 | return utf16Encoding{config{e, b}, mibValue[e][b&bomMask]} |
| 235 | } |
| 236 | |
| 237 | // mibValue maps Endianness and BOMPolicy settings to MIB constants. Note that |
| 238 | // some configurations map to the same MIB identifier. RFC 2781 has requirements |
| 239 | // and recommendations. Some of the "configurations" are merely recommendations, |
| 240 | // so multiple configurations could match. |
| 241 | var mibValue = map[Endianness][numBOMValues]identifier.MIB{ |
| 242 | BigEndian: [numBOMValues]identifier.MIB{ |
| 243 | IgnoreBOM: identifier.UTF16BE, |
| 244 | UseBOM: identifier.UTF16, // BigEnding default is preferred by RFC 2781. |
| 245 | // TODO: acceptBOM | strictBOM would map to UTF16BE as well. |
| 246 | }, |
| 247 | LittleEndian: [numBOMValues]identifier.MIB{ |
| 248 | IgnoreBOM: identifier.UTF16LE, |
| 249 | UseBOM: identifier.UTF16, // LittleEndian default is allowed and preferred on Windows. |
| 250 | // TODO: acceptBOM | strictBOM would map to UTF16LE as well. |
| 251 | }, |
| 252 | // ExpectBOM is not widely used and has no valid MIB identifier. |
| 253 | } |
| 254 | |
| 255 | // All lists a configuration for each IANA-defined UTF-16 variant. |
| 256 | var All = []encoding.Encoding{ |
| 257 | UTF8, |
| 258 | UTF16(BigEndian, UseBOM), |
| 259 | UTF16(BigEndian, IgnoreBOM), |
| 260 | UTF16(LittleEndian, IgnoreBOM), |
| 261 | } |
| 262 | |
// BOMPolicy is a UTF-16 encoding's byte order mark policy.
type BOMPolicy uint8

// Individual policy bits. The exported policies below are combinations of
// them.
const (
	writeBOM   BOMPolicy = 1 << iota // 0x01: the encoder emits a BOM
	acceptBOM                        // 0x02: the decoder honors a leading BOM
	requireBOM                       // 0x04: the decoder fails without a leading BOM

	// bomMask selects all policy bits (0x07).
	bomMask = writeBOM | acceptBOM | requireBOM
)

// HACK: numBOMValues == 8 triggers a bug in the 1.4 compiler (cannot have a
// map of an array of length 8 of a type that is also used as a key or value
// in another map). See golang.org/issue/11354.
// TODO: consider changing this value back to 8 if the use of 1.4.* has
// been minimized.
const numBOMValues = 8 + 1

const (
	// IgnoreBOM means to ignore any byte order marks.
	//
	// Common and RFC 2781-compliant interpretation for UTF-16BE/LE.
	IgnoreBOM BOMPolicy = 0

	// UseBOM means that the UTF-16 form may start with a byte order mark,
	// which will be used to override the default encoding.
	//
	// Common and RFC 2781-compliant interpretation for UTF-16.
	UseBOM BOMPolicy = writeBOM | acceptBOM

	// ExpectBOM means that the UTF-16 form must start with a byte order mark,
	// which will be used to override the default encoding.
	//
	// Used in Java as Unicode (not to be confused with Java's UTF-16) and
	// ICU's UTF-16,version=1. Not compliant with RFC 2781.
	ExpectBOM BOMPolicy = writeBOM | acceptBOM | requireBOM

	// TODO (maybe): strictBOM: BOM must match Endianness. This would allow:
	// - UTF-16(B|L)E,version=1: writeBOM | acceptBOM | requireBOM | strictBOM
	//   (UnicodeBig and UnicodeLittle in Java)
	// - RFC 2781-compliant, but less common interpretation for UTF-16(B|L)E:
	//   acceptBOM | strictBOM (e.g. assigned to CheckBOM).
	// This addition would be consistent with supporting ExpectBOM.
)
| 301 | |
// Endianness is a UTF-16 encoding's default endianness. Its zero value is
// BigEndian.
type Endianness bool

const (
	// BigEndian is UTF-16BE.
	BigEndian Endianness = false
	// LittleEndian is UTF-16LE.
	LittleEndian Endianness = !BigEndian
)
| 311 | |
// ErrMissingBOM is returned when UTF-16 input is decoded with ExpectBOM and
// does not begin with a byte order mark.
var ErrMissingBOM = errors.New("encoding: missing byte order mark")
| 315 | |
| 316 | type utf16Encoding struct { |
| 317 | config |
| 318 | mib identifier.MIB |
| 319 | } |
| 320 | |
| 321 | type config struct { |
| 322 | endianness Endianness |
| 323 | bomPolicy BOMPolicy |
| 324 | } |
| 325 | |
| 326 | func (u utf16Encoding) NewDecoder() *encoding.Decoder { |
| 327 | return &encoding.Decoder{Transformer: &utf16Decoder{ |
| 328 | initial: u.config, |
| 329 | current: u.config, |
| 330 | }} |
| 331 | } |
| 332 | |
| 333 | func (u utf16Encoding) NewEncoder() *encoding.Encoder { |
| 334 | return &encoding.Encoder{Transformer: &utf16Encoder{ |
| 335 | endianness: u.endianness, |
| 336 | initialBOMPolicy: u.bomPolicy, |
| 337 | currentBOMPolicy: u.bomPolicy, |
| 338 | }} |
| 339 | } |
| 340 | |
| 341 | func (u utf16Encoding) ID() (mib identifier.MIB, other string) { |
| 342 | return u.mib, "" |
| 343 | } |
| 344 | |
| 345 | func (u utf16Encoding) String() string { |
| 346 | e, b := "B", "" |
| 347 | if u.endianness == LittleEndian { |
| 348 | e = "L" |
| 349 | } |
| 350 | switch u.bomPolicy { |
| 351 | case ExpectBOM: |
| 352 | b = "Expect" |
| 353 | case UseBOM: |
| 354 | b = "Use" |
| 355 | case IgnoreBOM: |
| 356 | b = "Ignore" |
| 357 | } |
| 358 | return "UTF-16" + e + "E (" + b + " BOM)" |
| 359 | } |
| 360 | |
| 361 | type utf16Decoder struct { |
| 362 | initial config |
| 363 | current config |
| 364 | } |
| 365 | |
| 366 | func (u *utf16Decoder) Reset() { |
| 367 | u.current = u.initial |
| 368 | } |
| 369 | |
| 370 | func (u *utf16Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
| 371 | if len(src) < 2 && atEOF && u.current.bomPolicy&requireBOM != 0 { |
| 372 | return 0, 0, ErrMissingBOM |
| 373 | } |
| 374 | if len(src) == 0 { |
| 375 | return 0, 0, nil |
| 376 | } |
| 377 | if len(src) >= 2 && u.current.bomPolicy&acceptBOM != 0 { |
| 378 | switch { |
| 379 | case src[0] == 0xfe && src[1] == 0xff: |
| 380 | u.current.endianness = BigEndian |
| 381 | nSrc = 2 |
| 382 | case src[0] == 0xff && src[1] == 0xfe: |
| 383 | u.current.endianness = LittleEndian |
| 384 | nSrc = 2 |
| 385 | default: |
| 386 | if u.current.bomPolicy&requireBOM != 0 { |
| 387 | return 0, 0, ErrMissingBOM |
| 388 | } |
| 389 | } |
| 390 | u.current.bomPolicy = IgnoreBOM |
| 391 | } |
| 392 | |
| 393 | var r rune |
| 394 | var dSize, sSize int |
| 395 | for nSrc < len(src) { |
| 396 | if nSrc+1 < len(src) { |
| 397 | x := uint16(src[nSrc+0])<<8 | uint16(src[nSrc+1]) |
| 398 | if u.current.endianness == LittleEndian { |
| 399 | x = x>>8 | x<<8 |
| 400 | } |
| 401 | r, sSize = rune(x), 2 |
| 402 | if utf16.IsSurrogate(r) { |
| 403 | if nSrc+3 < len(src) { |
| 404 | x = uint16(src[nSrc+2])<<8 | uint16(src[nSrc+3]) |
| 405 | if u.current.endianness == LittleEndian { |
| 406 | x = x>>8 | x<<8 |
| 407 | } |
| 408 | // Save for next iteration if it is not a high surrogate. |
| 409 | if isHighSurrogate(rune(x)) { |
| 410 | r, sSize = utf16.DecodeRune(r, rune(x)), 4 |
| 411 | } |
| 412 | } else if !atEOF { |
| 413 | err = transform.ErrShortSrc |
| 414 | break |
| 415 | } |
| 416 | } |
| 417 | if dSize = utf8.RuneLen(r); dSize < 0 { |
| 418 | r, dSize = utf8.RuneError, 3 |
| 419 | } |
| 420 | } else if atEOF { |
| 421 | // Single trailing byte. |
| 422 | r, dSize, sSize = utf8.RuneError, 3, 1 |
| 423 | } else { |
| 424 | err = transform.ErrShortSrc |
| 425 | break |
| 426 | } |
| 427 | if nDst+dSize > len(dst) { |
| 428 | err = transform.ErrShortDst |
| 429 | break |
| 430 | } |
| 431 | nDst += utf8.EncodeRune(dst[nDst:], r) |
| 432 | nSrc += sSize |
| 433 | } |
| 434 | return nDst, nSrc, err |
| 435 | } |
| 436 | |
// isHighSurrogate reports whether r lies in the range 0xDC00-0xDFFF.
//
// NOTE(review): despite the name, this is the range Unicode calls low
// (trailing) surrogates, i.e. the second code unit of a surrogate pair.
func isHighSurrogate(r rune) bool {
	const (
		first = 0xDC00
		last  = 0xDFFF
	)
	return r >= first && r <= last
}
| 440 | |
| 441 | type utf16Encoder struct { |
| 442 | endianness Endianness |
| 443 | initialBOMPolicy BOMPolicy |
| 444 | currentBOMPolicy BOMPolicy |
| 445 | } |
| 446 | |
| 447 | func (u *utf16Encoder) Reset() { |
| 448 | u.currentBOMPolicy = u.initialBOMPolicy |
| 449 | } |
| 450 | |
| 451 | func (u *utf16Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
| 452 | if u.currentBOMPolicy&writeBOM != 0 { |
| 453 | if len(dst) < 2 { |
| 454 | return 0, 0, transform.ErrShortDst |
| 455 | } |
| 456 | dst[0], dst[1] = 0xfe, 0xff |
| 457 | u.currentBOMPolicy = IgnoreBOM |
| 458 | nDst = 2 |
| 459 | } |
| 460 | |
| 461 | r, size := rune(0), 0 |
| 462 | for nSrc < len(src) { |
| 463 | r = rune(src[nSrc]) |
| 464 | |
| 465 | // Decode a 1-byte rune. |
| 466 | if r < utf8.RuneSelf { |
| 467 | size = 1 |
| 468 | |
| 469 | } else { |
| 470 | // Decode a multi-byte rune. |
| 471 | r, size = utf8.DecodeRune(src[nSrc:]) |
| 472 | if size == 1 { |
| 473 | // All valid runes of size 1 (those below utf8.RuneSelf) were |
| 474 | // handled above. We have invalid UTF-8 or we haven't seen the |
| 475 | // full character yet. |
| 476 | if !atEOF && !utf8.FullRune(src[nSrc:]) { |
| 477 | err = transform.ErrShortSrc |
| 478 | break |
| 479 | } |
| 480 | } |
| 481 | } |
| 482 | |
| 483 | if r <= 0xffff { |
| 484 | if nDst+2 > len(dst) { |
| 485 | err = transform.ErrShortDst |
| 486 | break |
| 487 | } |
| 488 | dst[nDst+0] = uint8(r >> 8) |
| 489 | dst[nDst+1] = uint8(r) |
| 490 | nDst += 2 |
| 491 | } else { |
| 492 | if nDst+4 > len(dst) { |
| 493 | err = transform.ErrShortDst |
| 494 | break |
| 495 | } |
| 496 | r1, r2 := utf16.EncodeRune(r) |
| 497 | dst[nDst+0] = uint8(r1 >> 8) |
| 498 | dst[nDst+1] = uint8(r1) |
| 499 | dst[nDst+2] = uint8(r2 >> 8) |
| 500 | dst[nDst+3] = uint8(r2) |
| 501 | nDst += 4 |
| 502 | } |
| 503 | nSrc += size |
| 504 | } |
| 505 | |
| 506 | if u.endianness == LittleEndian { |
| 507 | for i := 0; i < nDst; i += 2 { |
| 508 | dst[i], dst[i+1] = dst[i+1], dst[i] |
| 509 | } |
| 510 | } |
| 511 | return nDst, nSrc, err |
| 512 | } |