rss-tools/vendor/golang.org/x/text/encoding/charmap/charmap.go (view raw)
| 1 | // Copyright 2013 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | //go:generate go run maketables.go |
| 6 | |
| 7 | // Package charmap provides simple character encodings such as IBM Code Page 437 |
| 8 | // and Windows 1252. |
| 9 | package charmap // import "golang.org/x/text/encoding/charmap" |
| 10 | |
| 11 | import ( |
| 12 | "unicode/utf8" |
| 13 | |
| 14 | "golang.org/x/text/encoding" |
| 15 | "golang.org/x/text/encoding/internal" |
| 16 | "golang.org/x/text/encoding/internal/identifier" |
| 17 | "golang.org/x/text/transform" |
| 18 | ) |
| 19 | |
| 20 | // These encodings vary only in the way clients should interpret them. Their |
| 21 | // coded character set is identical and a single implementation can be shared. |
| 22 | var ( |
| 23 | // ISO8859_6E is the ISO 8859-6E encoding. |
| 24 | ISO8859_6E encoding.Encoding = &iso8859_6E |
| 25 | |
| 26 | // ISO8859_6I is the ISO 8859-6I encoding. |
| 27 | ISO8859_6I encoding.Encoding = &iso8859_6I |
| 28 | |
| 29 | // ISO8859_8E is the ISO 8859-8E encoding. |
| 30 | ISO8859_8E encoding.Encoding = &iso8859_8E |
| 31 | |
| 32 | // ISO8859_8I is the ISO 8859-8I encoding. |
| 33 | ISO8859_8I encoding.Encoding = &iso8859_8I |
| 34 | |
| 35 | iso8859_6E = internal.Encoding{ |
| 36 | Encoding: ISO8859_6, |
| 37 | Name: "ISO-8859-6E", |
| 38 | MIB: identifier.ISO88596E, |
| 39 | } |
| 40 | |
| 41 | iso8859_6I = internal.Encoding{ |
| 42 | Encoding: ISO8859_6, |
| 43 | Name: "ISO-8859-6I", |
| 44 | MIB: identifier.ISO88596I, |
| 45 | } |
| 46 | |
| 47 | iso8859_8E = internal.Encoding{ |
| 48 | Encoding: ISO8859_8, |
| 49 | Name: "ISO-8859-8E", |
| 50 | MIB: identifier.ISO88598E, |
| 51 | } |
| 52 | |
| 53 | iso8859_8I = internal.Encoding{ |
| 54 | Encoding: ISO8859_8, |
| 55 | Name: "ISO-8859-8I", |
| 56 | MIB: identifier.ISO88598I, |
| 57 | } |
| 58 | ) |
| 59 | |
// All is a list of all defined encodings in this package.
//
// It is assigned directly from the package-level listAll slice, so both share
// the same backing array: a write through All is visible through listAll.
var All []encoding.Encoding = listAll
| 62 | |
| 63 | // TODO: implement these encodings, in order of importance. |
| 64 | // ASCII, ISO8859_1: Rather common. Close to Windows 1252. |
| 65 | // ISO8859_9: Close to Windows 1254. |
| 66 | |
// utf8Enc holds a rune's UTF-8 encoding in data[:len].
type utf8Enc struct {
	// len is the number of leading bytes of data that are in use.
	len uint8
	// data stores the encoded bytes; only the first len of them are copied
	// out when decoding.
	data [3]byte
}
| 72 | |
// Charmap is an 8-bit character set encoding.
//
// It carries one lookup table per direction: decode is indexed by the encoded
// byte, and encode is searched by rune.
type Charmap struct {
	// name is the encoding's name.
	name string
	// mib is the encoding type of this encoder.
	mib identifier.MIB
	// asciiSuperset states whether the encoding is a superset of ASCII.
	asciiSuperset bool
	// low is the lower bound of the encoded byte for a non-ASCII rune. If
	// Charmap.asciiSuperset is true then this will be 0x80, otherwise 0x00.
	// The encoding paths use it as the first index of their binary search
	// over encode.
	low uint8
	// replacement is the encoded replacement character.
	replacement byte
	// decode is the map from encoded byte to UTF-8.
	decode [256]utf8Enc
	// encode is the map from runes to encoded bytes. Each entry is a
	// uint32: the high 8 bits are the encoded byte and the low 24 bits are
	// the rune. The table entries are sorted by ascending rune.
	encode [256]uint32
}
| 93 | |
| 94 | // NewDecoder implements the encoding.Encoding interface. |
| 95 | func (m *Charmap) NewDecoder() *encoding.Decoder { |
| 96 | return &encoding.Decoder{Transformer: charmapDecoder{charmap: m}} |
| 97 | } |
| 98 | |
| 99 | // NewEncoder implements the encoding.Encoding interface. |
| 100 | func (m *Charmap) NewEncoder() *encoding.Encoder { |
| 101 | return &encoding.Encoder{Transformer: charmapEncoder{charmap: m}} |
| 102 | } |
| 103 | |
// String returns the Charmap's name.
func (m *Charmap) String() string {
	return m.name
}
| 108 | |
// ID implements an internal interface. It reports the Charmap's MIB
// identifier; the second result, other, is always the empty string.
func (m *Charmap) ID() (mib identifier.MIB, other string) {
	return m.mib, ""
}
| 113 | |
// charmapDecoder implements transform.Transformer by decoding to UTF-8.
type charmapDecoder struct {
	// NOTE(review): NopResetter presumably supplies a no-op Reset, which fits
	// a Transform that keeps no state between calls — confirm against the
	// transform package.
	transform.NopResetter
	// charmap supplies the decode table consulted by Transform.
	charmap *Charmap
}
| 119 | |
| 120 | func (m charmapDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
| 121 | for i, c := range src { |
| 122 | if m.charmap.asciiSuperset && c < utf8.RuneSelf { |
| 123 | if nDst >= len(dst) { |
| 124 | err = transform.ErrShortDst |
| 125 | break |
| 126 | } |
| 127 | dst[nDst] = c |
| 128 | nDst++ |
| 129 | nSrc = i + 1 |
| 130 | continue |
| 131 | } |
| 132 | |
| 133 | decode := &m.charmap.decode[c] |
| 134 | n := int(decode.len) |
| 135 | if nDst+n > len(dst) { |
| 136 | err = transform.ErrShortDst |
| 137 | break |
| 138 | } |
| 139 | // It's 15% faster to avoid calling copy for these tiny slices. |
| 140 | for j := 0; j < n; j++ { |
| 141 | dst[nDst] = decode.data[j] |
| 142 | nDst++ |
| 143 | } |
| 144 | nSrc = i + 1 |
| 145 | } |
| 146 | return nDst, nSrc, err |
| 147 | } |
| 148 | |
| 149 | // DecodeByte returns the Charmap's rune decoding of the byte b. |
| 150 | func (m *Charmap) DecodeByte(b byte) rune { |
| 151 | switch x := &m.decode[b]; x.len { |
| 152 | case 1: |
| 153 | return rune(x.data[0]) |
| 154 | case 2: |
| 155 | return rune(x.data[0]&0x1f)<<6 | rune(x.data[1]&0x3f) |
| 156 | default: |
| 157 | return rune(x.data[0]&0x0f)<<12 | rune(x.data[1]&0x3f)<<6 | rune(x.data[2]&0x3f) |
| 158 | } |
| 159 | } |
| 160 | |
// charmapEncoder implements transform.Transformer by encoding from UTF-8.
type charmapEncoder struct {
	// NOTE(review): NopResetter presumably supplies a no-op Reset, which fits
	// a Transform that keeps no state between calls — confirm against the
	// transform package.
	transform.NopResetter
	// charmap supplies the encode table and replacement byte used by
	// Transform.
	charmap *Charmap
}
| 166 | |
| 167 | func (m charmapEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
| 168 | r, size := rune(0), 0 |
| 169 | loop: |
| 170 | for nSrc < len(src) { |
| 171 | if nDst >= len(dst) { |
| 172 | err = transform.ErrShortDst |
| 173 | break |
| 174 | } |
| 175 | r = rune(src[nSrc]) |
| 176 | |
| 177 | // Decode a 1-byte rune. |
| 178 | if r < utf8.RuneSelf { |
| 179 | if m.charmap.asciiSuperset { |
| 180 | nSrc++ |
| 181 | dst[nDst] = uint8(r) |
| 182 | nDst++ |
| 183 | continue |
| 184 | } |
| 185 | size = 1 |
| 186 | |
| 187 | } else { |
| 188 | // Decode a multi-byte rune. |
| 189 | r, size = utf8.DecodeRune(src[nSrc:]) |
| 190 | if size == 1 { |
| 191 | // All valid runes of size 1 (those below utf8.RuneSelf) were |
| 192 | // handled above. We have invalid UTF-8 or we haven't seen the |
| 193 | // full character yet. |
| 194 | if !atEOF && !utf8.FullRune(src[nSrc:]) { |
| 195 | err = transform.ErrShortSrc |
| 196 | } else { |
| 197 | err = internal.RepertoireError(m.charmap.replacement) |
| 198 | } |
| 199 | break |
| 200 | } |
| 201 | } |
| 202 | |
| 203 | // Binary search in [low, high) for that rune in the m.charmap.encode table. |
| 204 | for low, high := int(m.charmap.low), 0x100; ; { |
| 205 | if low >= high { |
| 206 | err = internal.RepertoireError(m.charmap.replacement) |
| 207 | break loop |
| 208 | } |
| 209 | mid := (low + high) / 2 |
| 210 | got := m.charmap.encode[mid] |
| 211 | gotRune := rune(got & (1<<24 - 1)) |
| 212 | if gotRune < r { |
| 213 | low = mid + 1 |
| 214 | } else if gotRune > r { |
| 215 | high = mid |
| 216 | } else { |
| 217 | dst[nDst] = byte(got >> 24) |
| 218 | nDst++ |
| 219 | break |
| 220 | } |
| 221 | } |
| 222 | nSrc += size |
| 223 | } |
| 224 | return nDst, nSrc, err |
| 225 | } |
| 226 | |
| 227 | // EncodeRune returns the Charmap's byte encoding of the rune r. ok is whether |
| 228 | // r is in the Charmap's repertoire. If not, b is set to the Charmap's |
| 229 | // replacement byte. This is often the ASCII substitute character '\x1a'. |
| 230 | func (m *Charmap) EncodeRune(r rune) (b byte, ok bool) { |
| 231 | if r < utf8.RuneSelf && m.asciiSuperset { |
| 232 | return byte(r), true |
| 233 | } |
| 234 | for low, high := int(m.low), 0x100; ; { |
| 235 | if low >= high { |
| 236 | return m.replacement, false |
| 237 | } |
| 238 | mid := (low + high) / 2 |
| 239 | got := m.encode[mid] |
| 240 | gotRune := rune(got & (1<<24 - 1)) |
| 241 | if gotRune < r { |
| 242 | low = mid + 1 |
| 243 | } else if gotRune > r { |
| 244 | high = mid |
| 245 | } else { |
| 246 | return byte(got >> 24), true |
| 247 | } |
| 248 | } |
| 249 | } |