diff options
author | Wim <wim@42.be> | 2017-07-07 23:34:05 +0200 |
---|---|---|
committer | Wim <wim@42.be> | 2017-07-07 23:34:05 +0200 |
commit | a0938d93869904ebf6d9938485c248b976150fac (patch) | |
tree | a12fad5acdceeec93a28efb600ca62b9fdfb40a5 /vendor/github.com/paulrosania/go-charset/charset/cp932.go | |
parent | 2338c69d402ad3779f4e2a2f38ac800ceca656b9 (diff) | |
download | matterbridge-msglm-a0938d93869904ebf6d9938485c248b976150fac.tar.gz matterbridge-msglm-a0938d93869904ebf6d9938485c248b976150fac.tar.bz2 matterbridge-msglm-a0938d93869904ebf6d9938485c248b976150fac.zip |
Add go-charset and chardet to vendor
Diffstat (limited to 'vendor/github.com/paulrosania/go-charset/charset/cp932.go')
-rw-r--r-- | vendor/github.com/paulrosania/go-charset/charset/cp932.go | 195 |
1 files changed, 195 insertions, 0 deletions
diff --git a/vendor/github.com/paulrosania/go-charset/charset/cp932.go b/vendor/github.com/paulrosania/go-charset/charset/cp932.go new file mode 100644 index 00000000..9f46262b --- /dev/null +++ b/vendor/github.com/paulrosania/go-charset/charset/cp932.go @@ -0,0 +1,195 @@ +package charset + +import ( + "fmt" + "unicode/utf8" +) + +func init() { + registerClass("cp932", fromCP932, nil) +} + +// encoding details +// (Traditional) Shift-JIS +// +// 00..1f control characters +// 20 space +// 21..7f JIS X 0201:1976/1997 roman (see notes) +// 80 undefined +// 81..9f lead byte of JIS X 0208-1983 or JIS X 0202:1990/1997 +// a0 undefined +// a1..df JIS X 0201:1976/1997 katakana +// e0..ea lead byte of JIS X 0208-1983 or JIS X 0202:1990/1997 +// eb..ff undefined +// +// CP932 (windows-31J) +// +// this encoding scheme extends Shift-JIS in the following way +// +// eb..ec undefined (marked as lead bytes - see notes below) +// ed..ee lead byte of NEC-selected IBM extended characters +// ef undefined (marked as lead byte - see notes below) +// f0..f9 lead byte of User defined GAIJI (see note below) +// fa..fc lead byte of IBM extended characters +// fd..ff undefined +// +// +// Notes +// +// JISX 0201:1976/1997 roman +// this is the same as ASCII but with 0x5c (ASCII code for '\') +// representing the Yen currency symbol '¥' (U+00a5) +// This mapping is contentious, some conversion packages implent it +// others do not. +// The mapping files from The Unicode Consortium show cp932 mapping +// plain ascii in the range 00..7f whereas shift-jis maps 0x5c ('\') to the yen +// symbol (¥) and 0x7e ('~') to overline (¯) +// +// CP932 double-byte character codes: +// +// eb-ec, ef, f0-f9: +// Marked as DBCS LEAD BYTEs in the unicode mapping data +// obtained from: +// https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT +// +// but there are no defined mappings for codes in this range. +// It is not clear whether or not an implementation should +// consume one or two bytes before emitting an error char. + +const ( + kanaPages = 1 + kanaPageSize = 63 + kanaChar0 = 0xa1 + + cp932Pages = 45 // 81..84, 87..9f, e0..ea, ed..ee, fa..fc + cp932PageSize = 189 // 40..fc (including 7f) + cp932Char0 = 0x40 +) + +type jisTables struct { + page0 [256]rune + dbcsoff [256]int + cp932 []rune +} + +type translateFromCP932 struct { + tables *jisTables + scratch []byte +} + +func (p *translateFromCP932) Translate(data []byte, eof bool) (int, []byte, error) { + tables := p.tables + p.scratch = p.scratch[:0] + n := 0 + for i := 0; i < len(data); i++ { + b := data[i] + r := tables.page0[b] + if r != -1 { + p.scratch = appendRune(p.scratch, r) + n++ + continue + } + // DBCS + i++ + if i >= len(data) { + break + } + pnum := tables.dbcsoff[b] + ix := int(data[i]) - cp932Char0 + if pnum == -1 || ix < 0 || ix >= cp932PageSize { + r = utf8.RuneError + } else { + r = tables.cp932[pnum*cp932PageSize+ix] + } + p.scratch = appendRune(p.scratch, r) + n += 2 + } + return n, p.scratch, nil +} + +type cp932Key bool + +func fromCP932(arg string) (Translator, error) { + shiftJIS := arg == "shiftjis" + tables, err := cache(cp932Key(shiftJIS), func() (interface{}, error) { + tables := new(jisTables) + kana, err := jisGetMap("jisx0201kana.dat", kanaPageSize, kanaPages) + if err != nil { + return nil, err + } + tables.cp932, err = jisGetMap("cp932.dat", cp932PageSize, cp932Pages) + if err != nil { + return nil, err + } + + // jisx0201kana is mapped into 0xA1..0xDF + for i := 0; i < kanaPageSize; i++ { + tables.page0[i+kanaChar0] = kana[i] + } + + // 00..7f same as ascii in cp932 + for i := rune(0); i < 0x7f; i++ { + tables.page0[i] = i + } + + if shiftJIS { + // shift-jis uses JIS X 0201 for the ASCII range + // this is the same as ASCII apart from + // 0x5c ('\') maps to yen symbol (¥) and 0x7e ('~') maps to overline (¯) + tables.page0['\\'] = '¥' + tables.page0['~'] = '¯' + } + + // pre-calculate DBCS page numbers to mapping file page numbers + // and mark codes in page0 that are DBCS lead bytes + pnum := 0 + for i := 0x81; i <= 0x84; i++ { + tables.page0[i] = -1 + tables.dbcsoff[i] = pnum + pnum++ + } + for i := 0x87; i <= 0x9f; i++ { + tables.page0[i] = -1 + tables.dbcsoff[i] = pnum + pnum++ + } + for i := 0xe0; i <= 0xea; i++ { + tables.page0[i] = -1 + tables.dbcsoff[i] = pnum + pnum++ + } + if shiftJIS { + return tables, nil + } + // add in cp932 extensions + for i := 0xed; i <= 0xee; i++ { + tables.page0[i] = -1 + tables.dbcsoff[i] = pnum + pnum++ + } + for i := 0xfa; i <= 0xfc; i++ { + tables.page0[i] = -1 + tables.dbcsoff[i] = pnum + pnum++ + } + return tables, nil + }) + + if err != nil { + return nil, err + } + + return &translateFromCP932{tables: tables.(*jisTables)}, nil +} + +func jisGetMap(name string, pgsize, npages int) ([]rune, error) { + data, err := readFile(name) + if err != nil { + return nil, err + } + m := []rune(string(data)) + if len(m) != pgsize*npages { + return nil, fmt.Errorf("%q: incorrect length data", name) + } + return m, nil +} |