summaryrefslogtreecommitdiffstats
path: root/vendor/github.com/paulrosania/go-charset/charset/cp932.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/paulrosania/go-charset/charset/cp932.go')
-rw-r--r--vendor/github.com/paulrosania/go-charset/charset/cp932.go195
1 files changed, 195 insertions, 0 deletions
diff --git a/vendor/github.com/paulrosania/go-charset/charset/cp932.go b/vendor/github.com/paulrosania/go-charset/charset/cp932.go
new file mode 100644
index 00000000..9f46262b
--- /dev/null
+++ b/vendor/github.com/paulrosania/go-charset/charset/cp932.go
@@ -0,0 +1,195 @@
+package charset
+
+import (
+ "fmt"
+ "unicode/utf8"
+)
+
+func init() {
+ registerClass("cp932", fromCP932, nil)
+}
+
+// encoding details
+// (Traditional) Shift-JIS
+//
+// 00..1f control characters
+// 20 space
+// 21..7f JIS X 0201:1976/1997 roman (see notes)
+// 80 undefined
+// 81..9f lead byte of JIS X 0208-1983 or JIS X 0202:1990/1997
+// a0 undefined
+// a1..df JIS X 0201:1976/1997 katakana
+// e0..ea lead byte of JIS X 0208-1983 or JIS X 0202:1990/1997
+// eb..ff undefined
+//
+// CP932 (windows-31J)
+//
+// this encoding scheme extends Shift-JIS in the following way
+//
+// eb..ec undefined (marked as lead bytes - see notes below)
+// ed..ee lead byte of NEC-selected IBM extended characters
+// ef undefined (marked as lead byte - see notes below)
+// f0..f9 lead byte of User defined GAIJI (see note below)
+// fa..fc lead byte of IBM extended characters
+// fd..ff undefined
+//
+//
+// Notes
+//
+// JISX 0201:1976/1997 roman
+// this is the same as ASCII but with 0x5c (ASCII code for '\')
+// representing the Yen currency symbol '¥' (U+00a5)
+// This mapping is contentious: some conversion packages implement it,
+// others do not.
+// The mapping files from The Unicode Consortium show cp932 mapping
+// plain ascii in the range 00..7f whereas shift-jis maps 0x5c ('\') to the yen
+// symbol (¥) and 0x7e ('~') to overline (¯)
+//
+// CP932 double-byte character codes:
+//
+// eb-ec, ef, f0-f9:
+// Marked as DBCS LEAD BYTEs in the unicode mapping data
+// obtained from:
+// https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT
+//
+// but there are no defined mappings for codes in this range.
+// It is not clear whether or not an implementation should
+// consume one or two bytes before emitting an error char.
+
// Geometry of the single-byte JIS X 0201 katakana table, which is mapped
// into the byte range 0xA1..0xDF.
const (
	kanaPages    = 1    // number of pages in the kana mapping data
	kanaPageSize = 63   // entries per page (a1..df)
	kanaChar0    = 0xa1 // byte value of the first kana entry
)

// Geometry of the CP932 double-byte table: one page per lead byte, one
// entry per trail byte.
const (
	cp932Pages    = 45   // lead bytes 81..84, 87..9f, e0..ea, ed..ee, fa..fc
	cp932PageSize = 189  // trail bytes 40..fc (including 7f)
	cp932Char0    = 0x40 // trail byte value of the first entry on a page
)
+
// jisTables holds the lookup tables used to decode CP932 / Shift-JIS.
type jisTables struct {
	// page0 is indexed by an input byte. It holds the rune for a
	// single-byte character, or -1 when the byte is the lead byte of a
	// double-byte character.
	page0 [256]rune
	// dbcsoff is indexed by a lead byte and gives the number of that lead
	// byte's page within cp932.
	dbcsoff [256]int
	// cp932 is the double-byte mapping data: consecutive pages of
	// cp932PageSize runes each, indexed within a page by the trail byte
	// minus cp932Char0.
	cp932 []rune
}
+
+type translateFromCP932 struct {
+ tables *jisTables
+ scratch []byte
+}
+
+func (p *translateFromCP932) Translate(data []byte, eof bool) (int, []byte, error) {
+ tables := p.tables
+ p.scratch = p.scratch[:0]
+ n := 0
+ for i := 0; i < len(data); i++ {
+ b := data[i]
+ r := tables.page0[b]
+ if r != -1 {
+ p.scratch = appendRune(p.scratch, r)
+ n++
+ continue
+ }
+ // DBCS
+ i++
+ if i >= len(data) {
+ break
+ }
+ pnum := tables.dbcsoff[b]
+ ix := int(data[i]) - cp932Char0
+ if pnum == -1 || ix < 0 || ix >= cp932PageSize {
+ r = utf8.RuneError
+ } else {
+ r = tables.cp932[pnum*cp932PageSize+ix]
+ }
+ p.scratch = appendRune(p.scratch, r)
+ n += 2
+ }
+ return n, p.scratch, nil
+}
+
+type cp932Key bool
+
+func fromCP932(arg string) (Translator, error) {
+ shiftJIS := arg == "shiftjis"
+ tables, err := cache(cp932Key(shiftJIS), func() (interface{}, error) {
+ tables := new(jisTables)
+ kana, err := jisGetMap("jisx0201kana.dat", kanaPageSize, kanaPages)
+ if err != nil {
+ return nil, err
+ }
+ tables.cp932, err = jisGetMap("cp932.dat", cp932PageSize, cp932Pages)
+ if err != nil {
+ return nil, err
+ }
+
+ // jisx0201kana is mapped into 0xA1..0xDF
+ for i := 0; i < kanaPageSize; i++ {
+ tables.page0[i+kanaChar0] = kana[i]
+ }
+
+ // 00..7f same as ascii in cp932
+ for i := rune(0); i < 0x7f; i++ {
+ tables.page0[i] = i
+ }
+
+ if shiftJIS {
+ // shift-jis uses JIS X 0201 for the ASCII range
+ // this is the same as ASCII apart from
+ // 0x5c ('\') maps to yen symbol (¥) and 0x7e ('~') maps to overline (¯)
+ tables.page0['\\'] = '¥'
+ tables.page0['~'] = '¯'
+ }
+
+ // pre-calculate DBCS page numbers to mapping file page numbers
+ // and mark codes in page0 that are DBCS lead bytes
+ pnum := 0
+ for i := 0x81; i <= 0x84; i++ {
+ tables.page0[i] = -1
+ tables.dbcsoff[i] = pnum
+ pnum++
+ }
+ for i := 0x87; i <= 0x9f; i++ {
+ tables.page0[i] = -1
+ tables.dbcsoff[i] = pnum
+ pnum++
+ }
+ for i := 0xe0; i <= 0xea; i++ {
+ tables.page0[i] = -1
+ tables.dbcsoff[i] = pnum
+ pnum++
+ }
+ if shiftJIS {
+ return tables, nil
+ }
+ // add in cp932 extensions
+ for i := 0xed; i <= 0xee; i++ {
+ tables.page0[i] = -1
+ tables.dbcsoff[i] = pnum
+ pnum++
+ }
+ for i := 0xfa; i <= 0xfc; i++ {
+ tables.page0[i] = -1
+ tables.dbcsoff[i] = pnum
+ pnum++
+ }
+ return tables, nil
+ })
+
+ if err != nil {
+ return nil, err
+ }
+
+ return &translateFromCP932{tables: tables.(*jisTables)}, nil
+}
+
+func jisGetMap(name string, pgsize, npages int) ([]rune, error) {
+ data, err := readFile(name)
+ if err != nil {
+ return nil, err
+ }
+ m := []rune(string(data))
+ if len(m) != pgsize*npages {
+ return nil, fmt.Errorf("%q: incorrect length data", name)
+ }
+ return m, nil
+}