summaryrefslogtreecommitdiffstats
path: root/vendor/github.com/paulrosania/go-charset/charset/cp932.go
blob: 9f46262badb54b7a551ef04fe98213dcddc6bfb3 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
package charset

import (
	"fmt"
	"unicode/utf8"
)

// init registers the "cp932" character-set class, with fromCP932 as the
// function that builds its decoding Translator.
// NOTE(review): the third argument is nil; presumably that slot is for an
// encoding (to-CP932) constructor, which this file does not provide —
// confirm against registerClass.
func init() {
	registerClass("cp932", fromCP932, nil)
}

// encoding details
// (Traditional) Shift-JIS
//
// 00..1f	control characters
// 20		space
// 21..7f	JIS X 0201:1976/1997 roman (see notes)
// 80		undefined
// 81..9f	lead byte of JIS X 0208-1983 or JIS X 0202:1990/1997
// a0		undefined
// a1..df	JIS X 0201:1976/1997 katakana
// e0..ea	lead byte of JIS X 0208-1983 or JIS X 0202:1990/1997
// eb..ff	undefined
//
// CP932 (windows-31J)
//
// this encoding scheme extends Shift-JIS in the following way
//
// eb..ec	undefined (marked as lead bytes - see notes below)
// ed..ee	lead byte of NEC-selected IBM extended characters
// ef		undefined (marked as lead byte - see notes below)
// f0..f9	lead byte of User defined GAIJI (see note below)
// fa..fc	lead byte of IBM extended characters
// fd..ff	undefined
//
//
// Notes
//
// JISX 0201:1976/1997 roman
//	this is the same as ASCII but with 0x5c (ASCII code for '\')
//	representing the Yen currency symbol '¥' (U+00a5)
//	This mapping is contentious: some conversion packages implement it,
//	others do not.
//	The mapping files from The Unicode Consortium show cp932 mapping
//	plain ascii in the range 00..7f whereas shift-jis maps 0x5c ('\') to the yen
//	symbol (¥) and 0x7e ('~') to overline (¯)
//
// CP932 double-byte character codes:
//
// eb-ec, ef, f0-f9:
// 	Marked as DBCS LEAD BYTEs in the unicode mapping data
//	obtained from:
//		https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT
//
// 	but there are no defined mappings for codes in this range.
// 	It is not clear whether or not an implementation should
// 	consume one or two bytes before emitting an error char.

// Geometry of the mapping data files loaded by fromCP932. Each file is a
// sequence of fixed-size pages of runes; jisGetMap checks that a file holds
// exactly pages*pageSize runes.
const (
	// JIS X 0201 katakana: a single page of 63 runes that fromCP932
	// copies into the single-byte table starting at byte 0xa1
	// (i.e. 0xa1..0xdf).
	kanaPages    = 1
	kanaPageSize = 63
	kanaChar0    = 0xa1

	// CP932 double-byte area: one page per lead byte, each page indexed
	// by (trail byte - cp932Char0).
	cp932Pages    = 45  // 81..84, 87..9f, e0..ea, ed..ee, fa..fc
	cp932PageSize = 189 // 40..fc (including 7f)
	cp932Char0    = 0x40
)

// jisTables holds the lookup tables used to decode CP932 / Shift-JIS.
type jisTables struct {
	// page0 maps each possible first byte to its rune; the value -1 marks
	// the byte as a double-byte (DBCS) lead byte.
	page0 [256]rune
	// dbcsoff maps a DBCS lead byte to the index of its page within cp932.
	// Entries for bytes that are not lead bytes are never read by Translate.
	dbcsoff [256]int
	// cp932 is the flattened double-byte table: consecutive pages of
	// cp932PageSize runes, indexed as page*cp932PageSize + (trail - cp932Char0).
	cp932 []rune
}

// translateFromCP932 decodes CP932 / Shift-JIS input using a set of
// jisTables built by fromCP932.
type translateFromCP932 struct {
	// tables are the decoding tables; Translate only reads them.
	tables *jisTables
	// scratch is the output buffer; Translate truncates it and refills it
	// on every call, then returns it.
	scratch []byte
}

// Translate decodes as much of data as it can and returns the number of
// input bytes consumed together with the decoded output. The error result
// is always nil.
//
// Each input byte is first looked up in tables.page0. A value other than -1
// is emitted directly and consumes one byte. A value of -1 marks a DBCS lead
// byte: the following trail byte selects an entry in the lead byte's page of
// tables.cp932 and both bytes are consumed. A trail byte outside
// cp932Char0..cp932Char0+cp932PageSize-1 yields utf8.RuneError (still
// consuming both bytes).
//
// If data ends with a lead byte and eof is false, that byte is left
// unconsumed (it is not counted in the returned length) so that it can be
// presented again together with its trail byte. If eof is true no trail byte
// can follow, so the lead byte is consumed and utf8.RuneError is emitted;
// otherwise the final byte could never be drained from the input.
//
// The returned slice is p.scratch and is overwritten by the next call.
func (p *translateFromCP932) Translate(data []byte, eof bool) (int, []byte, error) {
	tables := p.tables
	p.scratch = p.scratch[:0]
	n := 0
	for i := 0; i < len(data); i++ {
		b := data[i]
		r := tables.page0[b]
		if r != -1 {
			// Single-byte character.
			p.scratch = appendRune(p.scratch, r)
			n++
			continue
		}

		// b is a DBCS lead byte; its trail byte should follow.
		i++
		if i >= len(data) {
			if eof {
				// Truncated double-byte sequence at end of input:
				// report it and consume the dangling lead byte.
				p.scratch = appendRune(p.scratch, utf8.RuneError)
				n++
			}
			break
		}

		pnum := tables.dbcsoff[b]
		ix := int(data[i]) - cp932Char0
		r = utf8.RuneError
		if pnum != -1 && ix >= 0 && ix < cp932PageSize {
			r = tables.cp932[pnum*cp932PageSize+ix]
		}
		p.scratch = appendRune(p.scratch, r)
		n += 2
	}
	return n, p.scratch, nil
}

// cp932Key is the key type fromCP932 passes to cache for its tables; the
// value is true for the "shiftjis" variant and false for plain CP932.
type cp932Key bool

// fromCP932 returns a Translator that decodes CP932 (windows-31J) input.
// When arg is exactly "shiftjis" the tables are built for traditional
// Shift-JIS instead: 0x5c and 0x7e map to the yen sign and overline, and the
// CP932 extension lead bytes (ed..ee, fa..fc) are not enabled.
//
// The tables are obtained through cache, keyed by variant; the builder
// function loads "jisx0201kana.dat" and "cp932.dat" via jisGetMap and any
// load error is returned to the caller.
// NOTE(review): cache presumably memoises the builder's result per key —
// confirm against its definition.
//
// Single bytes with no defined mapping (for example 0x80, 0xa0 and
// 0xfd..0xff) decode to utf8.RuneError.
func fromCP932(arg string) (Translator, error) {
	shiftJIS := arg == "shiftjis"
	tables, err := cache(cp932Key(shiftJIS), func() (interface{}, error) {
		tables := new(jisTables)
		kana, err := jisGetMap("jisx0201kana.dat", kanaPageSize, kanaPages)
		if err != nil {
			return nil, err
		}
		tables.cp932, err = jisGetMap("cp932.dat", cp932PageSize, cp932Pages)
		if err != nil {
			return nil, err
		}

		// Start with every byte undefined so that anything not explicitly
		// mapped below decodes to the error rune rather than to the
		// array's zero value (U+0000).
		for i := range tables.page0 {
			tables.page0[i] = utf8.RuneError
		}

		// jisx0201kana is mapped into 0xA1..0xDF
		for i := 0; i < kanaPageSize; i++ {
			tables.page0[i+kanaChar0] = kana[i]
		}

		// 00..7f same as ascii in cp932 (0x7f inclusive)
		for i := rune(0); i <= 0x7f; i++ {
			tables.page0[i] = i
		}

		if shiftJIS {
			// shift-jis uses JIS X 0201 for the ASCII range
			// this is the same as ASCII apart from
			// 0x5c ('\') maps to yen symbol (¥) and 0x7e ('~') maps to overline (¯)
			tables.page0['\\'] = '¥'
			tables.page0['~'] = '¯'
		}

		// Mark DBCS lead bytes in page0 and pre-calculate, for each one,
		// the number of its page in the mapping file. Pages are stored in
		// the file in the same order in which the ranges are marked here,
		// so pnum simply counts up across calls.
		pnum := 0
		markLeadBytes := func(first, last int) {
			for b := first; b <= last; b++ {
				tables.page0[b] = -1
				tables.dbcsoff[b] = pnum
				pnum++
			}
		}
		markLeadBytes(0x81, 0x84)
		markLeadBytes(0x87, 0x9f)
		markLeadBytes(0xe0, 0xea)
		if !shiftJIS {
			// add in cp932 extensions
			markLeadBytes(0xed, 0xee)
			markLeadBytes(0xfa, 0xfc)
		}
		return tables, nil
	})

	if err != nil {
		return nil, err
	}

	return &translateFromCP932{tables: tables.(*jisTables)}, nil
}

// jisGetMap loads the mapping file called name via readFile, converts its
// contents to a slice of runes and verifies that it holds exactly
// pgsize*npages of them. A read failure is returned unchanged; a size
// mismatch is reported as an "incorrect length data" error naming the file.
func jisGetMap(name string, pgsize, npages int) ([]rune, error) {
	raw, err := readFile(name)
	if err != nil {
		return nil, err
	}

	table := []rune(string(raw))
	if want := pgsize * npages; len(table) != want {
		return nil, fmt.Errorf("%q: incorrect length data", name)
	}
	return table, nil
}