summaryrefslogtreecommitdiffstats
path: root/vendor/github.com/saintfish/chardet/2022.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/saintfish/chardet/2022.go')
-rw-r--r--vendor/github.com/saintfish/chardet/2022.go102
1 files changed, 102 insertions, 0 deletions
diff --git a/vendor/github.com/saintfish/chardet/2022.go b/vendor/github.com/saintfish/chardet/2022.go
new file mode 100644
index 00000000..e667225e
--- /dev/null
+++ b/vendor/github.com/saintfish/chardet/2022.go
@@ -0,0 +1,102 @@
+package chardet
+
+import (
+ "bytes"
+)
+
+type recognizer2022 struct {
+ charset string
+ escapes [][]byte
+}
+
+func (r *recognizer2022) Match(input *recognizerInput) (output recognizerOutput) {
+ return recognizerOutput{
+ Charset: r.charset,
+ Confidence: r.matchConfidence(input.input),
+ }
+}
+
+func (r *recognizer2022) matchConfidence(input []byte) int {
+ var hits, misses, shifts int
+input:
+ for i := 0; i < len(input); i++ {
+ c := input[i]
+ if c == 0x1B {
+ for _, esc := range r.escapes {
+ if bytes.HasPrefix(input[i+1:], esc) {
+ hits++
+ i += len(esc)
+ continue input
+ }
+ }
+ misses++
+ } else if c == 0x0E || c == 0x0F {
+ shifts++
+ }
+ }
+ if hits == 0 {
+ return 0
+ }
+ quality := (100*hits - 100*misses) / (hits + misses)
+ if hits+shifts < 5 {
+ quality -= (5 - (hits + shifts)) * 10
+ }
+ if quality < 0 {
+ quality = 0
+ }
+ return quality
+}
+
+var escapeSequences_2022JP = [][]byte{
+ {0x24, 0x28, 0x43}, // KS X 1001:1992
+ {0x24, 0x28, 0x44}, // JIS X 212-1990
+ {0x24, 0x40}, // JIS C 6226-1978
+ {0x24, 0x41}, // GB 2312-80
+ {0x24, 0x42}, // JIS X 208-1983
+ {0x26, 0x40}, // JIS X 208 1990, 1997
+ {0x28, 0x42}, // ASCII
+ {0x28, 0x48}, // JIS-Roman
+ {0x28, 0x49}, // Half-width katakana
+ {0x28, 0x4a}, // JIS-Roman
+ {0x2e, 0x41}, // ISO 8859-1
+ {0x2e, 0x46}, // ISO 8859-7
+}
+
+var escapeSequences_2022KR = [][]byte{
+ {0x24, 0x29, 0x43},
+}
+
+var escapeSequences_2022CN = [][]byte{
+ {0x24, 0x29, 0x41}, // GB 2312-80
+ {0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1
+ {0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2
+ {0x24, 0x29, 0x45}, // ISO-IR-165
+ {0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3
+ {0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4
+ {0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5
+ {0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6
+ {0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7
+ {0x4e}, // SS2
+ {0x4f}, // SS3
+}
+
+func newRecognizer_2022JP() *recognizer2022 {
+ return &recognizer2022{
+ "ISO-2022-JP",
+ escapeSequences_2022JP,
+ }
+}
+
+func newRecognizer_2022KR() *recognizer2022 {
+ return &recognizer2022{
+ "ISO-2022-KR",
+ escapeSequences_2022KR,
+ }
+}
+
+func newRecognizer_2022CN() *recognizer2022 {
+ return &recognizer2022{
+ "ISO-2022-CN",
+ escapeSequences_2022CN,
+ }
+}