summaryrefslogtreecommitdiffstats
path: root/vendor/github.com/saintfish/chardet/unicode.go
blob: 6f9fa9e67fd13e6f76c34ef93b50680f31d074c5 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
package chardet

import (
	"bytes"
)

var (
	utf16beBom = []byte{0xFE, 0xFF}
	utf16leBom = []byte{0xFF, 0xFE}
	utf32beBom = []byte{0x00, 0x00, 0xFE, 0xFF}
	utf32leBom = []byte{0xFF, 0xFE, 0x00, 0x00}
)

type recognizerUtf16be struct {
}

func newRecognizer_utf16be() *recognizerUtf16be {
	return &recognizerUtf16be{}
}

func (*recognizerUtf16be) Match(input *recognizerInput) (output recognizerOutput) {
	output = recognizerOutput{
		Charset: "UTF-16BE",
	}
	if bytes.HasPrefix(input.raw, utf16beBom) {
		output.Confidence = 100
	}
	return
}

type recognizerUtf16le struct {
}

func newRecognizer_utf16le() *recognizerUtf16le {
	return &recognizerUtf16le{}
}

func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput) {
	output = recognizerOutput{
		Charset: "UTF-16LE",
	}
	if bytes.HasPrefix(input.raw, utf16leBom) && !bytes.HasPrefix(input.raw, utf32leBom) {
		output.Confidence = 100
	}
	return
}

type recognizerUtf32 struct {
	name       string
	bom        []byte
	decodeChar func(input []byte) uint32
}

func decodeUtf32be(input []byte) uint32 {
	return uint32(input[0])<<24 | uint32(input[1])<<16 | uint32(input[2])<<8 | uint32(input[3])
}

func decodeUtf32le(input []byte) uint32 {
	return uint32(input[3])<<24 | uint32(input[2])<<16 | uint32(input[1])<<8 | uint32(input[0])
}

func newRecognizer_utf32be() *recognizerUtf32 {
	return &recognizerUtf32{
		"UTF-32BE",
		utf32beBom,
		decodeUtf32be,
	}
}

func newRecognizer_utf32le() *recognizerUtf32 {
	return &recognizerUtf32{
		"UTF-32LE",
		utf32leBom,
		decodeUtf32le,
	}
}

func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput) {
	output = recognizerOutput{
		Charset: r.name,
	}
	hasBom := bytes.HasPrefix(input.raw, r.bom)
	var numValid, numInvalid uint32
	for b := input.raw; len(b) >= 4; b = b[4:] {
		if c := r.decodeChar(b); c >= 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF) {
			numInvalid++
		} else {
			numValid++
		}
	}
	if hasBom && numInvalid == 0 {
		output.Confidence = 100
	} else if hasBom && numValid > numInvalid*10 {
		output.Confidence = 80
	} else if numValid > 3 && numInvalid == 0 {
		output.Confidence = 100
	} else if numValid > 0 && numInvalid == 0 {
		output.Confidence = 80
	} else if numValid > numInvalid*10 {
		output.Confidence = 25
	}
	return
}