summaryrefslogtreecommitdiffstats
path: root/vendor/github.com/saintfish/chardet/utf8.go
blob: ae036ad9bcf5d2f84535743d211b1ded32f9419a (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
package chardet

import (
	"bytes"
)

// utf8Bom is the three-byte UTF-8 byte order mark (the UTF-8 encoding of
// U+FEFF) that may prefix UTF-8 encoded data.
var utf8Bom = []byte("\xEF\xBB\xBF")

// recognizerUtf8 is the recognizer for UTF-8 encoded input. It carries no
// state.
type recognizerUtf8 struct{}

// newRecognizer_utf8 returns a new UTF-8 recognizer.
func newRecognizer_utf8() *recognizerUtf8 {
	return new(recognizerUtf8)
}

// Match scores how likely input.raw is to be UTF-8 encoded text.
//
// It scans the raw bytes and counts well-formed multi-byte sequences (a lead
// byte followed by exactly the number of continuation bytes it announces) and
// malformed ones. The counts, together with the presence of a leading byte
// order mark, are mapped to output.Confidence; the first matching row wins:
//
//	BOM and no invalid sequences          -> 100
//	BOM and valid > 10 * invalid          -> 80
//	more than 3 valid, no invalid         -> 100
//	at least 1 valid, no invalid          -> 80
//	no multi-byte sequences (plain ASCII) -> 10
//	valid > 10 * invalid                  -> 25
//	otherwise                             -> 0
//
// output.Charset is always "UTF-8".
//
// Each malformed sequence is counted once: a byte that cannot start a
// sequence is skipped on its own, and a byte that breaks a sequence is
// examined again as a possible start of the next one. A sequence cut short
// by the end of the input counts as neither valid nor invalid. Only the
// lead/continuation bit patterns are checked; overlong forms, surrogates and
// lead bytes 0xF5-0xF7 are not rejected.
func (*recognizerUtf8) Match(input *recognizerInput) (output recognizerOutput) {
	output = recognizerOutput{
		Charset: "UTF-8",
	}
	raw := input.raw
	hasBom := bytes.HasPrefix(raw, utf8Bom)

	var numValid, numInvalid int
scan:
	for i := 0; i < len(raw); {
		c := raw[i]
		i++
		if c&0x80 == 0 {
			continue // ASCII
		}

		// The lead byte announces how many continuation bytes must follow.
		var trailBytes int
		switch {
		case c&0xE0 == 0xC0:
			trailBytes = 1
		case c&0xF0 == 0xE0:
			trailBytes = 2
		case c&0xF8 == 0xF0:
			trailBytes = 3
		default:
			// A continuation byte with no lead (10xxxxxx) or a byte of the
			// form 11111xxx. Count it and move on without touching the bytes
			// that follow.
			numInvalid++
			if numInvalid > 5 {
				// Stop scanning; the counts gathered so far decide the score.
				break scan
			}
			continue
		}

		// Consume continuation bytes. A byte that is not a continuation byte
		// is left in place so the outer loop can treat it as ASCII or as the
		// lead byte of the next sequence.
		for ; trailBytes > 0 && i < len(raw); trailBytes-- {
			if raw[i]&0xC0 != 0x80 {
				break
			}
			i++
		}
		switch {
		case trailBytes == 0:
			numValid++
		case i < len(raw):
			// Stopped on a non-continuation byte before the sequence ended.
			numInvalid++
		}
		// Otherwise the input ended mid-sequence: count nothing.
	}

	switch {
	case hasBom && numInvalid == 0:
		output.Confidence = 100
	case hasBom && numValid > numInvalid*10:
		output.Confidence = 80
	case numValid > 3 && numInvalid == 0:
		output.Confidence = 100
	case numValid > 0 && numInvalid == 0:
		output.Confidence = 80
	case numValid == 0 && numInvalid == 0:
		// Plain ASCII
		output.Confidence = 10
	case numValid > numInvalid*10:
		output.Confidence = 25
	}
	return output
}