summaryrefslogtreecommitdiffstats
path: root/vendor/github.com/saintfish/chardet/recognizer.go
blob: 1bf8461c3ebf6cd73ac91a6a26fcbc25ad38ce34 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
package chardet

// recognizer is satisfied by any type that can inspect a prepared
// recognizerInput and report its verdict as a recognizerOutput.
// NOTE(review): implementations live outside this file; presumably there is
// one per supported charset — confirm against the rest of the package.
type recognizer interface {
	// Match examines input and returns the resulting recognizerOutput.
	Match(input *recognizerInput) recognizerOutput
}

// recognizerOutput is the value returned by recognizer.Match. It is a
// distinct named type with the same underlying type as Result (declared
// elsewhere in the package), so converting between the two is an explicit,
// zero-cost conversion.
type recognizerOutput Result

// recognizerInput carries the data handed to recognizer.Match. It is
// populated by newRecognizerInput:
//
//   - raw is the caller's slice, stored without copying.
//   - input is the buffer returned by mayStripInput: at most 8192 bytes,
//     either raw with markup tags removed or a copy of raw's leading bytes.
//   - tagStripped reports whether input is the tag-stripped form.
//   - byteStats is a 256-entry histogram of the byte values in input.
//   - hasC1Bytes reports whether any byte in 0x80-0x9F occurs in input.
type recognizerInput struct {
	raw         []byte
	input       []byte
	tagStripped bool
	byteStats   []int
	hasC1Bytes  bool
}

// newRecognizerInput builds the recognizerInput for raw.
//
// The detection buffer comes from mayStripInput (which removes markup tags
// when stripTag is set and the data looks like markup); the byte histogram
// and the C1 flag are then derived from that buffer, not from raw. raw itself
// is retained by reference.
func newRecognizerInput(raw []byte, stripTag bool) *recognizerInput {
	in := &recognizerInput{raw: raw}
	in.input, in.tagStripped = mayStripInput(raw, stripTag)
	in.byteStats = computeByteStats(in.input)
	in.hasC1Bytes = computeHasC1Bytes(in.byteStats)
	return in
}

// mayStripInput prepares the detection buffer from raw, optionally removing
// markup tags first.
//
// When stripTag is true, every byte from a '<' through the next '>'
// (inclusive) is dropped and the remaining bytes are collected, stopping once
// inputBufferSize bytes have been kept. That stripped text is returned with
// stripped == true only if the data looks like genuine markup:
//
//   - at least 5 '<' bytes were seen,
//   - "bad" tags (a '<' seen while already inside a tag) number at most
//     openTags/5, and
//   - the result is not suspiciously short (fewer than 100 bytes kept out of
//     more than 600 raw bytes).
//
// Otherwise, and always when stripTag is false, out is a fresh copy of the
// first min(len(raw), inputBufferSize) bytes of raw and stripped is false.
// In either case out is non-nil and never aliases raw.
func mayStripInput(raw []byte, stripTag bool) (out []byte, stripped bool) {
	const inputBufferSize = 8192

	// Neither result can be longer than this, so it is both the fallback
	// length and an exact capacity for the stripping buffer.
	limit := len(raw)
	if limit > inputBufferSize {
		limit = inputBufferSize
	}

	// Only pay for the stripping pass (and its buffer) when it was requested.
	if stripTag {
		var badTags, openTags int
		inMarkup := false
		text := make([]byte, 0, limit)
		for _, c := range raw {
			if c == '<' {
				if inMarkup {
					// A second '<' before the closing '>' marks a malformed tag.
					badTags++
				}
				inMarkup = true
				openTags++
			}
			if !inMarkup {
				text = append(text, c)
				if len(text) >= inputBufferSize {
					break
				}
			}
			// Checked after the append so the closing '>' itself is dropped,
			// while a stray '>' outside any tag is kept.
			if c == '>' {
				inMarkup = false
			}
		}

		looksLikeMarkup := openTags >= 5 && openTags/5 >= badTags
		tooShort := len(text) < 100 && len(raw) > 600
		if looksLikeMarkup && !tooShort {
			return text, true
		}
	}

	// Stripping was not requested or was judged unreliable: use the leading
	// bytes of raw unchanged.
	out = make([]byte, limit)
	copy(out, raw)
	return out, false
}

// computeByteStats returns a 256-entry histogram of input: element b holds
// the number of times byte value b occurs. The result always has length 256,
// including for empty or nil input.
func computeByteStats(input []byte) []int {
	var histogram [256]int
	for i := 0; i < len(input); i++ {
		histogram[input[i]]++
	}
	return histogram[:]
}

// computeHasC1Bytes reports whether byteStats records at least one occurrence
// of a byte value in 0x80-0x9F (the range this function's name calls "C1").
//
// byteStats is indexed by byte value and must be sliceable up to 0xA0;
// computeByteStats always supplies 256 entries.
func computeHasC1Bytes(byteStats []int) bool {
	const (
		c1First = 0x80
		c1Last  = 0x9F
	)
	c1Counts := byteStats[c1First : c1Last+1]
	for i := range c1Counts {
		if c1Counts[i] > 0 {
			return true
		}
	}
	return false
}