summary | refs | log | tree | commit | diff | stats
path: root/vendor/github.com/saintfish/chardet/recognizer.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/saintfish/chardet/recognizer.go')
-rw-r--r-- vendor/github.com/saintfish/chardet/recognizer.go | 83
1 files changed, 83 insertions, 0 deletions
diff --git a/vendor/github.com/saintfish/chardet/recognizer.go b/vendor/github.com/saintfish/chardet/recognizer.go
new file mode 100644
index 00000000..1bf8461c
--- /dev/null
+++ b/vendor/github.com/saintfish/chardet/recognizer.go
@@ -0,0 +1,83 @@
+package chardet
+
+// recognizer is the single-method contract used to test prepared input
+// against a charset (presumably one implementation per supported charset;
+// the implementations are not visible in this file).
+type recognizer interface {
+	// Match examines the prepared input and reports its finding as a
+	// recognizerOutput.
+	Match(input *recognizerInput) recognizerOutput
+}
+
+// recognizerOutput is the value returned by recognizer.Match. It is a distinct
+// named type with the same underlying type as Result, so values convert
+// between the two with a plain type conversion.
+type recognizerOutput Result
+
+// recognizerInput carries everything a recognizer is handed for one detection
+// run. It is built by newRecognizerInput.
+type recognizerInput struct {
+	// raw is the byte slice exactly as supplied by the caller; input is the
+	// bounded (and possibly markup-stripped) copy produced by mayStripInput.
+	raw, input []byte
+	// tagStripped is true only when markup stripping was requested and its
+	// result was kept.
+	tagStripped bool
+	// byteStats holds, for each byte value 0-255, how often it occurs in input.
+	byteStats []int
+	// hasC1Bytes is true when input contains any byte in the range 0x80-0x9F.
+	hasC1Bytes bool
+}
+
+// newRecognizerInput prepares raw for the recognizers.
+//
+// The original slice is kept as-is in raw. The bytes the recognizers should
+// look at come from mayStripInput (which also reports whether markup stripping
+// was kept), and the byte histogram and C1 flag are precomputed from those
+// prepared bytes rather than from raw.
+func newRecognizerInput(raw []byte, stripTag bool) *recognizerInput {
+	prepared := &recognizerInput{raw: raw}
+	prepared.input, prepared.tagStripped = mayStripInput(raw, stripTag)
+	prepared.byteStats = computeByteStats(prepared.input)
+	prepared.hasC1Bytes = computeHasC1Bytes(prepared.byteStats)
+	return prepared
+}
+
+// mayStripInput returns the bytes detection should run on, limited to
+// inputBufferSize (8192) bytes, and reports whether markup stripping was
+// applied to them. The returned slice is always freshly allocated and never
+// aliases raw.
+//
+// When stripTag is true, every byte from a '<' up to and including the next
+// '>' is dropped. A '<' seen while already inside markup counts as a bad tag.
+// Scanning stops as soon as inputBufferSize bytes have been kept, so the tag
+// counts only describe the scanned prefix of raw.
+//
+// The stripped result is discarded, and the first inputBufferSize bytes of raw
+// are returned verbatim with stripped == false, when any of these holds:
+//   - fewer than 5 '<' bytes were seen (this is always the case when stripTag
+//     is false),
+//   - bad tags outnumber one fifth of the opened tags,
+//   - fewer than 100 bytes survived stripping although raw has more than 600.
+func mayStripInput(raw []byte, stripTag bool) (out []byte, stripped bool) {
+	const inputBufferSize = 8192
+
+	// Counters are bounded by len(raw), so int can never overflow here.
+	var openTags, badTags int
+	if stripTag {
+		// Allocate the scratch buffer only when stripping actually runs; the
+		// verbatim path below builds its own exactly-sized slice.
+		out = make([]byte, 0, inputBufferSize)
+		inMarkup := false
+		for _, c := range raw {
+			if c == '<' {
+				if inMarkup {
+					badTags++
+				}
+				inMarkup = true
+				openTags++
+			}
+			if !inMarkup {
+				out = append(out, c)
+				if len(out) >= inputBufferSize {
+					break
+				}
+			}
+			if c == '>' {
+				inMarkup = false
+			}
+		}
+		stripped = true
+	}
+
+	tooFewTags := openTags < 5
+	mostlyBadTags := openTags/5 < badTags
+	strippedAway := len(out) < 100 && len(raw) > 600
+	if tooFewTags || mostlyBadTags || strippedAway {
+		limit := len(raw)
+		if limit > inputBufferSize {
+			limit = inputBufferSize
+		}
+		out = make([]byte, limit)
+		copy(out, raw) // copies exactly limit bytes: len(out) <= len(raw)
+		return out, false
+	}
+	return out, stripped
+}
+
+func computeByteStats(input []byte) []int {
+ r := make([]int, 256)
+ for _, c := range input {
+ r[c] += 1
+ }
+ return r
+}
+
+func computeHasC1Bytes(byteStats []int) bool {
+ for _, count := range byteStats[0x80 : 0x9F+1] {
+ if count > 0 {
+ return true
+ }
+ }
+ return false
+}