// Repository web-view header (navigation: summary, refs, log, blame, commit, diff, stats).
// path: root/vendor/github.com/saintfish/chardet/recognizer.go
// blob: 1bf8461c3ebf6cd73ac91a6a26fcbc25ad38ce34 (plain) (tree)

















































































                                                                                       
package chardet

// recognizer is the single-method contract for anything that can inspect a
// prepared recognizerInput and produce a recognizerOutput.
type recognizer interface {
	// Match examines the prepared input and returns a recognizerOutput.
	// NOTE(review): presumably each implementation tests one charset and
	// reports how well the input fits it — confirm against implementations.
	Match(*recognizerInput) recognizerOutput
}

// recognizerOutput is the value returned by recognizer.Match. It is a
// distinct named type sharing its underlying type with Result, which is
// declared outside this section of the file.
type recognizerOutput Result

// recognizerInput bundles the bytes under inspection with statistics that are
// computed once by newRecognizerInput and then handed to recognizer.Match.
type recognizerInput struct {
	raw         []byte // bytes exactly as handed to newRecognizerInput
	input       []byte // bytes returned by mayStripInput: a bounded, possibly markup-stripped copy of raw
	tagStripped bool   // true when input is the markup-stripped form rather than a plain prefix of raw
	byteStats   []int  // occurrence count of each byte value in input, as built by computeByteStats
	hasC1Bytes  bool   // true when input contains a byte in 0x80..0x9F, per computeHasC1Bytes
}

// newRecognizerInput builds the recognizerInput for raw.
//
// The bytes to analyse come from mayStripInput (which honours stripTag and
// bounds the length); the byte histogram and the C1 flag are then derived
// from those analysed bytes, not from raw itself. raw is stored as given,
// without copying.
func newRecognizerInput(raw []byte, stripTag bool) *recognizerInput {
	in := &recognizerInput{raw: raw}
	in.input, in.tagStripped = mayStripInput(raw, stripTag)
	in.byteStats = computeByteStats(in.input)
	in.hasC1Bytes = computeHasC1Bytes(in.byteStats)
	return in
}

// mayStripInput returns the bytes that should be analysed for raw, capped at
// 8192 bytes, together with a flag telling whether markup was removed.
//
// When stripTag is true, every byte from a '<' up to and including the next
// '>' is dropped and the remaining bytes are collected. That stripped result
// is discarded in favour of a plain copy of the first 8192 bytes of raw when
// any of the following holds:
//   - fewer than 5 '<' were seen,
//   - the number of '<' seen while already inside a tag exceeds one fifth
//     (integer division) of all '<' seen,
//   - fewer than 100 bytes survived although raw is longer than 600 bytes.
//
// When stripTag is false no '<' is counted, so the plain copy is always
// returned. The returned slice never aliases raw.
func mayStripInput(raw []byte, stripTag bool) (out []byte, stripped bool) {
	const inputBufferSize = 8192

	var (
		openTags int32 // every '<' encountered
		badTags  int32 // '<' encountered before the previous tag was closed
		inMarkup bool
	)

	out = make([]byte, 0, inputBufferSize)
	if stripTag {
		stripped = true
		// Stop as soon as the output buffer is full; later bytes are never
		// looked at, so they do not contribute to the tag counters either.
		for i := 0; i < len(raw) && len(out) < inputBufferSize; i++ {
			switch b := raw[i]; b {
			case '<':
				if inMarkup {
					badTags++
				}
				openTags++
				inMarkup = true
			case '>':
				// A '>' outside of any tag is ordinary text and is kept.
				if !inMarkup {
					out = append(out, b)
				}
				inMarkup = false
			default:
				if !inMarkup {
					out = append(out, b)
				}
			}
		}
	}

	tooFewTags := openTags < 5
	tooManyBadTags := openTags/5 < badTags
	mostlyMarkup := len(out) < 100 && len(raw) > 600
	if tooFewTags || tooManyBadTags || mostlyMarkup {
		// The input does not look like well-formed markup (or stripping was
		// not requested): fall back to an untouched prefix of raw.
		n := len(raw)
		if n > inputBufferSize {
			n = inputBufferSize
		}
		out = append(make([]byte, 0, n), raw[:n]...)
		stripped = false
	}
	return out, stripped
}

// computeByteStats returns a 256-entry histogram of input: element b holds
// the number of times the byte value b occurs. A nil or empty input yields
// 256 zeros.
func computeByteStats(input []byte) []int {
	histogram := make([]int, 256)
	for i := 0; i < len(input); i++ {
		histogram[input[i]]++
	}
	return histogram
}

// computeHasC1Bytes reports whether byteStats records at least one occurrence
// of a byte value in the inclusive range 0x80..0x9F. byteStats is indexed by
// byte value, as produced by computeByteStats.
func computeHasC1Bytes(byteStats []int) bool {
	const (
		c1First = 0x80
		c1Last  = 0x9F
	)
	c1Counts := byteStats[c1First : c1Last+1]
	for i := range c1Counts {
		if c1Counts[i] > 0 {
			return true
		}
	}
	return false
}