// summary | refs | log | blame | commit | diff | stats
// path: root/vendor/github.com/saintfish/chardet/detector.go
// blob: e11c222e49f4746feb26ed0669953fc684f99409 (plain) (tree)






































































































































                                                                                                                          
// Package chardet ports character set detection from ICU.
package chardet

import (
	"errors"
	"sort"
)

// Result contains all the information that the charset detector gives about
// one candidate charset. DetectAll returns a slice of Results and DetectBest
// returns a pointer to a single one.
type Result struct {
	// Charset is the IANA name of the detected charset.
	Charset string
	// Language identifies the detected language. It may be empty for some charsets.
	Language string
	// Confidence is the confidence of the Result, documented as a scale from
	// 1 to 100. The bigger, the more confident. DetectAll only reports
	// Results whose Confidence is greater than zero.
	Confidence int
}

// Detector implements charset detection.
//
// Create one with NewTextDetector or NewHtmlDetector. The recognizers field
// holds the recognizers that DetectAll runs against the input. The stripTag
// field is forwarded to newRecognizerInput when the input is prepared;
// NewHtmlDetector sets it to true and NewTextDetector sets it to false.
// NOTE(review): presumably stripTag makes the input preprocessing drop markup
// tags before matching — confirm in newRecognizerInput.
type Detector struct {
	recognizers []recognizer
	stripTag    bool
}

// recognizers is the package-level list of charset recognizers. The same
// slice is handed to every Detector created by NewTextDetector and
// NewHtmlDetector. The group comments below are derived from the constructor
// names only; see the individual recognizer implementations for details.
var recognizers = []recognizer{
	// Unicode encodings (UTF-8, UTF-16, UTF-32), going by the constructor names.
	newRecognizer_utf8(),
	newRecognizer_utf16be(),
	newRecognizer_utf16le(),
	newRecognizer_utf32be(),
	newRecognizer_utf32le(),
	// ISO-8859-x, windows-125x and KOI8-R recognizers; most names carry a
	// language suffix (en, da, de, ...).
	newRecognizer_8859_1_en(),
	newRecognizer_8859_1_da(),
	newRecognizer_8859_1_de(),
	newRecognizer_8859_1_es(),
	newRecognizer_8859_1_fr(),
	newRecognizer_8859_1_it(),
	newRecognizer_8859_1_nl(),
	newRecognizer_8859_1_no(),
	newRecognizer_8859_1_pt(),
	newRecognizer_8859_1_sv(),
	newRecognizer_8859_2_cs(),
	newRecognizer_8859_2_hu(),
	newRecognizer_8859_2_pl(),
	newRecognizer_8859_2_ro(),
	newRecognizer_8859_5_ru(),
	newRecognizer_8859_6_ar(),
	newRecognizer_8859_7_el(),
	newRecognizer_8859_8_I_he(),
	newRecognizer_8859_8_he(),
	newRecognizer_windows_1251(),
	newRecognizer_windows_1256(),
	newRecognizer_KOI8_R(),
	newRecognizer_8859_9_tr(),

	// Shift_JIS, GB18030, EUC-JP, EUC-KR and Big5, going by the constructor names.
	newRecognizer_sjis(),
	newRecognizer_gb_18030(),
	newRecognizer_euc_jp(),
	newRecognizer_euc_kr(),
	newRecognizer_big5(),

	// ISO-2022 variants (JP, KR, CN), going by the constructor names.
	newRecognizer_2022JP(),
	newRecognizer_2022KR(),
	newRecognizer_2022CN(),

	// IBM424 (he) and IBM420 (ar) recognizers, each in an rtl and an ltr variant.
	newRecognizer_IBM424_he_rtl(),
	newRecognizer_IBM424_he_ltr(),
	newRecognizer_IBM420_ar_rtl(),
	newRecognizer_IBM420_ar_ltr(),
}

// NewTextDetector creates a Detector for plain text. The returned Detector
// uses the package-level recognizers list and leaves tag stripping disabled.
func NewTextDetector() *Detector {
	return &Detector{
		recognizers: recognizers,
		stripTag:    false,
	}
}

// NewHtmlDetector creates a Detector for Html. The returned Detector uses the
// package-level recognizers list and has tag stripping enabled.
func NewHtmlDetector() *Detector {
	return &Detector{
		recognizers: recognizers,
		stripTag:    true,
	}
}

// NotDetectedError is the error returned by DetectAll, and therefore by
// DetectBest, when no recognizer reports a non-zero confidence for the input.
// The name and message are kept as they are for backward compatibility.
var NotDetectedError = errors.New("Charset not detected.")

// DetectBest returns the Result with highest Confidence, i.e. the first
// element of what DetectAll reports for b. When DetectAll fails, the Result
// is nil and the error from DetectAll is returned unchanged.
func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
	results, detectErr := d.DetectAll(b)
	if detectErr != nil {
		return nil, detectErr
	}
	// DetectAll sorts by descending Confidence, so the best match comes first.
	return &results[0], nil
}

// DetectAll returns all Results which have non-zero Confidence. The Results
// are sorted by Confidence in descending order.
//
// Every recognizer of d is run concurrently against b. The outcome is
// deterministic: Results with equal Confidence keep the order of
// d.recognizers, and when several recognizers report the same Charset only
// the entry with the highest Confidence is kept (the earliest recognizer wins
// a tie). NotDetectedError is returned when no recognizer reports a
// Confidence greater than zero, which includes a Detector without
// recognizers.
func (d *Detector) DetectAll(b []byte) ([]Result, error) {
	input := newRecognizerInput(b, d.stripTag)

	// One single-slot channel per recognizer: the recognizers still run in
	// parallel, but their outputs are gathered in recognizer order instead of
	// goroutine completion order, which varies from run to run.
	outputChans := make([]chan recognizerOutput, len(d.recognizers))
	for i, r := range d.recognizers {
		outputChans[i] = make(chan recognizerOutput, 1)
		go matchHelper(r, input, outputChans[i])
	}

	outputs := make([]recognizerOutput, 0, len(d.recognizers))
	for _, ch := range outputChans {
		if o := <-ch; o.Confidence > 0 {
			outputs = append(outputs, o)
		}
	}
	if len(outputs) == 0 {
		return nil, NotDetectedError
	}

	// A stable sort keeps recognizer order among equal confidences, so ties
	// are always resolved the same way.
	sort.Stable(recognizerOutputs(outputs))

	dedupOutputs := make([]Result, 0, len(outputs))
	foundCharsets := make(map[string]struct{}, len(outputs))
	for _, o := range outputs {
		if _, found := foundCharsets[o.Charset]; found {
			continue
		}
		foundCharsets[o.Charset] = struct{}{}
		dedupOutputs = append(dedupOutputs, Result(o))
	}
	// outputs is non-empty here, so dedupOutputs holds at least one Result.
	return dedupOutputs, nil
}

// matchHelper runs r.Match on input and sends the outcome on outputChan.
// DetectAll starts it as a goroutine, once per recognizer. The send blocks
// until the value can be delivered to outputChan.
func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
	result := r.Match(input)
	outputChan <- result
}

// recognizerOutputs provides the Len, Less and Swap methods needed to sort a
// slice of recognizerOutput by Confidence in descending order.
type recognizerOutputs []recognizerOutput

// Len reports the number of outputs.
func (rs recognizerOutputs) Len() int {
	return len(rs)
}

// Less reports whether element i has a strictly higher Confidence than
// element j, which places the most confident outputs first.
func (rs recognizerOutputs) Less(i, j int) bool {
	return rs[j].Confidence < rs[i].Confidence
}

// Swap exchanges the elements at positions i and j.
func (rs recognizerOutputs) Swap(i, j int) {
	rs[i], rs[j] = rs[j], rs[i]
}