summaryrefslogtreecommitdiffstats
path: root/vendor/github.com/saintfish/chardet/detector.go
diff options
context:
space:
mode:
authorWim <wim@42.be>2017-07-07 23:34:05 +0200
committerWim <wim@42.be>2017-07-07 23:34:05 +0200
commita0938d93869904ebf6d9938485c248b976150fac (patch)
treea12fad5acdceeec93a28efb600ca62b9fdfb40a5 /vendor/github.com/saintfish/chardet/detector.go
parent2338c69d402ad3779f4e2a2f38ac800ceca656b9 (diff)
downloadmatterbridge-msglm-a0938d93869904ebf6d9938485c248b976150fac.tar.gz
matterbridge-msglm-a0938d93869904ebf6d9938485c248b976150fac.tar.bz2
matterbridge-msglm-a0938d93869904ebf6d9938485c248b976150fac.zip
Add go-charset and chardet to vendor
Diffstat (limited to 'vendor/github.com/saintfish/chardet/detector.go')
-rw-r--r--vendor/github.com/saintfish/chardet/detector.go136
1 files changed, 136 insertions, 0 deletions
diff --git a/vendor/github.com/saintfish/chardet/detector.go b/vendor/github.com/saintfish/chardet/detector.go
new file mode 100644
index 00000000..e11c222e
--- /dev/null
+++ b/vendor/github.com/saintfish/chardet/detector.go
@@ -0,0 +1,136 @@
+// Package chardet ports character set detection from ICU.
+package chardet
+
+import (
+ "errors"
+ "sort"
+)
+
+// Result contains all the information that charset detector gives.
+type Result struct {
+ // IANA name of the detected charset.
+ Charset string
+ // IANA name of the detected language. It may be empty for some charsets.
+ Language string
+ // Confidence of the Result. Scale from 1 to 100. The bigger, the more confident.
+ Confidence int
+}
+
+// Detector implements charset detection.
+type Detector struct {
+ recognizers []recognizer
+ stripTag bool
+}
+
+// List of charset recognizers
+var recognizers = []recognizer{
+ newRecognizer_utf8(),
+ newRecognizer_utf16be(),
+ newRecognizer_utf16le(),
+ newRecognizer_utf32be(),
+ newRecognizer_utf32le(),
+ newRecognizer_8859_1_en(),
+ newRecognizer_8859_1_da(),
+ newRecognizer_8859_1_de(),
+ newRecognizer_8859_1_es(),
+ newRecognizer_8859_1_fr(),
+ newRecognizer_8859_1_it(),
+ newRecognizer_8859_1_nl(),
+ newRecognizer_8859_1_no(),
+ newRecognizer_8859_1_pt(),
+ newRecognizer_8859_1_sv(),
+ newRecognizer_8859_2_cs(),
+ newRecognizer_8859_2_hu(),
+ newRecognizer_8859_2_pl(),
+ newRecognizer_8859_2_ro(),
+ newRecognizer_8859_5_ru(),
+ newRecognizer_8859_6_ar(),
+ newRecognizer_8859_7_el(),
+ newRecognizer_8859_8_I_he(),
+ newRecognizer_8859_8_he(),
+ newRecognizer_windows_1251(),
+ newRecognizer_windows_1256(),
+ newRecognizer_KOI8_R(),
+ newRecognizer_8859_9_tr(),
+
+ newRecognizer_sjis(),
+ newRecognizer_gb_18030(),
+ newRecognizer_euc_jp(),
+ newRecognizer_euc_kr(),
+ newRecognizer_big5(),
+
+ newRecognizer_2022JP(),
+ newRecognizer_2022KR(),
+ newRecognizer_2022CN(),
+
+ newRecognizer_IBM424_he_rtl(),
+ newRecognizer_IBM424_he_ltr(),
+ newRecognizer_IBM420_ar_rtl(),
+ newRecognizer_IBM420_ar_ltr(),
+}
+
+// NewTextDetector creates a Detector for plain text.
+func NewTextDetector() *Detector {
+ return &Detector{recognizers, false}
+}
+
+// NewHtmlDetector creates a Detector for Html.
+func NewHtmlDetector() *Detector {
+ return &Detector{recognizers, true}
+}
+
+var (
+ NotDetectedError = errors.New("Charset not detected.")
+)
+
+// DetectBest returns the Result with highest Confidence.
+func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
+ var all []Result
+ if all, err = d.DetectAll(b); err == nil {
+ r = &all[0]
+ }
+ return
+}
+
+// DetectAll returns all Results which have non-zero Confidence. The Results are sorted by Confidence in descending order.
+func (d *Detector) DetectAll(b []byte) ([]Result, error) {
+ input := newRecognizerInput(b, d.stripTag)
+ outputChan := make(chan recognizerOutput)
+ for _, r := range d.recognizers {
+ go matchHelper(r, input, outputChan)
+ }
+ outputs := make([]recognizerOutput, 0, len(d.recognizers))
+ for i := 0; i < len(d.recognizers); i++ {
+ o := <-outputChan
+ if o.Confidence > 0 {
+ outputs = append(outputs, o)
+ }
+ }
+ if len(outputs) == 0 {
+ return nil, NotDetectedError
+ }
+
+ sort.Sort(recognizerOutputs(outputs))
+ dedupOutputs := make([]Result, 0, len(outputs))
+ foundCharsets := make(map[string]struct{}, len(outputs))
+ for _, o := range outputs {
+ if _, found := foundCharsets[o.Charset]; !found {
+ dedupOutputs = append(dedupOutputs, Result(o))
+ foundCharsets[o.Charset] = struct{}{}
+ }
+ }
+ if len(dedupOutputs) == 0 {
+ return nil, NotDetectedError
+ }
+ return dedupOutputs, nil
+}
+
+func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
+ outputChan <- r.Match(input)
+}
+
+type recognizerOutputs []recognizerOutput
+
+func (r recognizerOutputs) Len() int { return len(r) }
+func (r recognizerOutputs) Less(i, j int) bool { return r[i].Confidence > r[j].Confidence }
+func (r recognizerOutputs) Swap(i, j int) { r[i], r[j] = r[j], r[i] }