From a0938d93869904ebf6d9938485c248b976150fac Mon Sep 17 00:00:00 2001 From: Wim Date: Fri, 7 Jul 2017 23:34:05 +0200 Subject: Add go-charset and chardet to vendor --- .../paulrosania/go-charset/charset/iconv/iconv.go | 184 +++++++++++++++++++++ .../go-charset/charset/iconv/list_query.go | 80 +++++++++ .../go-charset/charset/iconv/list_static.go | 176 ++++++++++++++++++++ 3 files changed, 440 insertions(+) create mode 100644 vendor/github.com/paulrosania/go-charset/charset/iconv/iconv.go create mode 100644 vendor/github.com/paulrosania/go-charset/charset/iconv/list_query.go create mode 100644 vendor/github.com/paulrosania/go-charset/charset/iconv/list_static.go (limited to 'vendor/github.com/paulrosania/go-charset/charset/iconv') diff --git a/vendor/github.com/paulrosania/go-charset/charset/iconv/iconv.go b/vendor/github.com/paulrosania/go-charset/charset/iconv/iconv.go new file mode 100644 index 00000000..f7187f5f --- /dev/null +++ b/vendor/github.com/paulrosania/go-charset/charset/iconv/iconv.go @@ -0,0 +1,184 @@ +// The iconv package provides an interface to the GNU iconv character set +// conversion library (see http://www.gnu.org/software/libiconv/). +// It automatically registers all the character sets with the charset package, +// so it is usually used simply for the side effects of importing it. +// Example: +// import ( +// "go-charset.googlecode.com/hg/charset" +// _ "go-charset.googlecode.com/hg/charset/iconv" +// ) +package iconv + +//#cgo darwin LDFLAGS: -liconv +//#include +//#include +//#include +//iconv_t iconv_open_error = (iconv_t)-1; +//size_t iconv_error = (size_t)-1; +import "C" +import ( + "errors" + "fmt" + "github.com/paulrosania/go-charset/charset" + "runtime" + "strings" + "syscall" + "unicode/utf8" + "unsafe" +) + +type iconvTranslator struct { + cd C.iconv_t + invalid rune + scratch []byte +} + +func canonicalChar(c rune) rune { + if c >= 'a' && c <= 'z' { + return c - 'a' + 'A' + } + return c +} + +func canonicalName(s string) string { + return strings.Map(canonicalChar, s) +} + +func init() { + charset.Register(iconvFactory{}) +} + +type iconvFactory struct { +} + +func (iconvFactory) TranslatorFrom(name string) (charset.Translator, error) { + return Translator("UTF-8", name, utf8.RuneError) +} + +func (iconvFactory) TranslatorTo(name string) (charset.Translator, error) { + // BUG This is wrong. The target character set may not be ASCII + // compatible. There's no easy solution to this other than + // removing the offending code point. + return Translator(name, "UTF-8", '?') +} + +// Translator returns a Translator that translates between +// the named character sets. When an invalid multibyte +// character is found, the bytes in invalid are substituted instead. +func Translator(toCharset, fromCharset string, invalid rune) (charset.Translator, error) { + cto, cfrom := C.CString(toCharset), C.CString(fromCharset) + cd, err := C.iconv_open(cto, cfrom) + + C.free(unsafe.Pointer(cfrom)) + C.free(unsafe.Pointer(cto)) + + if cd == C.iconv_open_error { + if err == syscall.EINVAL { + return nil, errors.New("iconv: conversion not supported") + } + return nil, err + } + t := &iconvTranslator{cd: cd, invalid: invalid} + runtime.SetFinalizer(t, func(*iconvTranslator) { + C.iconv_close(cd) + }) + return t, nil +} + +func (iconvFactory) Names() []string { + all := aliases() + names := make([]string, 0, len(all)) + for name, aliases := range all { + if aliases[0] == name { + names = append(names, name) + } + } + return names +} + +func (iconvFactory) Info(name string) *charset.Charset { + name = strings.ToLower(name) + all := aliases() + a, ok := all[name] + if !ok { + return nil + } + return &charset.Charset{ + Name: name, + Aliases: a, + } +} + +func (p *iconvTranslator) Translate(data []byte, eof bool) (rn int, rd []byte, rerr error) { + n := 0 + p.scratch = p.scratch[:0] + for len(data) > 0 { + p.scratch = ensureCap(p.scratch, len(p.scratch)+len(data)*utf8.UTFMax) + cData := (*C.char)(unsafe.Pointer(&data[:1][0])) + nData := C.size_t(len(data)) + + ns := len(p.scratch) + cScratch := (*C.char)(unsafe.Pointer(&p.scratch[ns : ns+1][0])) + nScratch := C.size_t(cap(p.scratch) - ns) + r, err := C.iconv(p.cd, &cData, &nData, &cScratch, &nScratch) + + p.scratch = p.scratch[0 : cap(p.scratch)-int(nScratch)] + n += len(data) - int(nData) + data = data[len(data)-int(nData):] + + if r != C.iconv_error || err == nil { + return n, p.scratch, nil + } + switch err := err.(syscall.Errno); err { + case C.EILSEQ: + // invalid multibyte sequence - skip one byte and continue + p.scratch = appendRune(p.scratch, p.invalid) + n++ + data = data[1:] + case C.EINVAL: + // incomplete multibyte sequence + return n, p.scratch, nil + case C.E2BIG: + // output buffer not large enough; try again with larger buffer. + p.scratch = ensureCap(p.scratch, cap(p.scratch)+utf8.UTFMax) + default: + panic(fmt.Sprintf("unexpected error code: %v", err)) + } + } + return n, p.scratch, nil +} + +// ensureCap returns s with a capacity of at least n bytes. +// If cap(s) < n, then it returns a new copy of s with the +// required capacity. +func ensureCap(s []byte, n int) []byte { + if n <= cap(s) { + return s + } + // logic adapted from appendslice1 in runtime + m := cap(s) + if m == 0 { + m = n + } else { + for { + if m < 1024 { + m += m + } else { + m += m / 4 + } + if m >= n { + break + } + } + } + t := make([]byte, len(s), m) + copy(t, s) + return t +} + +func appendRune(buf []byte, r rune) []byte { + n := len(buf) + buf = ensureCap(buf, n+utf8.UTFMax) + nu := utf8.EncodeRune(buf[n:n+utf8.UTFMax], r) + return buf[0 : n+nu] +} diff --git a/vendor/github.com/paulrosania/go-charset/charset/iconv/list_query.go b/vendor/github.com/paulrosania/go-charset/charset/iconv/list_query.go new file mode 100644 index 00000000..cda03270 --- /dev/null +++ b/vendor/github.com/paulrosania/go-charset/charset/iconv/list_query.go @@ -0,0 +1,80 @@ +// +build !linux +// This file is systemdependent because not all versions +// of iconv have the iconvlist function. + +package iconv + +//#cgo darwin LDFLAGS: -liconv +//#cgo freebsd LDFLAGS: -liconv +//#cgo windows LDFLAGS: -liconv +//#include +//#include +//#include +//#include +// +//typedef struct nameList nameList; +//struct nameList { +// int n; +// char **names; +// nameList *next; +//}; +// +//int +//addNames(unsigned int n, const char *const *names, void *data) { +// // we can't call back to Go because of the stack size issue, +// // so copy all the names. +// nameList *hd, *e; +// int i; +// +// hd = data; +// e = malloc(sizeof(nameList)); +// e->n = n; +// e->names = malloc(sizeof(char*) * n); +// for(i = 0; i < n; i++){ +// e->names[i] = strdup(names[i]); +// } +// e->next = hd->next; +// hd->next = e; +// return 0; +//} +// +//nameList * +//listNames(void) { +// nameList hd; +// hd.next = 0; +// iconvlist(addNames, &hd); +// return hd.next; +//} +import "C" + +import ( + "strings" + "sync" + "unsafe" +) + +var getAliasesOnce sync.Once +var allAliases = map[string][]string{} + +func aliases() map[string][]string { + getAliasesOnce.Do(getAliases) + return allAliases +} + +func getAliases() { + var next *C.nameList + for p := C.listNames(); p != nil; p = next { + next = p.next + aliases := make([]string, p.n) + pnames := (*[1e9]*C.char)(unsafe.Pointer(p.names)) + for i := range aliases { + aliases[i] = strings.ToLower(C.GoString(pnames[i])) + C.free(unsafe.Pointer(pnames[i])) + } + C.free(unsafe.Pointer(p.names)) + C.free(unsafe.Pointer(p)) + for _, alias := range aliases { + allAliases[alias] = aliases + } + } +} diff --git a/vendor/github.com/paulrosania/go-charset/charset/iconv/list_static.go b/vendor/github.com/paulrosania/go-charset/charset/iconv/list_static.go new file mode 100644 index 00000000..edf9e28a --- /dev/null +++ b/vendor/github.com/paulrosania/go-charset/charset/iconv/list_static.go @@ -0,0 +1,176 @@ +// +build linux + +// We just use a list of names obtained from iconv on a platform +// that allows iconvlist. We could invoke the iconv command, +// but that might fail too, and it gives no information about aliases. + +package iconv + +import ( + "sync" +) + +func aliases() map[string][]string { + initAliasesOnce.Do(initAliases) + return allAliases +} + +var initAliasesOnce sync.Once +var allAliases map[string][]string + +func initAliases() { + allAliases = make(map[string][]string) + for _, a := range aliasData { + for _, alias := range a { + allAliases[alias] = a + } + } +} + +var aliasData = [][]string{ + {"437", "cp437", "ibm437", "cspc8codepage437"}, + {"850", "cp850", "ibm850", "cspc850multilingual"}, + {"852", "cp852", "ibm852", "cspcp852"}, + {"855", "cp855", "ibm855", "csibm855"}, + {"857", "cp857", "ibm857", "csibm857"}, + {"860", "cp860", "ibm860", "csibm860"}, + {"861", "cp-is", "cp861", "ibm861", "csibm861"}, + {"862", "cp862", "ibm862", "cspc862latinhebrew"}, + {"863", "cp863", "ibm863", "csibm863"}, + {"865", "cp865", "ibm865", "csibm865"}, + {"866", "cp866", "ibm866", "csibm866"}, + {"869", "cp-gr", "cp869", "ibm869", "csibm869"}, + {"ansi-x3.4-1968", "ansi-x3.4-1986", "ascii", "cp367", "ibm367", "iso-ir-6", "iso646-us", "iso-646.irv:1991", "us", "us-ascii", "csascii"}, + {"arabic", "asmo-708", "ecma-114", "iso-8859-6", "iso-ir-127", "iso8859-6", "iso-8859-6", "iso-8859-6:1987", "csisolatinarabic"}, + {"armscii-8"}, + {"atari", "atarist"}, + {"big5-2003"}, + {"big-5", "big-five", "big5", "bigfive", "cn-big5", "csbig5"}, + {"big5-hkscs:1999"}, + {"big5-hkscs:2001"}, + {"big5-hkscs", "big5-hkscs:2004", "big5hkscs"}, + {"c99"}, + {"chinese", "gb-2312-80", "iso-ir-58", "csiso58gb231280"}, + {"cn", "gb-1988-80", "iso-ir-57", "iso646-cn", "csiso57gb1988"}, + {"cn-gb", "euc-cn", "euccn", "gb2312", "csgb2312"}, + {"cn-gb-isoir165", "iso-ir-165"}, + {"cp1046"}, + {"cp1124"}, + {"cp1125"}, + {"cp1129"}, + {"cp1131"}, + {"cp1133", "ibm-cp1133"}, + {"cp1161", "ibm-1161", "ibm1161", "csibm1161"}, + {"cp1162", "ibm-1162", "ibm1162", "csibm1162"}, + {"cp1163", "ibm-1163", "ibm1163", "csibm1163"}, + {"cp1250", "ms-ee", "windows-1250"}, + {"cp1251", "ms-cyrl", "windows-1251"}, + {"cp1252", "ms-ansi", "windows-1252"}, + {"cp1253", "ms-greek", "windows-1253"}, + {"cp1254", "ms-turk", "windows-1254"}, + {"cp1255", "ms-hebr", "windows-1255"}, + {"cp1256", "ms-arab", "windows-1256"}, + {"cp1257", "winbaltrim", "windows-1257"}, + {"cp1258", "windows-1258"}, + {"cp1361", "johab"}, + {"cp154", "cyrillic-asian", "pt154", "ptcp154", "csptcp154"}, + {"cp737"}, + {"cp775", "ibm775", "cspc775baltic"}, + {"cp819", "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso-8859-1", "iso-8859-1:1987", "l1", "latin1", "csisolatin1"}, + {"cp853"}, + {"cp856"}, + {"cp858"}, + {"cp864", "ibm864", "csibm864"}, + {"cp874", "windows-874"}, + {"cp922"}, + {"cp932"}, + {"cp936", "ms936", "windows-936"}, + {"cp943"}, + {"cp949", "uhc"}, + {"cp950"}, + {"cyrillic", "iso-8859-5", "iso-ir-144", "iso8859-5", "iso-8859-5", "iso-8859-5:1988", "csisolatincyrillic"}, + {"dec-hanyu"}, + {"dec-kanji"}, + {"ecma-118", "elot-928", "greek", "greek8", "iso-8859-7", "iso-ir-126", "iso8859-7", "iso-8859-7", "iso-8859-7:1987", "iso-8859-7:2003", "csisolatingreek"}, + {"euc-jis-2004", "euc-jisx0213"}, + {"euc-jp", "eucjp", "extended-unix-code-packed-format-for-japanese", "cseucpkdfmtjapanese"}, + {"euc-kr", "euckr", "cseuckr"}, + {"euc-tw", "euctw", "cseuctw"}, + {"gb18030"}, + {"gbk"}, + {"georgian-academy"}, + {"georgian-ps"}, + {"hebrew", "iso-8859-8", "iso-ir-138", "iso8859-8", "iso-8859-8", "iso-8859-8:1988", "csisolatinhebrew"}, + {"hp-roman8", "r8", "roman8", "cshproman8"}, + {"hz", "hz-gb-2312"}, + {"iso-10646-ucs-2", "ucs-2", "csunicode"}, + {"iso-10646-ucs-4", "ucs-4", "csucs4"}, + {"iso-2022-cn", "csiso2022cn"}, + {"iso-2022-cn-ext"}, + {"iso-2022-jp-1"}, + {"iso-2022-jp-2004", "iso-2022-jp-3"}, + {"iso-2022-jp-2", "csiso2022jp2"}, + {"iso-2022-jp", "csiso2022jp"}, + {"iso-2022-kr", "csiso2022kr"}, + {"iso-8859-10", "iso-ir-157", "iso8859-10", "iso-8859-10", "iso-8859-10:1992", "l6", "latin6", "csisolatin6"}, + {"iso-8859-11", "iso8859-11", "iso-8859-11"}, + {"iso-8859-13", "iso-ir-179", "iso8859-13", "iso-8859-13", "l7", "latin7"}, + {"iso-8859-14", "iso-celtic", "iso-ir-199", "iso8859-14", "iso-8859-14", "iso-8859-14:1998", "l8", "latin8"}, + {"iso-8859-15", "iso-ir-203", "iso8859-15", "iso-8859-15", "iso-8859-15:1998", "latin-9"}, + {"iso-8859-16", "iso-ir-226", "iso8859-16", "iso-8859-16", "iso-8859-16:2001", "l10", "latin10"}, + {"iso-8859-2", "iso-ir-101", "iso8859-2", "iso-8859-2", "iso-8859-2:1987", "l2", "latin2", "csisolatin2"}, + {"iso-8859-3", "iso-ir-109", "iso8859-3", "iso-8859-3", "iso-8859-3:1988", "l3", "latin3", "csisolatin3"}, + {"iso-8859-4", "iso-ir-110", "iso8859-4", "iso-8859-4", "iso-8859-4:1988", "l4", "latin4", "csisolatin4"}, + {"iso-8859-9", "iso-ir-148", "iso8859-9", "iso-8859-9", "iso-8859-9:1989", "l5", "latin5", "csisolatin5"}, + {"iso-ir-149", "korean", "ksc-5601", "ks-c-5601-1987", "ks-c-5601-1989", "csksc56011987"}, + {"iso-ir-14", "iso646-jp", "jis-c6220-1969-ro", "jp", "csiso14jisc6220ro"}, + {"iso-ir-159", "jis-x0212", "jis-x0212-1990", "jis-x0212.1990-0", "x0212", "csiso159jisx02121990"}, + {"iso-ir-166", "tis-620", "tis620", "tis620-0", "tis620.2529-1", "tis620.2533-0", "tis620.2533-1"}, + {"iso-ir-230", "tds565"}, + {"iso-ir-87", "jis0208", "jis-c6226-1983", "jis-x0208", "jis-x0208-1983", "jis-x0208-1990", "x0208", "csiso87jisx0208"}, + {"java"}, + {"jisx0201-1976", "jis-x0201", "x0201", "cshalfwidthkatakana"}, + {"koi8-r", "cskoi8r"}, + {"koi8-ru"}, + {"koi8-t"}, + {"koi8-u"}, + {"kz-1048", "rk1048", "strk1048-2002", "cskz1048"}, + {"macarabic"}, + {"maccentraleurope"}, + {"maccroatian"}, + {"maccyrillic"}, + {"macgreek"}, + {"machebrew"}, + {"maciceland"}, + {"mac", "macintosh", "macroman", "csmacintosh"}, + {"macromania"}, + {"macthai"}, + {"macturkish"}, + {"macukraine"}, + {"ms-kanji", "shift-jis", "shift-jis", "sjis", "csshiftjis"}, + {" MS-Windows", "Japanese", "(cp932)"}, + {"mulelao-1"}, + {"nextstep"}, + {"riscos-latin1"}, + {"shift-jis-2004", "shift-jisx0213"}, + {"tcvn", "tcvn-5712", "tcvn5712-1", "tcvn5712-1:1993"}, + {"ucs-2be", "unicode-1-1", "unicodebig", "csunicode11"}, + {"ucs-2-internal"}, + {"ucs-2le", "unicodelittle"}, + {"ucs-2-swapped"}, + {"ucs-4be"}, + {"ucs-4-internal"}, + {"ucs-4le"}, + {"ucs-4-swapped"}, + {"unicode-1-1-utf-7", "utf-7", "csunicode11utf7"}, + {"utf-16"}, + {"utf-16be"}, + {"utf-16le"}, + {"utf-32"}, + {"utf-32be"}, + {"utf-32le"}, + {"utf-8"}, + {"utf-8-mac", "utf8-mac"}, + {"viscii", "viscii1.1-1", "csviscii"}, + {"windows-31j", "cp932"}, +} -- cgit v1.2.3