diff options
author | Wim <wim@42.be> | 2017-07-07 23:34:05 +0200 |
---|---|---|
committer | Wim <wim@42.be> | 2017-07-07 23:34:05 +0200 |
commit | a0938d93869904ebf6d9938485c248b976150fac (patch) | |
tree | a12fad5acdceeec93a28efb600ca62b9fdfb40a5 /vendor/github.com/paulrosania/go-charset/charset | |
parent | 2338c69d402ad3779f4e2a2f38ac800ceca656b9 (diff) | |
download | matterbridge-msglm-a0938d93869904ebf6d9938485c248b976150fac.tar.gz matterbridge-msglm-a0938d93869904ebf6d9938485c248b976150fac.tar.bz2 matterbridge-msglm-a0938d93869904ebf6d9938485c248b976150fac.zip |
Add go-charset and chardet to vendor
Diffstat (limited to 'vendor/github.com/paulrosania/go-charset/charset')
12 files changed, 1585 insertions, 0 deletions
// codePointError reports a byte that cannot be represented in the
// character set being translated, together with where it was found.
type codePointError struct {
	i       int    // index of the offending byte within the data passed to Translate
	cp      rune   // the offending value, widened to a rune
	charset string // name of the character set, e.g. "US-ASCII"
}

// Error implements the error interface.
//
// The index and code point are printed in decimal. (The previous format
// string used %n, which is not a fmt verb and rendered as "%!n(int=3)".)
func (e *codePointError) Error() string {
	return fmt.Sprintf("Parse error at index %d: Code point %d is undefined in %s", e.i, e.cp, e.charset)
}
registerClass("big5", fromBig5, nil) +} + +// Big5 consists of 89 fonts of 157 chars each +const ( + big5Max = 13973 + big5Font = 157 + big5Data = "big5.dat" +) + +type translateFromBig5 struct { + font int + scratch []byte + big5map []rune +} + +func (p *translateFromBig5) Translate(data []byte, eof bool) (int, []byte, error) { + p.scratch = p.scratch[:0] + n := 0 + for len(data) > 0 { + c := int(data[0]) + data = data[1:] + n++ + if p.font == -1 { + // idle state + if c >= 0xa1 { + p.font = c + continue + } + if c == 26 { + c = '\n' + } + continue + } + f := p.font + p.font = -1 + r := utf8.RuneError + switch { + case c >= 64 && c <= 126: + c -= 64 + case c >= 161 && c <= 254: + c = c - 161 + 63 + default: + // bad big5 char + f = 255 + } + if f <= 254 { + f -= 161 + ix := f*big5Font + c + if ix < len(p.big5map) { + r = p.big5map[ix] + } + if r == -1 { + r = utf8.RuneError + } + } + p.scratch = appendRune(p.scratch, r) + } + return n, p.scratch, nil +} + +type big5Key bool + +func fromBig5(arg string) (Translator, error) { + big5map, err := cache(big5Key(false), func() (interface{}, error) { + data, err := readFile(big5Data) + if err != nil { + return nil, fmt.Errorf("charset: cannot open big5 data file: %v", err) + } + big5map := []rune(string(data)) + if len(big5map) != big5Max { + return nil, fmt.Errorf("charset: corrupt big5 data") + } + return big5map, nil + }) + if err != nil { + return nil, err + } + return &translateFromBig5{big5map: big5map.([]rune), font: -1}, nil +} diff --git a/vendor/github.com/paulrosania/go-charset/charset/charset.go b/vendor/github.com/paulrosania/go-charset/charset/charset.go new file mode 100644 index 00000000..6ab6cf89 --- /dev/null +++ b/vendor/github.com/paulrosania/go-charset/charset/charset.go @@ -0,0 +1,301 @@ +// The charset package implements translation between character sets. +// It uses Unicode as the intermediate representation. 
+// Because it can be large, the character set data is separated +// from the charset package. It can be embedded in the Go +// executable by importing the data package: +// +// import _ "github.com/paulrosania/go-charset/data" +// +// It can also made available in a data directory (by settting CharsetDir). +package charset + +import ( + "io" + "strings" + "unicode/utf8" +) + +// Charset holds information about a given character set. +type Charset struct { + Name string // Canonical name of character set. + Aliases []string // Known aliases. + Desc string // Description. + NoFrom bool // Not possible to translate from this charset. + NoTo bool // Not possible to translate to this charset. +} + +// Translator represents a character set converter. +// The Translate method translates the given data, +// and returns the number of bytes of data consumed, +// a slice containing the converted data (which may be +// overwritten on the next call to Translate), and any +// conversion error. If eof is true, the data represents +// the final bytes of the input. +type Translator interface { + Translate(data []byte, eof bool) (n int, cdata []byte, err error) +} + +// A Factory can be used to make character set translators. +type Factory interface { + // TranslatorFrom creates a translator that will translate from the named character + // set to UTF-8. + TranslatorFrom(name string) (Translator, error) // Create a Translator from this character set to. + + // TranslatorTo creates a translator that will translate from UTF-8 to the named character set. + TranslatorTo(name string) (Translator, error) // Create a Translator To this character set. + + // Names returns all the character set names accessibile through the factory. + Names() []string + + // Info returns information on the named character set. It returns nil if the + // factory doesn't recognise the given name. 
+ Info(name string) *Charset +} + +var factories = []Factory{localFactory{}} + +// Register registers a new Factory which will be consulted when NewReader +// or NewWriter needs a character set translator for a given name. +func Register(factory Factory) { + factories = append(factories, factory) +} + +// NewReader returns a new Reader that translates from the named +// character set to UTF-8 as it reads r. +func NewReader(charset string, r io.Reader) (io.Reader, error) { + tr, err := TranslatorFrom(charset) + if err != nil { + return nil, err + } + return NewTranslatingReader(r, tr), nil +} + +// NewWriter returns a new WriteCloser writing to w. It converts writes +// of UTF-8 text into writes on w of text in the named character set. +// The Close is necessary to flush any remaining partially translated +// characters to the output. +func NewWriter(charset string, w io.Writer) (io.WriteCloser, error) { + tr, err := TranslatorTo(charset) + if err != nil { + return nil, err + } + return NewTranslatingWriter(w, tr), nil +} + +// Info returns information about a character set, or nil +// if the character set is not found. +func Info(name string) *Charset { + for _, f := range factories { + if info := f.Info(name); info != nil { + return info + } + } + return nil +} + +// Names returns the canonical names of all supported character sets, in alphabetical order. +func Names() []string { + // TODO eliminate duplicates + var names []string + for _, f := range factories { + names = append(names, f.Names()...) + } + return names +} + +// TranslatorFrom returns a translator that will translate from +// the named character set to UTF-8. 
+func TranslatorFrom(charset string) (Translator, error) { + var err error + var tr Translator + for _, f := range factories { + tr, err = f.TranslatorFrom(charset) + if err == nil { + break + } + } + if tr == nil { + return nil, err + } + return tr, nil +} + +// TranslatorTo returns a translator that will translate from UTF-8 +// to the named character set. +func TranslatorTo(charset string) (Translator, error) { + var err error + var tr Translator + for _, f := range factories { + tr, err = f.TranslatorTo(charset) + if err == nil { + break + } + } + if tr == nil { + return nil, err + } + return tr, nil +} + +func normalizedChar(c rune) rune { + switch { + case c >= 'A' && c <= 'Z': + c = c - 'A' + 'a' + case c == '_': + c = '-' + } + return c +} + +// NormalisedName returns s with all Roman capitals +// mapped to lower case, and '_' mapped to '-' +func NormalizedName(s string) string { + return strings.Map(normalizedChar, s) +} + +type translatingWriter struct { + w io.Writer + tr Translator + buf []byte // unconsumed data from writer. +} + +// NewTranslatingWriter returns a new WriteCloser writing to w. +// It passes the written bytes through the given Translator. +func NewTranslatingWriter(w io.Writer, tr Translator) io.WriteCloser { + return &translatingWriter{w: w, tr: tr} +} + +func (w *translatingWriter) Write(data []byte) (rn int, rerr error) { + wdata := data + if len(w.buf) > 0 { + w.buf = append(w.buf, data...) + wdata = w.buf + } + n, cdata, err := w.tr.Translate(wdata, false) + if err != nil { + // TODO + } + if n > 0 { + _, err = w.w.Write(cdata) + if err != nil { + return 0, err + } + } + w.buf = w.buf[:0] + if n < len(wdata) { + w.buf = append(w.buf, wdata[n:]...) + } + return len(data), nil +} + +func (p *translatingWriter) Close() error { + for { + n, data, err := p.tr.Translate(p.buf, true) + p.buf = p.buf[n:] + if err != nil { + // TODO + } + // If the Translator produces no data + // at EOF, then assume that it never will. 
+ if len(data) == 0 { + break + } + n, err = p.w.Write(data) + if err != nil { + return err + } + if n < len(data) { + return io.ErrShortWrite + } + if len(p.buf) == 0 { + break + } + } + return nil +} + +type translatingReader struct { + r io.Reader + tr Translator + cdata []byte // unconsumed data from converter. + rdata []byte // unconverted data from reader. + err error // final error from reader. +} + +// NewTranslatingReader returns a new Reader that +// translates data using the given Translator as it reads r. +func NewTranslatingReader(r io.Reader, tr Translator) io.Reader { + return &translatingReader{r: r, tr: tr} +} + +func (r *translatingReader) Read(buf []byte) (int, error) { + for { + if len(r.cdata) > 0 { + n := copy(buf, r.cdata) + r.cdata = r.cdata[n:] + return n, nil + } + if r.err == nil { + r.rdata = ensureCap(r.rdata, len(r.rdata)+len(buf)) + n, err := r.r.Read(r.rdata[len(r.rdata):cap(r.rdata)]) + // Guard against non-compliant Readers. + if n == 0 && err == nil { + err = io.EOF + } + r.rdata = r.rdata[0 : len(r.rdata)+n] + r.err = err + } else if len(r.rdata) == 0 { + break + } + nc, cdata, cvterr := r.tr.Translate(r.rdata, r.err != nil) + if cvterr != nil { + // TODO + } + r.cdata = cdata + + // Ensure that we consume all bytes at eof + // if the converter refuses them. + if nc == 0 && r.err != nil { + nc = len(r.rdata) + } + + // Copy unconsumed data to the start of the rdata buffer. + r.rdata = r.rdata[0:copy(r.rdata, r.rdata[nc:])] + } + return 0, r.err +} + +// ensureCap returns s with a capacity of at least n bytes. +// If cap(s) < n, then it returns a new copy of s with the +// required capacity. 
+func ensureCap(s []byte, n int) []byte { + if n <= cap(s) { + return s + } + // logic adapted from appendslice1 in runtime + m := cap(s) + if m == 0 { + m = n + } else { + for { + if m < 1024 { + m += m + } else { + m += m / 4 + } + if m >= n { + break + } + } + } + t := make([]byte, len(s), m) + copy(t, s) + return t +} + +func appendRune(buf []byte, r rune) []byte { + n := len(buf) + buf = ensureCap(buf, n+utf8.UTFMax) + nu := utf8.EncodeRune(buf[n:n+utf8.UTFMax], r) + return buf[0 : n+nu] +} diff --git a/vendor/github.com/paulrosania/go-charset/charset/codepage.go b/vendor/github.com/paulrosania/go-charset/charset/codepage.go new file mode 100644 index 00000000..6864c875 --- /dev/null +++ b/vendor/github.com/paulrosania/go-charset/charset/codepage.go @@ -0,0 +1,133 @@ +package charset + +import ( + "fmt" + "unicode/utf8" +) + +func init() { + registerClass("cp", fromCodePage, toCodePage) +} + +type translateFromCodePage struct { + byte2rune *[256]rune + scratch []byte +} + +type cpKeyFrom string +type cpKeyTo string + +func (p *translateFromCodePage) Translate(data []byte, eof bool) (int, []byte, error) { + p.scratch = ensureCap(p.scratch, len(data)*utf8.UTFMax)[:0] + buf := p.scratch + for _, x := range data { + r := p.byte2rune[x] + if r < utf8.RuneSelf { + buf = append(buf, byte(r)) + continue + } + size := utf8.EncodeRune(buf[len(buf):cap(buf)], r) + buf = buf[0 : len(buf)+size] + } + return len(data), buf, nil +} + +type toCodePageInfo struct { + rune2byte map[rune]byte + // same gives the number of runes at start of code page that map exactly to + // unicode. 
+ same rune +} + +type translateToCodePage struct { + toCodePageInfo + scratch []byte +} + +func (p *translateToCodePage) Translate(data []byte, eof bool) (int, []byte, error) { + p.scratch = ensureCap(p.scratch, len(data)) + buf := p.scratch[:0] + + for i := 0; i < len(data); { + r := rune(data[i]) + size := 1 + if r >= utf8.RuneSelf { + r, size = utf8.DecodeRune(data[i:]) + if size == 1 && !eof && !utf8.FullRune(data[i:]) { + return i, buf, nil + } + } + + var b byte + if r < p.same { + b = byte(r) + } else { + var ok bool + b, ok = p.rune2byte[r] + if !ok { + b = '?' + } + } + buf = append(buf, b) + i += size + } + return len(data), buf, nil +} + +func fromCodePage(arg string) (Translator, error) { + runes, err := cache(cpKeyFrom(arg), func() (interface{}, error) { + data, err := readFile(arg) + if err != nil { + return nil, err + } + runes := []rune(string(data)) + if len(runes) != 256 { + return nil, fmt.Errorf("charset: %q has wrong rune count (%d)", arg, len(runes)) + } + r := new([256]rune) + copy(r[:], runes) + return r, nil + }) + if err != nil { + return nil, err + } + return &translateFromCodePage{byte2rune: runes.(*[256]rune)}, nil +} + +func toCodePage(arg string) (Translator, error) { + m, err := cache(cpKeyTo(arg), func() (interface{}, error) { + data, err := readFile(arg) + if err != nil { + return nil, err + } + + info := toCodePageInfo{ + rune2byte: make(map[rune]byte), + same: 256, + } + atStart := true + i := rune(0) + for _, r := range string(data) { + if atStart { + if r == i { + i++ + continue + } + info.same = i + atStart = false + } + info.rune2byte[r] = byte(i) + i++ + } + // TODO fix tables + // fmt.Printf("%s, same = %d\n", arg, info.same) + if i != 256 { + return nil, fmt.Errorf("charset: %q has wrong rune count (%d)", arg, i) + } + return info, nil + }) + if err != nil { + return nil, err + } + return &translateToCodePage{toCodePageInfo: m.(toCodePageInfo)}, nil +} diff --git a/vendor/github.com/paulrosania/go-charset/charset/cp932.go 
b/vendor/github.com/paulrosania/go-charset/charset/cp932.go new file mode 100644 index 00000000..9f46262b --- /dev/null +++ b/vendor/github.com/paulrosania/go-charset/charset/cp932.go @@ -0,0 +1,195 @@ +package charset + +import ( + "fmt" + "unicode/utf8" +) + +func init() { + registerClass("cp932", fromCP932, nil) +} + +// encoding details +// (Traditional) Shift-JIS +// +// 00..1f control characters +// 20 space +// 21..7f JIS X 0201:1976/1997 roman (see notes) +// 80 undefined +// 81..9f lead byte of JIS X 0208-1983 or JIS X 0202:1990/1997 +// a0 undefined +// a1..df JIS X 0201:1976/1997 katakana +// e0..ea lead byte of JIS X 0208-1983 or JIS X 0202:1990/1997 +// eb..ff undefined +// +// CP932 (windows-31J) +// +// this encoding scheme extends Shift-JIS in the following way +// +// eb..ec undefined (marked as lead bytes - see notes below) +// ed..ee lead byte of NEC-selected IBM extended characters +// ef undefined (marked as lead byte - see notes below) +// f0..f9 lead byte of User defined GAIJI (see note below) +// fa..fc lead byte of IBM extended characters +// fd..ff undefined +// +// +// Notes +// +// JISX 0201:1976/1997 roman +// this is the same as ASCII but with 0x5c (ASCII code for '\') +// representing the Yen currency symbol '¥' (U+00a5) +// This mapping is contentious, some conversion packages implent it +// others do not. +// The mapping files from The Unicode Consortium show cp932 mapping +// plain ascii in the range 00..7f whereas shift-jis maps 0x5c ('\') to the yen +// symbol (¥) and 0x7e ('~') to overline (¯) +// +// CP932 double-byte character codes: +// +// eb-ec, ef, f0-f9: +// Marked as DBCS LEAD BYTEs in the unicode mapping data +// obtained from: +// https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT +// +// but there are no defined mappings for codes in this range. +// It is not clear whether or not an implementation should +// consume one or two bytes before emitting an error char. 
+ +const ( + kanaPages = 1 + kanaPageSize = 63 + kanaChar0 = 0xa1 + + cp932Pages = 45 // 81..84, 87..9f, e0..ea, ed..ee, fa..fc + cp932PageSize = 189 // 40..fc (including 7f) + cp932Char0 = 0x40 +) + +type jisTables struct { + page0 [256]rune + dbcsoff [256]int + cp932 []rune +} + +type translateFromCP932 struct { + tables *jisTables + scratch []byte +} + +func (p *translateFromCP932) Translate(data []byte, eof bool) (int, []byte, error) { + tables := p.tables + p.scratch = p.scratch[:0] + n := 0 + for i := 0; i < len(data); i++ { + b := data[i] + r := tables.page0[b] + if r != -1 { + p.scratch = appendRune(p.scratch, r) + n++ + continue + } + // DBCS + i++ + if i >= len(data) { + break + } + pnum := tables.dbcsoff[b] + ix := int(data[i]) - cp932Char0 + if pnum == -1 || ix < 0 || ix >= cp932PageSize { + r = utf8.RuneError + } else { + r = tables.cp932[pnum*cp932PageSize+ix] + } + p.scratch = appendRune(p.scratch, r) + n += 2 + } + return n, p.scratch, nil +} + +type cp932Key bool + +func fromCP932(arg string) (Translator, error) { + shiftJIS := arg == "shiftjis" + tables, err := cache(cp932Key(shiftJIS), func() (interface{}, error) { + tables := new(jisTables) + kana, err := jisGetMap("jisx0201kana.dat", kanaPageSize, kanaPages) + if err != nil { + return nil, err + } + tables.cp932, err = jisGetMap("cp932.dat", cp932PageSize, cp932Pages) + if err != nil { + return nil, err + } + + // jisx0201kana is mapped into 0xA1..0xDF + for i := 0; i < kanaPageSize; i++ { + tables.page0[i+kanaChar0] = kana[i] + } + + // 00..7f same as ascii in cp932 + for i := rune(0); i < 0x7f; i++ { + tables.page0[i] = i + } + + if shiftJIS { + // shift-jis uses JIS X 0201 for the ASCII range + // this is the same as ASCII apart from + // 0x5c ('\') maps to yen symbol (¥) and 0x7e ('~') maps to overline (¯) + tables.page0['\\'] = '¥' + tables.page0['~'] = '¯' + } + + // pre-calculate DBCS page numbers to mapping file page numbers + // and mark codes in page0 that are DBCS lead bytes + pnum 
:= 0 + for i := 0x81; i <= 0x84; i++ { + tables.page0[i] = -1 + tables.dbcsoff[i] = pnum + pnum++ + } + for i := 0x87; i <= 0x9f; i++ { + tables.page0[i] = -1 + tables.dbcsoff[i] = pnum + pnum++ + } + for i := 0xe0; i <= 0xea; i++ { + tables.page0[i] = -1 + tables.dbcsoff[i] = pnum + pnum++ + } + if shiftJIS { + return tables, nil + } + // add in cp932 extensions + for i := 0xed; i <= 0xee; i++ { + tables.page0[i] = -1 + tables.dbcsoff[i] = pnum + pnum++ + } + for i := 0xfa; i <= 0xfc; i++ { + tables.page0[i] = -1 + tables.dbcsoff[i] = pnum + pnum++ + } + return tables, nil + }) + + if err != nil { + return nil, err + } + + return &translateFromCP932{tables: tables.(*jisTables)}, nil +} + +func jisGetMap(name string, pgsize, npages int) ([]rune, error) { + data, err := readFile(name) + if err != nil { + return nil, err + } + m := []rune(string(data)) + if len(m) != pgsize*npages { + return nil, fmt.Errorf("%q: incorrect length data", name) + } + return m, nil +} diff --git a/vendor/github.com/paulrosania/go-charset/charset/file.go b/vendor/github.com/paulrosania/go-charset/charset/file.go new file mode 100644 index 00000000..a0c26225 --- /dev/null +++ b/vendor/github.com/paulrosania/go-charset/charset/file.go @@ -0,0 +1,40 @@ +package charset + +import ( + "io" + "io/ioutil" + "os" + "path/filepath" +) + +var files = make(map[string]func() (io.ReadCloser, error)) + +// RegisterDataFile registers the existence of a given data +// file with the given name that may be used by a character-set converter. +// It is intended to be used by packages that wish to embed +// data in the executable binary, and should not be +// used normally. +func RegisterDataFile(name string, open func() (io.ReadCloser, error)) { + files[name] = open +} + +// CharsetDir gives the location of the default data file directory. +// This directory will be used for files with names that have not +// been registered with RegisterDataFile. 
+var CharsetDir = "/usr/local/lib/go-charset/datafiles" + +func readFile(name string) (data []byte, err error) { + var r io.ReadCloser + if open := files[name]; open != nil { + r, err = open() + if err != nil { + return + } + } else { + r, err = os.Open(filepath.Join(CharsetDir, name)) + if err != nil { + return + } + } + return ioutil.ReadAll(r) +} diff --git a/vendor/github.com/paulrosania/go-charset/charset/iconv/iconv.go b/vendor/github.com/paulrosania/go-charset/charset/iconv/iconv.go new file mode 100644 index 00000000..f7187f5f --- /dev/null +++ b/vendor/github.com/paulrosania/go-charset/charset/iconv/iconv.go @@ -0,0 +1,184 @@ +// The iconv package provides an interface to the GNU iconv character set +// conversion library (see http://www.gnu.org/software/libiconv/). +// It automatically registers all the character sets with the charset package, +// so it is usually used simply for the side effects of importing it. +// Example: +// import ( +// "go-charset.googlecode.com/hg/charset" +// _ "go-charset.googlecode.com/hg/charset/iconv" +// ) +package iconv + +//#cgo darwin LDFLAGS: -liconv +//#include <stdlib.h> +//#include <iconv.h> +//#include <errno.h> +//iconv_t iconv_open_error = (iconv_t)-1; +//size_t iconv_error = (size_t)-1; +import "C" +import ( + "errors" + "fmt" + "github.com/paulrosania/go-charset/charset" + "runtime" + "strings" + "syscall" + "unicode/utf8" + "unsafe" +) + +type iconvTranslator struct { + cd C.iconv_t + invalid rune + scratch []byte +} + +func canonicalChar(c rune) rune { + if c >= 'a' && c <= 'z' { + return c - 'a' + 'A' + } + return c +} + +func canonicalName(s string) string { + return strings.Map(canonicalChar, s) +} + +func init() { + charset.Register(iconvFactory{}) +} + +type iconvFactory struct { +} + +func (iconvFactory) TranslatorFrom(name string) (charset.Translator, error) { + return Translator("UTF-8", name, utf8.RuneError) +} + +func (iconvFactory) TranslatorTo(name string) (charset.Translator, error) { + // BUG This 
is wrong. The target character set may not be ASCII + // compatible. There's no easy solution to this other than + // removing the offending code point. + return Translator(name, "UTF-8", '?') +} + +// Translator returns a Translator that translates between +// the named character sets. When an invalid multibyte +// character is found, the bytes in invalid are substituted instead. +func Translator(toCharset, fromCharset string, invalid rune) (charset.Translator, error) { + cto, cfrom := C.CString(toCharset), C.CString(fromCharset) + cd, err := C.iconv_open(cto, cfrom) + + C.free(unsafe.Pointer(cfrom)) + C.free(unsafe.Pointer(cto)) + + if cd == C.iconv_open_error { + if err == syscall.EINVAL { + return nil, errors.New("iconv: conversion not supported") + } + return nil, err + } + t := &iconvTranslator{cd: cd, invalid: invalid} + runtime.SetFinalizer(t, func(*iconvTranslator) { + C.iconv_close(cd) + }) + return t, nil +} + +func (iconvFactory) Names() []string { + all := aliases() + names := make([]string, 0, len(all)) + for name, aliases := range all { + if aliases[0] == name { + names = append(names, name) + } + } + return names +} + +func (iconvFactory) Info(name string) *charset.Charset { + name = strings.ToLower(name) + all := aliases() + a, ok := all[name] + if !ok { + return nil + } + return &charset.Charset{ + Name: name, + Aliases: a, + } +} + +func (p *iconvTranslator) Translate(data []byte, eof bool) (rn int, rd []byte, rerr error) { + n := 0 + p.scratch = p.scratch[:0] + for len(data) > 0 { + p.scratch = ensureCap(p.scratch, len(p.scratch)+len(data)*utf8.UTFMax) + cData := (*C.char)(unsafe.Pointer(&data[:1][0])) + nData := C.size_t(len(data)) + + ns := len(p.scratch) + cScratch := (*C.char)(unsafe.Pointer(&p.scratch[ns : ns+1][0])) + nScratch := C.size_t(cap(p.scratch) - ns) + r, err := C.iconv(p.cd, &cData, &nData, &cScratch, &nScratch) + + p.scratch = p.scratch[0 : cap(p.scratch)-int(nScratch)] + n += len(data) - int(nData) + data = 
data[len(data)-int(nData):] + + if r != C.iconv_error || err == nil { + return n, p.scratch, nil + } + switch err := err.(syscall.Errno); err { + case C.EILSEQ: + // invalid multibyte sequence - skip one byte and continue + p.scratch = appendRune(p.scratch, p.invalid) + n++ + data = data[1:] + case C.EINVAL: + // incomplete multibyte sequence + return n, p.scratch, nil + case C.E2BIG: + // output buffer not large enough; try again with larger buffer. + p.scratch = ensureCap(p.scratch, cap(p.scratch)+utf8.UTFMax) + default: + panic(fmt.Sprintf("unexpected error code: %v", err)) + } + } + return n, p.scratch, nil +} + +// ensureCap returns s with a capacity of at least n bytes. +// If cap(s) < n, then it returns a new copy of s with the +// required capacity. +func ensureCap(s []byte, n int) []byte { + if n <= cap(s) { + return s + } + // logic adapted from appendslice1 in runtime + m := cap(s) + if m == 0 { + m = n + } else { + for { + if m < 1024 { + m += m + } else { + m += m / 4 + } + if m >= n { + break + } + } + } + t := make([]byte, len(s), m) + copy(t, s) + return t +} + +func appendRune(buf []byte, r rune) []byte { + n := len(buf) + buf = ensureCap(buf, n+utf8.UTFMax) + nu := utf8.EncodeRune(buf[n:n+utf8.UTFMax], r) + return buf[0 : n+nu] +} diff --git a/vendor/github.com/paulrosania/go-charset/charset/iconv/list_query.go b/vendor/github.com/paulrosania/go-charset/charset/iconv/list_query.go new file mode 100644 index 00000000..cda03270 --- /dev/null +++ b/vendor/github.com/paulrosania/go-charset/charset/iconv/list_query.go @@ -0,0 +1,80 @@ +// +build !linux +// This file is systemdependent because not all versions +// of iconv have the iconvlist function. 
+ +package iconv + +//#cgo darwin LDFLAGS: -liconv +//#cgo freebsd LDFLAGS: -liconv +//#cgo windows LDFLAGS: -liconv +//#include <stdlib.h> +//#include <string.h> +//#include <iconv.h> +//#include <errno.h> +// +//typedef struct nameList nameList; +//struct nameList { +// int n; +// char **names; +// nameList *next; +//}; +// +//int +//addNames(unsigned int n, const char *const *names, void *data) { +// // we can't call back to Go because of the stack size issue, +// // so copy all the names. +// nameList *hd, *e; +// int i; +// +// hd = data; +// e = malloc(sizeof(nameList)); +// e->n = n; +// e->names = malloc(sizeof(char*) * n); +// for(i = 0; i < n; i++){ +// e->names[i] = strdup(names[i]); +// } +// e->next = hd->next; +// hd->next = e; +// return 0; +//} +// +//nameList * +//listNames(void) { +// nameList hd; +// hd.next = 0; +// iconvlist(addNames, &hd); +// return hd.next; +//} +import "C" + +import ( + "strings" + "sync" + "unsafe" +) + +var getAliasesOnce sync.Once +var allAliases = map[string][]string{} + +func aliases() map[string][]string { + getAliasesOnce.Do(getAliases) + return allAliases +} + +func getAliases() { + var next *C.nameList + for p := C.listNames(); p != nil; p = next { + next = p.next + aliases := make([]string, p.n) + pnames := (*[1e9]*C.char)(unsafe.Pointer(p.names)) + for i := range aliases { + aliases[i] = strings.ToLower(C.GoString(pnames[i])) + C.free(unsafe.Pointer(pnames[i])) + } + C.free(unsafe.Pointer(p.names)) + C.free(unsafe.Pointer(p)) + for _, alias := range aliases { + allAliases[alias] = aliases + } + } +} diff --git a/vendor/github.com/paulrosania/go-charset/charset/iconv/list_static.go b/vendor/github.com/paulrosania/go-charset/charset/iconv/list_static.go new file mode 100644 index 00000000..edf9e28a --- /dev/null +++ b/vendor/github.com/paulrosania/go-charset/charset/iconv/list_static.go @@ -0,0 +1,176 @@ +// +build linux + +// We just use a list of names obtained from iconv on a platform +// that allows iconvlist. 
We could invoke the iconv command, +// but that might fail too, and it gives no information about aliases. + +package iconv + +import ( + "sync" +) + +func aliases() map[string][]string { + initAliasesOnce.Do(initAliases) + return allAliases +} + +var initAliasesOnce sync.Once +var allAliases map[string][]string + +func initAliases() { + allAliases = make(map[string][]string) + for _, a := range aliasData { + for _, alias := range a { + allAliases[alias] = a + } + } +} + +var aliasData = [][]string{ + {"437", "cp437", "ibm437", "cspc8codepage437"}, + {"850", "cp850", "ibm850", "cspc850multilingual"}, + {"852", "cp852", "ibm852", "cspcp852"}, + {"855", "cp855", "ibm855", "csibm855"}, + {"857", "cp857", "ibm857", "csibm857"}, + {"860", "cp860", "ibm860", "csibm860"}, + {"861", "cp-is", "cp861", "ibm861", "csibm861"}, + {"862", "cp862", "ibm862", "cspc862latinhebrew"}, + {"863", "cp863", "ibm863", "csibm863"}, + {"865", "cp865", "ibm865", "csibm865"}, + {"866", "cp866", "ibm866", "csibm866"}, + {"869", "cp-gr", "cp869", "ibm869", "csibm869"}, + {"ansi-x3.4-1968", "ansi-x3.4-1986", "ascii", "cp367", "ibm367", "iso-ir-6", "iso646-us", "iso-646.irv:1991", "us", "us-ascii", "csascii"}, + {"arabic", "asmo-708", "ecma-114", "iso-8859-6", "iso-ir-127", "iso8859-6", "iso-8859-6", "iso-8859-6:1987", "csisolatinarabic"}, + {"armscii-8"}, + {"atari", "atarist"}, + {"big5-2003"}, + {"big-5", "big-five", "big5", "bigfive", "cn-big5", "csbig5"}, + {"big5-hkscs:1999"}, + {"big5-hkscs:2001"}, + {"big5-hkscs", "big5-hkscs:2004", "big5hkscs"}, + {"c99"}, + {"chinese", "gb-2312-80", "iso-ir-58", "csiso58gb231280"}, + {"cn", "gb-1988-80", "iso-ir-57", "iso646-cn", "csiso57gb1988"}, + {"cn-gb", "euc-cn", "euccn", "gb2312", "csgb2312"}, + {"cn-gb-isoir165", "iso-ir-165"}, + {"cp1046"}, + {"cp1124"}, + {"cp1125"}, + {"cp1129"}, + {"cp1131"}, + {"cp1133", "ibm-cp1133"}, + {"cp1161", "ibm-1161", "ibm1161", "csibm1161"}, + {"cp1162", "ibm-1162", "ibm1162", "csibm1162"}, + {"cp1163", 
"ibm-1163", "ibm1163", "csibm1163"}, + {"cp1250", "ms-ee", "windows-1250"}, + {"cp1251", "ms-cyrl", "windows-1251"}, + {"cp1252", "ms-ansi", "windows-1252"}, + {"cp1253", "ms-greek", "windows-1253"}, + {"cp1254", "ms-turk", "windows-1254"}, + {"cp1255", "ms-hebr", "windows-1255"}, + {"cp1256", "ms-arab", "windows-1256"}, + {"cp1257", "winbaltrim", "windows-1257"}, + {"cp1258", "windows-1258"}, + {"cp1361", "johab"}, + {"cp154", "cyrillic-asian", "pt154", "ptcp154", "csptcp154"}, + {"cp737"}, + {"cp775", "ibm775", "cspc775baltic"}, + {"cp819", "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso-8859-1", "iso-8859-1:1987", "l1", "latin1", "csisolatin1"}, + {"cp853"}, + {"cp856"}, + {"cp858"}, + {"cp864", "ibm864", "csibm864"}, + {"cp874", "windows-874"}, + {"cp922"}, + {"cp932"}, + {"cp936", "ms936", "windows-936"}, + {"cp943"}, + {"cp949", "uhc"}, + {"cp950"}, + {"cyrillic", "iso-8859-5", "iso-ir-144", "iso8859-5", "iso-8859-5", "iso-8859-5:1988", "csisolatincyrillic"}, + {"dec-hanyu"}, + {"dec-kanji"}, + {"ecma-118", "elot-928", "greek", "greek8", "iso-8859-7", "iso-ir-126", "iso8859-7", "iso-8859-7", "iso-8859-7:1987", "iso-8859-7:2003", "csisolatingreek"}, + {"euc-jis-2004", "euc-jisx0213"}, + {"euc-jp", "eucjp", "extended-unix-code-packed-format-for-japanese", "cseucpkdfmtjapanese"}, + {"euc-kr", "euckr", "cseuckr"}, + {"euc-tw", "euctw", "cseuctw"}, + {"gb18030"}, + {"gbk"}, + {"georgian-academy"}, + {"georgian-ps"}, + {"hebrew", "iso-8859-8", "iso-ir-138", "iso8859-8", "iso-8859-8", "iso-8859-8:1988", "csisolatinhebrew"}, + {"hp-roman8", "r8", "roman8", "cshproman8"}, + {"hz", "hz-gb-2312"}, + {"iso-10646-ucs-2", "ucs-2", "csunicode"}, + {"iso-10646-ucs-4", "ucs-4", "csucs4"}, + {"iso-2022-cn", "csiso2022cn"}, + {"iso-2022-cn-ext"}, + {"iso-2022-jp-1"}, + {"iso-2022-jp-2004", "iso-2022-jp-3"}, + {"iso-2022-jp-2", "csiso2022jp2"}, + {"iso-2022-jp", "csiso2022jp"}, + {"iso-2022-kr", "csiso2022kr"}, + {"iso-8859-10", "iso-ir-157", "iso8859-10", "iso-8859-10", 
"iso-8859-10:1992", "l6", "latin6", "csisolatin6"}, + {"iso-8859-11", "iso8859-11", "iso-8859-11"}, + {"iso-8859-13", "iso-ir-179", "iso8859-13", "iso-8859-13", "l7", "latin7"}, + {"iso-8859-14", "iso-celtic", "iso-ir-199", "iso8859-14", "iso-8859-14", "iso-8859-14:1998", "l8", "latin8"}, + {"iso-8859-15", "iso-ir-203", "iso8859-15", "iso-8859-15", "iso-8859-15:1998", "latin-9"}, + {"iso-8859-16", "iso-ir-226", "iso8859-16", "iso-8859-16", "iso-8859-16:2001", "l10", "latin10"}, + {"iso-8859-2", "iso-ir-101", "iso8859-2", "iso-8859-2", "iso-8859-2:1987", "l2", "latin2", "csisolatin2"}, + {"iso-8859-3", "iso-ir-109", "iso8859-3", "iso-8859-3", "iso-8859-3:1988", "l3", "latin3", "csisolatin3"}, + {"iso-8859-4", "iso-ir-110", "iso8859-4", "iso-8859-4", "iso-8859-4:1988", "l4", "latin4", "csisolatin4"}, + {"iso-8859-9", "iso-ir-148", "iso8859-9", "iso-8859-9", "iso-8859-9:1989", "l5", "latin5", "csisolatin5"}, + {"iso-ir-149", "korean", "ksc-5601", "ks-c-5601-1987", "ks-c-5601-1989", "csksc56011987"}, + {"iso-ir-14", "iso646-jp", "jis-c6220-1969-ro", "jp", "csiso14jisc6220ro"}, + {"iso-ir-159", "jis-x0212", "jis-x0212-1990", "jis-x0212.1990-0", "x0212", "csiso159jisx02121990"}, + {"iso-ir-166", "tis-620", "tis620", "tis620-0", "tis620.2529-1", "tis620.2533-0", "tis620.2533-1"}, + {"iso-ir-230", "tds565"}, + {"iso-ir-87", "jis0208", "jis-c6226-1983", "jis-x0208", "jis-x0208-1983", "jis-x0208-1990", "x0208", "csiso87jisx0208"}, + {"java"}, + {"jisx0201-1976", "jis-x0201", "x0201", "cshalfwidthkatakana"}, + {"koi8-r", "cskoi8r"}, + {"koi8-ru"}, + {"koi8-t"}, + {"koi8-u"}, + {"kz-1048", "rk1048", "strk1048-2002", "cskz1048"}, + {"macarabic"}, + {"maccentraleurope"}, + {"maccroatian"}, + {"maccyrillic"}, + {"macgreek"}, + {"machebrew"}, + {"maciceland"}, + {"mac", "macintosh", "macroman", "csmacintosh"}, + {"macromania"}, + {"macthai"}, + {"macturkish"}, + {"macukraine"}, + {"ms-kanji", "shift-jis", "shift-jis", "sjis", "csshiftjis"}, + {" MS-Windows", "Japanese", 
"(cp932)"}, + {"mulelao-1"}, + {"nextstep"}, + {"riscos-latin1"}, + {"shift-jis-2004", "shift-jisx0213"}, + {"tcvn", "tcvn-5712", "tcvn5712-1", "tcvn5712-1:1993"}, + {"ucs-2be", "unicode-1-1", "unicodebig", "csunicode11"}, + {"ucs-2-internal"}, + {"ucs-2le", "unicodelittle"}, + {"ucs-2-swapped"}, + {"ucs-4be"}, + {"ucs-4-internal"}, + {"ucs-4le"}, + {"ucs-4-swapped"}, + {"unicode-1-1-utf-7", "utf-7", "csunicode11utf7"}, + {"utf-16"}, + {"utf-16be"}, + {"utf-16le"}, + {"utf-32"}, + {"utf-32be"}, + {"utf-32le"}, + {"utf-8"}, + {"utf-8-mac", "utf8-mac"}, + {"viscii", "viscii1.1-1", "csviscii"}, + {"windows-31j", "cp932"}, +} diff --git a/vendor/github.com/paulrosania/go-charset/charset/local.go b/vendor/github.com/paulrosania/go-charset/charset/local.go new file mode 100644 index 00000000..9776b962 --- /dev/null +++ b/vendor/github.com/paulrosania/go-charset/charset/local.go @@ -0,0 +1,162 @@ +package charset + +import ( + "encoding/json" + "fmt" + "os" + "sync" +) + +var ( + readLocalCharsetsOnce sync.Once + localCharsets = make(map[string]*localCharset) +) + +type localCharset struct { + Charset + arg string + *class +} + +// A class of character sets. +// Each class can be instantiated with an argument specified in the config file. +// Many character sets can use a single class. +type class struct { + from, to func(arg string) (Translator, error) +} + +// The set of classes, indexed by class name. 
+var classes = make(map[string]*class) + +func registerClass(charset string, from, to func(arg string) (Translator, error)) { + classes[charset] = &class{from, to} +} + +type localFactory struct{} + +func (f localFactory) TranslatorFrom(name string) (Translator, error) { + f.init() + name = NormalizedName(name) + cs := localCharsets[name] + if cs == nil { + return nil, fmt.Errorf("character set %q not found", name) + } + if cs.from == nil { + return nil, fmt.Errorf("cannot translate from %q", name) + } + return cs.from(cs.arg) +} + +func (f localFactory) TranslatorTo(name string) (Translator, error) { + f.init() + name = NormalizedName(name) + cs := localCharsets[name] + if cs == nil { + return nil, fmt.Errorf("character set %q not found", name) + } + if cs.to == nil { + return nil, fmt.Errorf("cannot translate to %q", name) + } + return cs.to(cs.arg) +} + +func (f localFactory) Names() []string { + f.init() + var names []string + for name, cs := range localCharsets { + // add names only for non-aliases. + if localCharsets[cs.Name] == cs { + names = append(names, name) + } + } + return names +} + +func (f localFactory) Info(name string) *Charset { + f.init() + lcs := localCharsets[NormalizedName(name)] + if lcs == nil { + return nil + } + // copy the charset info so that callers can't mess with it. + cs := lcs.Charset + return &cs +} + +func (f localFactory) init() { + readLocalCharsetsOnce.Do(readLocalCharsets) +} + +// charsetEntry is the data structure for one entry in the JSON config file. +// If Alias is non-empty, it should be the canonical name of another +// character set; otherwise Class should be the name +// of an entry in classes, and Arg is the argument for +// instantiating it. +type charsetEntry struct { + Aliases []string + Desc string + Class string + Arg string +} + +// readCharsets reads the JSON config file. +// It's done once only, when first needed. 
+func readLocalCharsets() { + csdata, err := readFile("charsets.json") + if err != nil { + fmt.Fprintf(os.Stderr, "charset: cannot open \"charsets.json\": %v\n", err) + return + } + + var entries map[string]charsetEntry + err = json.Unmarshal(csdata, &entries) + if err != nil { + fmt.Fprintf(os.Stderr, "charset: cannot decode config file: %v\n", err) + } + for name, e := range entries { + class := classes[e.Class] + if class == nil { + continue + } + name = NormalizedName(name) + for i, a := range e.Aliases { + e.Aliases[i] = NormalizedName(a) + } + cs := &localCharset{ + Charset: Charset{ + Name: name, + Aliases: e.Aliases, + Desc: e.Desc, + NoFrom: class.from == nil, + NoTo: class.to == nil, + }, + arg: e.Arg, + class: class, + } + localCharsets[cs.Name] = cs + for _, a := range cs.Aliases { + localCharsets[a] = cs + } + } +} + +// A general cache store that local character set translators +// can use for persistent storage of data. +var ( + cacheMutex sync.Mutex + cacheStore = make(map[interface{}]interface{}) +) + +func cache(key interface{}, f func() (interface{}, error)) (interface{}, error) { + cacheMutex.Lock() + defer cacheMutex.Unlock() + if x := cacheStore[key]; x != nil { + return x, nil + } + x, err := f() + if err != nil { + return nil, err + } + cacheStore[key] = x + return x, err +} diff --git a/vendor/github.com/paulrosania/go-charset/charset/utf16.go b/vendor/github.com/paulrosania/go-charset/charset/utf16.go new file mode 100644 index 00000000..ebde794c --- /dev/null +++ b/vendor/github.com/paulrosania/go-charset/charset/utf16.go @@ -0,0 +1,110 @@ +package charset + +import ( + "encoding/binary" + "errors" + "unicode/utf8" +) + +func init() { + registerClass("utf16", fromUTF16, toUTF16) +} + +type translateFromUTF16 struct { + first bool + endian binary.ByteOrder + scratch []byte +} + +func (p *translateFromUTF16) Translate(data []byte, eof bool) (int, []byte, error) { + data = data[0 : len(data)&^1] // round to even number of bytes. 
+ if len(data) < 2 { + return 0, nil, nil + } + n := 0 + if p.first && p.endian == nil { + switch binary.BigEndian.Uint16(data) { + case 0xfeff: + p.endian = binary.BigEndian + data = data[2:] + n += 2 + case 0xfffe: + p.endian = binary.LittleEndian + data = data[2:] + n += 2 + default: + p.endian = guessEndian(data) + } + p.first = false + } + + p.scratch = p.scratch[:0] + for ; len(data) > 0; data = data[2:] { + p.scratch = appendRune(p.scratch, rune(p.endian.Uint16(data))) + n += 2 + } + return n, p.scratch, nil +} + +func guessEndian(data []byte) binary.ByteOrder { + // XXX TODO + return binary.LittleEndian +} + +type translateToUTF16 struct { + first bool + endian binary.ByteOrder + scratch []byte +} + +func (p *translateToUTF16) Translate(data []byte, eof bool) (int, []byte, error) { + p.scratch = ensureCap(p.scratch[:0], (len(data)+1)*2) + if p.first { + p.scratch = p.scratch[0:2] + p.endian.PutUint16(p.scratch, 0xfeff) + p.first = false + } + n := 0 + for len(data) > 0 { + if !utf8.FullRune(data) && !eof { + break + } + r, size := utf8.DecodeRune(data) + // TODO if r > 65535? 
+ + slen := len(p.scratch) + p.scratch = p.scratch[0 : slen+2] + p.endian.PutUint16(p.scratch[slen:], uint16(r)) + data = data[size:] + n += size + } + return n, p.scratch, nil +} + +func getEndian(arg string) (binary.ByteOrder, error) { + switch arg { + case "le": + return binary.LittleEndian, nil + case "be": + return binary.BigEndian, nil + case "": + return nil, nil + } + return nil, errors.New("charset: unknown utf16 endianness") +} + +func fromUTF16(arg string) (Translator, error) { + endian, err := getEndian(arg) + if err != nil { + return nil, err + } + return &translateFromUTF16{first: true, endian: endian}, nil +} + +func toUTF16(arg string) (Translator, error) { + endian, err := getEndian(arg) + if err != nil { + return nil, err + } + return &translateToUTF16{first: false, endian: endian}, nil +} diff --git a/vendor/github.com/paulrosania/go-charset/charset/utf8.go b/vendor/github.com/paulrosania/go-charset/charset/utf8.go new file mode 100644 index 00000000..23980b33 --- /dev/null +++ b/vendor/github.com/paulrosania/go-charset/charset/utf8.go @@ -0,0 +1,51 @@ +package charset + +import ( + "unicode/utf8" +) + +func init() { + registerClass("utf8", toUTF8, toUTF8) +} + +type translateToUTF8 struct { + scratch []byte +} + +var errorBytes = []byte(string(utf8.RuneError)) + +const errorRuneLen = len(string(utf8.RuneError)) + +func (p *translateToUTF8) Translate(data []byte, eof bool) (int, []byte, error) { + p.scratch = ensureCap(p.scratch, (len(data))*errorRuneLen) + buf := p.scratch[:0] + for i := 0; i < len(data); { + // fast path for ASCII + if b := data[i]; b < utf8.RuneSelf { + buf = append(buf, b) + i++ + continue + } + _, size := utf8.DecodeRune(data[i:]) + if size == 1 { + if !eof && !utf8.FullRune(data) { + // When DecodeRune has converted only a single + // byte, we know there must be some kind of error + // because we know the byte's not ASCII. 
+ // If we aren't at EOF, and it's an incomplete + // rune encoding, then we return to process + // the final bytes in a subsequent call. + return i, buf, nil + } + buf = append(buf, errorBytes...) + } else { + buf = append(buf, data[i:i+size]...) + } + i += size + } + return len(data), buf, nil +} + +func toUTF8(arg string) (Translator, error) { + return new(translateToUTF8), nil +} |