// Source: vendor/github.com/paulrosania/go-charset/charset/utf8.go
// Blob: 23980b334d96b277e601a0d17dfc2be870466998

package charset

import (
	"unicode/utf8"
)

// init registers the "utf8" class, passing toUTF8 as both constructor
// arguments.
// NOTE(review): presumably the two arguments are the "from" and "to"
// translator factories, so UTF-8 is treated the same in either direction —
// confirm against registerClass.
func init() {
	registerClass("utf8", toUTF8, toUTF8)
}

// translateToUTF8 holds the state of the UTF-8 sanitising translator
// implemented by its Translate method.
type translateToUTF8 struct {
	// scratch is the output buffer that Translate resizes and reuses on
	// every call.
	scratch []byte
}

// errorBytes is the UTF-8 encoding of utf8.RuneError, written to the output
// in place of each input byte that cannot be decoded.
var errorBytes = []byte(string(utf8.RuneError))

// errorRuneLen is the length in bytes of the encoded utf8.RuneError. Translate
// uses it to size its output buffer for the case where every input byte is
// replaced.
const errorRuneLen = len(string(utf8.RuneError))

// Translate copies data to the output, passing valid UTF-8 through unchanged
// and replacing every byte that is not part of a valid encoding with the
// encoding of utf8.RuneError.
//
// It returns the number of bytes of data that were consumed, the translated
// bytes, and an error that is always nil. The output is built on top of
// p.scratch, so the returned slice may share storage with it and be
// overwritten by the next call to Translate.
//
// If eof is false and the input ends in the middle of a multi-byte rune,
// translation stops just before that partial rune and the returned count
// excludes it, so the caller can present those bytes again together with the
// data that follows. If eof is true the partial rune is replaced like any
// other invalid input.
func (p *translateToUTF8) Translate(data []byte, eof bool) (int, []byte, error) {
	// Worst case: every input byte becomes one replacement character.
	p.scratch = ensureCap(p.scratch, len(data)*errorRuneLen)
	buf := p.scratch[:0]
	for i := 0; i < len(data); {
		// Fast path for ASCII.
		if b := data[i]; b < utf8.RuneSelf {
			buf = append(buf, b)
			i++
			continue
		}
		_, size := utf8.DecodeRune(data[i:])
		if size == 1 {
			// A one-byte result for a non-ASCII byte always signals a
			// decoding error. Check the bytes from the current position
			// (not from the start of data) to tell a truncated rune, which
			// may be completed by a later call, from genuinely invalid
			// input.
			if !eof && !utf8.FullRune(data[i:]) {
				return i, buf, nil
			}
			buf = append(buf, errorBytes...)
		} else {
			buf = append(buf, data[i:i+size]...)
		}
		i += size
	}
	return len(data), buf, nil
}

// toUTF8 builds a fresh UTF-8 translator with an empty scratch buffer.
// The arg parameter is ignored, and the returned error is always nil.
func toUTF8(arg string) (Translator, error) {
	translator := &translateToUTF8{}
	return translator, nil
}