summaryrefslogblamecommitdiffstats
path: root/vendor/golang.org/x/text/runes/runes.go
blob: 71933696f592a5501e1a3572507a857a830cd1fb (plain) (tree)

































































































































































































































































































































































                                                                                              
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package runes provide transforms for UTF-8 encoded text.
package runes // import "golang.org/x/text/runes"

import (
	"unicode"
	"unicode/utf8"

	"golang.org/x/text/transform"
)

// A Set is a collection of runes.
type Set interface {
	// Contains returns true if r is contained in the set.
	Contains(r rune) bool
}

type setFunc func(rune) bool

func (s setFunc) Contains(r rune) bool {
	return s(r)
}

// Note: using funcs here instead of wrapping types result in cleaner
// documentation and a smaller API.

// In creates a Set with a Contains method that returns true for all runes in
// the given RangeTable.
func In(rt *unicode.RangeTable) Set {
	return setFunc(func(r rune) bool { return unicode.Is(rt, r) })
}

// In creates a Set with a Contains method that returns true for all runes not
// in the given RangeTable.
func NotIn(rt *unicode.RangeTable) Set {
	return setFunc(func(r rune) bool { return !unicode.Is(rt, r) })
}

// Predicate creates a Set with a Contains method that returns f(r).
func Predicate(f func(rune) bool) Set {
	return setFunc(f)
}

// Transformer implements the transform.Transformer interface.
type Transformer struct {
	t transform.SpanningTransformer
}

func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	return t.t.Transform(dst, src, atEOF)
}

func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) {
	return t.t.Span(b, atEOF)
}

func (t Transformer) Reset() { t.t.Reset() }

// Bytes returns a new byte slice with the result of converting b using t.  It
// calls Reset on t. It returns nil if any error was found. This can only happen
// if an error-producing Transformer is passed to If.
func (t Transformer) Bytes(b []byte) []byte {
	b, _, err := transform.Bytes(t, b)
	if err != nil {
		return nil
	}
	return b
}

// String returns a string with the result of converting s using t. It calls
// Reset on t. It returns the empty string if any error was found. This can only
// happen if an error-producing Transformer is passed to If.
func (t Transformer) String(s string) string {
	s, _, err := transform.String(t, s)
	if err != nil {
		return ""
	}
	return s
}

// TODO:
// - Copy: copying strings and bytes in whole-rune units.
// - Validation (maybe)
// - Well-formed-ness (maybe)

const runeErrorString = string(utf8.RuneError)

// Remove returns a Transformer that removes runes r for which s.Contains(r).
// Illegal input bytes are replaced by RuneError before being passed to f.
func Remove(s Set) Transformer {
	if f, ok := s.(setFunc); ok {
		// This little trick cuts the running time of BenchmarkRemove for sets
		// created by Predicate roughly in half.
		// TODO: special-case RangeTables as well.
		return Transformer{remove(f)}
	}
	return Transformer{remove(s.Contains)}
}

// TODO: remove transform.RemoveFunc.

type remove func(r rune) bool

func (remove) Reset() {}

// Span implements transform.Spanner.
func (t remove) Span(src []byte, atEOF bool) (n int, err error) {
	for r, size := rune(0), 0; n < len(src); {
		if r = rune(src[n]); r < utf8.RuneSelf {
			size = 1
		} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
			// Invalid rune.
			if !atEOF && !utf8.FullRune(src[n:]) {
				err = transform.ErrShortSrc
			} else {
				err = transform.ErrEndOfSpan
			}
			break
		}
		if t(r) {
			err = transform.ErrEndOfSpan
			break
		}
		n += size
	}
	return
}

// Transform implements transform.Transformer.
func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	for r, size := rune(0), 0; nSrc < len(src); {
		if r = rune(src[nSrc]); r < utf8.RuneSelf {
			size = 1
		} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
			// Invalid rune.
			if !atEOF && !utf8.FullRune(src[nSrc:]) {
				err = transform.ErrShortSrc
				break
			}
			// We replace illegal bytes with RuneError. Not doing so might
			// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
			// The resulting byte sequence may subsequently contain runes
			// for which t(r) is true that were passed unnoticed.
			if !t(utf8.RuneError) {
				if nDst+3 > len(dst) {
					err = transform.ErrShortDst
					break
				}
				dst[nDst+0] = runeErrorString[0]
				dst[nDst+1] = runeErrorString[1]
				dst[nDst+2] = runeErrorString[2]
				nDst += 3
			}
			nSrc++
			continue
		}
		if t(r) {
			nSrc += size
			continue
		}
		if nDst+size > len(dst) {
			err = transform.ErrShortDst
			break
		}
		for i := 0; i < size; i++ {
			dst[nDst] = src[nSrc]
			nDst++
			nSrc++
		}
	}
	return
}

// Map returns a Transformer that maps the runes in the input using the given
// mapping. Illegal bytes in the input are converted to utf8.RuneError before
// being passed to the mapping func.
func Map(mapping func(rune) rune) Transformer {
	return Transformer{mapper(mapping)}
}

type mapper func(rune) rune

func (mapper) Reset() {}

// Span implements transform.Spanner.
func (t mapper) Span(src []byte, atEOF bool) (n int, err error) {
	for r, size := rune(0), 0; n < len(src); n += size {
		if r = rune(src[n]); r < utf8.RuneSelf {
			size = 1
		} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
			// Invalid rune.
			if !atEOF && !utf8.FullRune(src[n:]) {
				err = transform.ErrShortSrc
			} else {
				err = transform.ErrEndOfSpan
			}
			break
		}
		if t(r) != r {
			err = transform.ErrEndOfSpan
			break
		}
	}
	return n, err
}

// Transform implements transform.Transformer.
func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	var replacement rune
	var b [utf8.UTFMax]byte

	for r, size := rune(0), 0; nSrc < len(src); {
		if r = rune(src[nSrc]); r < utf8.RuneSelf {
			if replacement = t(r); replacement < utf8.RuneSelf {
				if nDst == len(dst) {
					err = transform.ErrShortDst
					break
				}
				dst[nDst] = byte(replacement)
				nDst++
				nSrc++
				continue
			}
			size = 1
		} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
			// Invalid rune.
			if !atEOF && !utf8.FullRune(src[nSrc:]) {
				err = transform.ErrShortSrc
				break
			}

			if replacement = t(utf8.RuneError); replacement == utf8.RuneError {
				if nDst+3 > len(dst) {
					err = transform.ErrShortDst
					break
				}
				dst[nDst+0] = runeErrorString[0]
				dst[nDst+1] = runeErrorString[1]
				dst[nDst+2] = runeErrorString[2]
				nDst += 3
				nSrc++
				continue
			}
		} else if replacement = t(r); replacement == r {
			if nDst+size > len(dst) {
				err = transform.ErrShortDst
				break
			}
			for i := 0; i < size; i++ {
				dst[nDst] = src[nSrc]
				nDst++
				nSrc++
			}
			continue
		}

		n := utf8.EncodeRune(b[:], replacement)

		if nDst+n > len(dst) {
			err = transform.ErrShortDst
			break
		}
		for i := 0; i < n; i++ {
			dst[nDst] = b[i]
			nDst++
		}
		nSrc += size
	}
	return
}

// ReplaceIllFormed returns a transformer that replaces all input bytes that are
// not part of a well-formed UTF-8 code sequence with utf8.RuneError.
func ReplaceIllFormed() Transformer {
	return Transformer{&replaceIllFormed{}}
}

type replaceIllFormed struct{ transform.NopResetter }

func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) {
	for n < len(src) {
		// ASCII fast path.
		if src[n] < utf8.RuneSelf {
			n++
			continue
		}

		r, size := utf8.DecodeRune(src[n:])

		// Look for a valid non-ASCII rune.
		if r != utf8.RuneError || size != 1 {
			n += size
			continue
		}

		// Look for short source data.
		if !atEOF && !utf8.FullRune(src[n:]) {
			err = transform.ErrShortSrc
			break
		}

		// We have an invalid rune.
		err = transform.ErrEndOfSpan
		break
	}
	return n, err
}

func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	for nSrc < len(src) {
		// ASCII fast path.
		if r := src[nSrc]; r < utf8.RuneSelf {
			if nDst == len(dst) {
				err = transform.ErrShortDst
				break
			}
			dst[nDst] = r
			nDst++
			nSrc++
			continue
		}

		// Look for a valid non-ASCII rune.
		if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 {
			if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {
				err = transform.ErrShortDst
				break
			}
			nDst += size
			nSrc += size
			continue
		}

		// Look for short source data.
		if !atEOF && !utf8.FullRune(src[nSrc:]) {
			err = transform.ErrShortSrc
			break
		}

		// We have an invalid rune.
		if nDst+3 > len(dst) {
			err = transform.ErrShortDst
			break
		}
		dst[nDst+0] = runeErrorString[0]
		dst[nDst+1] = runeErrorString[1]
		dst[nDst+2] = runeErrorString[2]
		nDst += 3
		nSrc++
	}
	return nDst, nSrc, err
}