summaryrefslogtreecommitdiffstats
path: root/vendor/github.com/rivo/uniseg/grapheme.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/rivo/uniseg/grapheme.go')
-rw-r--r--vendor/github.com/rivo/uniseg/grapheme.go268
1 files changed, 268 insertions, 0 deletions
diff --git a/vendor/github.com/rivo/uniseg/grapheme.go b/vendor/github.com/rivo/uniseg/grapheme.go
new file mode 100644
index 00000000..207157f5
--- /dev/null
+++ b/vendor/github.com/rivo/uniseg/grapheme.go
@@ -0,0 +1,268 @@
+package uniseg
+
+import "unicode/utf8"
+
+// The states of the grapheme cluster parser.
+const (
+ grAny = iota
+ grCR
+ grControlLF
+ grL
+ grLVV
+ grLVTT
+ grPrepend
+ grExtendedPictographic
+ grExtendedPictographicZWJ
+ grRIOdd
+ grRIEven
+)
+
+// The grapheme cluster parser's breaking instructions.
+const (
+ grNoBoundary = iota
+ grBoundary
+)
+
+// The grapheme cluster parser's state transitions. Maps (state, property) to
+// (new state, breaking instruction, rule number). The breaking instruction
+// always refers to the boundary between the last and next code point.
+//
+// This map is queried as follows:
+//
+// 1. Find specific state + specific property. Stop if found.
+// 2. Find specific state + any property.
+// 3. Find any state + specific property.
+// 4. If only (2) or (3) (but not both) was found, stop.
+// 5. If both (2) and (3) were found, use state and breaking instruction from
+// the transition with the lower rule number, prefer (3) if rule numbers
+// are equal. Stop.
+// 6. Assume grAny and grBoundary.
+var grTransitions = map[[2]int][3]int{
+ // GB5
+ {grAny, prCR}: {grCR, grBoundary, 50},
+ {grAny, prLF}: {grControlLF, grBoundary, 50},
+ {grAny, prControl}: {grControlLF, grBoundary, 50},
+
+ // GB4
+ {grCR, prAny}: {grAny, grBoundary, 40},
+ {grControlLF, prAny}: {grAny, grBoundary, 40},
+
+ // GB3.
+ {grCR, prLF}: {grAny, grNoBoundary, 30},
+
+ // GB6.
+ {grAny, prL}: {grL, grBoundary, 9990},
+ {grL, prL}: {grL, grNoBoundary, 60},
+ {grL, prV}: {grLVV, grNoBoundary, 60},
+ {grL, prLV}: {grLVV, grNoBoundary, 60},
+ {grL, prLVT}: {grLVTT, grNoBoundary, 60},
+
+ // GB7.
+ {grAny, prLV}: {grLVV, grBoundary, 9990},
+ {grAny, prV}: {grLVV, grBoundary, 9990},
+ {grLVV, prV}: {grLVV, grNoBoundary, 70},
+ {grLVV, prT}: {grLVTT, grNoBoundary, 70},
+
+ // GB8.
+ {grAny, prLVT}: {grLVTT, grBoundary, 9990},
+ {grAny, prT}: {grLVTT, grBoundary, 9990},
+ {grLVTT, prT}: {grLVTT, grNoBoundary, 80},
+
+ // GB9.
+ {grAny, prExtend}: {grAny, grNoBoundary, 90},
+ {grAny, prZWJ}: {grAny, grNoBoundary, 90},
+
+ // GB9a.
+ {grAny, prSpacingMark}: {grAny, grNoBoundary, 91},
+
+ // GB9b.
+ {grAny, prPreprend}: {grPrepend, grBoundary, 9990},
+ {grPrepend, prAny}: {grAny, grNoBoundary, 92},
+
+ // GB11.
+ {grAny, prExtendedPictographic}: {grExtendedPictographic, grBoundary, 9990},
+ {grExtendedPictographic, prExtend}: {grExtendedPictographic, grNoBoundary, 110},
+ {grExtendedPictographic, prZWJ}: {grExtendedPictographicZWJ, grNoBoundary, 110},
+ {grExtendedPictographicZWJ, prExtendedPictographic}: {grExtendedPictographic, grNoBoundary, 110},
+
+ // GB12 / GB13.
+ {grAny, prRegionalIndicator}: {grRIOdd, grBoundary, 9990},
+ {grRIOdd, prRegionalIndicator}: {grRIEven, grNoBoundary, 120},
+ {grRIEven, prRegionalIndicator}: {grRIOdd, grBoundary, 120},
+}
+
+// Graphemes implements an iterator over Unicode extended grapheme clusters,
+// specified in the Unicode Standard Annex #29. Grapheme clusters correspond to
+// "user-perceived characters". These characters often consist of multiple
+// code points (e.g. the "woman kissing woman" emoji consists of 8 code points:
+// woman + ZWJ + heavy black heart (2 code points) + ZWJ + kiss mark + ZWJ +
+// woman) and the rules described in Annex #29 must be applied to group those
+// code points into clusters perceived by the user as one character.
+type Graphemes struct {
+ // The code points over which this class iterates.
+ codePoints []rune
+
+ // The (byte-based) indices of the code points into the original string plus
+ // len(original string). Thus, len(indices) = len(codePoints) + 1.
+ indices []int
+
+ // The current grapheme cluster to be returned. These are indices into
+ // codePoints/indices. If start == end, we either haven't started iterating
+ // yet (0) or the iteration has already completed (1).
+ start, end int
+
+ // The index of the next code point to be parsed.
+ pos int
+
+ // The current state of the code point parser.
+ state int
+}
+
+// NewGraphemes returns a new grapheme cluster iterator.
+func NewGraphemes(s string) *Graphemes {
+ l := utf8.RuneCountInString(s)
+ codePoints := make([]rune, l)
+ indices := make([]int, l+1)
+ i := 0
+ for pos, r := range s {
+ codePoints[i] = r
+ indices[i] = pos
+ i++
+ }
+ indices[l] = len(s)
+ g := &Graphemes{
+ codePoints: codePoints,
+ indices: indices,
+ }
+ g.Next() // Parse ahead.
+ return g
+}
+
+// Next advances the iterator by one grapheme cluster and returns false if no
+// clusters are left. This function must be called before the first cluster is
+// accessed.
+func (g *Graphemes) Next() bool {
+ g.start = g.end
+
+ // The state transition gives us a boundary instruction BEFORE the next code
+ // point so we always need to stay ahead by one code point.
+
+ // Parse the next code point.
+ for g.pos <= len(g.codePoints) {
+ // GB2.
+ if g.pos == len(g.codePoints) {
+ g.end = g.pos
+ g.pos++
+ break
+ }
+
+ // Determine the property of the next character.
+ nextProperty := property(g.codePoints[g.pos])
+ g.pos++
+
+ // Find the applicable transition.
+ var boundary bool
+ transition, ok := grTransitions[[2]int{g.state, nextProperty}]
+ if ok {
+ // We have a specific transition. We'll use it.
+ g.state = transition[0]
+ boundary = transition[1] == grBoundary
+ } else {
+ // No specific transition found. Try the less specific ones.
+ transAnyProp, okAnyProp := grTransitions[[2]int{g.state, prAny}]
+ transAnyState, okAnyState := grTransitions[[2]int{grAny, nextProperty}]
+ if okAnyProp && okAnyState {
+ // Both apply. We'll use a mix (see comments for grTransitions).
+ g.state = transAnyState[0]
+ boundary = transAnyState[1] == grBoundary
+ if transAnyProp[2] < transAnyState[2] {
+ g.state = transAnyProp[0]
+ boundary = transAnyProp[1] == grBoundary
+ }
+ } else if okAnyProp {
+ // We only have a specific state.
+ g.state = transAnyProp[0]
+ boundary = transAnyProp[1] == grBoundary
+ // This branch will probably never be reached because okAnyState will
+ // always be true given the current transition map. But we keep it here
+ // for future modifications to the transition map where this may not be
+ // true anymore.
+ } else if okAnyState {
+ // We only have a specific property.
+ g.state = transAnyState[0]
+ boundary = transAnyState[1] == grBoundary
+ } else {
+ // No known transition. GB999: Any x Any.
+ g.state = grAny
+ boundary = true
+ }
+ }
+
+ // If we found a cluster boundary, let's stop here. The current cluster will
+ // be the one that just ended.
+ if g.pos-1 == 0 /* GB1 */ || boundary {
+ g.end = g.pos - 1
+ break
+ }
+ }
+
+ return g.start != g.end
+}
+
+// Runes returns a slice of runes (code points) which corresponds to the current
+// grapheme cluster. If the iterator is already past the end or Next() has not
+// yet been called, nil is returned.
+func (g *Graphemes) Runes() []rune {
+ if g.start == g.end {
+ return nil
+ }
+ return g.codePoints[g.start:g.end]
+}
+
+// Str returns a substring of the original string which corresponds to the
+// current grapheme cluster. If the iterator is already past the end or Next()
+// has not yet been called, an empty string is returned.
+func (g *Graphemes) Str() string {
+ if g.start == g.end {
+ return ""
+ }
+ return string(g.codePoints[g.start:g.end])
+}
+
+// Bytes returns a byte slice which corresponds to the current grapheme cluster.
+// If the iterator is already past the end or Next() has not yet been called,
+// nil is returned.
+func (g *Graphemes) Bytes() []byte {
+ if g.start == g.end {
+ return nil
+ }
+ return []byte(string(g.codePoints[g.start:g.end]))
+}
+
+// Positions returns the interval of the current grapheme cluster as byte
+// positions into the original string. The first returned value "from" indexes
+// the first byte and the second returned value "to" indexes the first byte that
+// is not included anymore, i.e. str[from:to] is the current grapheme cluster of
+// the original string "str". If Next() has not yet been called, both values are
+// 0. If the iterator is already past the end, both values are 1.
+func (g *Graphemes) Positions() (int, int) {
+ return g.indices[g.start], g.indices[g.end]
+}
+
+// Reset puts the iterator into its initial state such that the next call to
+// Next() sets it to the first grapheme cluster again.
+func (g *Graphemes) Reset() {
+ g.start, g.end, g.pos, g.state = 0, 0, 0, grAny
+ g.Next() // Parse ahead again.
+}
+
+// GraphemeClusterCount returns the number of user-perceived characters
+// (grapheme clusters) for the given string. To calculate this number, it
+// iterates through the string using the Graphemes iterator.
+func GraphemeClusterCount(s string) (n int) {
+ g := NewGraphemes(s)
+ for g.Next() {
+ n++
+ }
+ return
+}