package common

import (
	"strings"
)

// FindStop reports whether sequence contains any of the given stop strings.
//
// Stops are checked in the order they appear in stops, and the first one
// found anywhere in sequence is returned together with true. If none is
// found, FindStop returns false and the empty string.
//
// Empty stop strings are ignored: every string contains "", so treating it
// as a match would report a stop for every sequence.
func FindStop(sequence string, stops []string) (bool, string) {
	for _, stop := range stops {
		if stop == "" {
			continue
		}
		if strings.Contains(sequence, stop) {
			return true, stop
		}
	}

	return false, ""
}

// ContainsStopSuffix reports whether sequence ends with a non-empty prefix
// of any string in stops (the complete stop string counts as a prefix of
// itself). In other words, it is true when the tail of sequence matches the
// beginning of some stop string.
//
// Prefixes are taken byte-wise. Empty stop strings never match because they
// have no non-empty prefix.
func ContainsStopSuffix(sequence string, stops []string) bool {
	for _, stop := range stops {
		// Try stop[:1], stop[:2], ..., up to the whole stop string.
		for i := 1; i <= len(stop); i++ {
			if strings.HasSuffix(sequence, stop[:i]) {
				return true
			}
		}
	}

	return false
}

// TruncateStop removes the first occurrence of stop, and everything after
// it, from the concatenation of pieces.
//
// The text that remains is split back along the original piece boundaries.
// Pieces that lie entirely after the cut are dropped, and the piece that
// straddles the cut is shortened. The boolean result is true when such a
// piece had to be shortened, i.e. the cut fell inside a piece rather than
// on a boundary.
//
// If stop is empty or does not occur, pieces is returned as-is (the same
// slice, not a copy) together with false. If nothing remains before stop,
// the returned slice is nil.
func TruncateStop(pieces []string, stop string) ([]string, bool) {
	// strings.Index reports 0 for an empty needle, which would throw away
	// every piece; there is nothing to remove in that case.
	if stop == "" {
		return pieces, false
	}

	joined := strings.Join(pieces, "")

	index := strings.Index(joined, stop)
	if index == -1 {
		return pieces, false
	}

	joined = joined[:index]

	// Split the truncated string back into pieces of the original lengths.
	var result []string
	tokenTruncated := false
	start := 0
	for _, piece := range pieces {
		if start >= len(joined) {
			// Everything before the stop has been consumed.
			break
		}

		end := start + len(piece)
		if end > len(joined) {
			// This piece extends past the cut; keep only its leading part.
			end = len(joined)
			tokenTruncated = true
		}
		result = append(result, joined[start:end])
		start = end
	}

	return result, tokenTruncated
}

// IncompleteUnicode reports whether token ends with the beginning of a
// multi-byte UTF-8 character whose remaining bytes are missing.
//
// It scans backwards over at most the last four bytes of token, skipping
// continuation bytes, until it reaches a byte that is not a continuation
// byte. If that byte is the lead byte of a 2-, 3- or 4-byte character, the
// result is true when fewer bytes than the lead byte announces are present
// from there to the end of token. In every other case (empty token,
// single-byte character, invalid byte, or only continuation bytes in the
// inspected window) the result is false.
func IncompleteUnicode(token string) bool {
	// i is the distance from the end of token, so it is also the number of
	// bytes available for a character that starts at that position.
	for i := 1; i < 5 && i <= len(token); i++ {
		c := token[len(token)-i]

		switch {
		case c&0xc0 == 0x80:
			// continuation byte: 10xxxxxx; keep looking for the lead byte
			continue
		case c&0xe0 == 0xc0:
			// 2-byte character: 110xxxxx ...
			return i < 2
		case c&0xf0 == 0xe0:
			// 3-byte character: 1110xxxx ...
			return i < 3
		case c&0xf8 == 0xf0:
			// 4-byte character: 11110xxx ...
			return i < 4
		default:
			// 1-byte character or invalid byte
			return false
		}
	}

	return false
}