Files
go-utils/stringcase/split.go
2025-06-23 23:26:59 +08:00

145 lines
3.7 KiB
Go

package stringcase
import (
"strings"
"unicode"
"unicode/utf8"
)
// runeInfo wraps a single rune so classification helpers can be attached to it.
type runeInfo struct {
	r rune // The wrapped rune.
}

// isDigit reports whether the wrapped rune is a decimal digit, as decided by unicode.IsDigit.
func (ri *runeInfo) isDigit() bool {
	wrapped := ri.r
	return unicode.IsDigit(wrapped)
}

// isUppercase reports whether the wrapped rune is an uppercase letter, as decided by unicode.IsUpper.
func (ri *runeInfo) isUppercase() bool {
	wrapped := ri.r
	return unicode.IsUpper(wrapped)
}
// rdr is a reader designed for reading "CamelCase" strings one rune at a time, keeping a single rune of lookahead.
// Positions are byte offsets into input, but reading and un-reading always move by whole UTF-8 encoded runes, so
// multi-byte characters are classified as one rune and never cut in half.
type rdr struct {
	input       string   // The data this reader operates on.
	pos         int      // Byte offset into input of the next rune to read.
	hasNextRune bool     // Whether a rune is available at pos.
	rdRune      runeInfo // The rune immediately before pos (the last one read).
	nxtRune     runeInfo // The rune at pos; only meaningful while hasNextRune is true.
}

// readRune consumes the rune at r.pos: it becomes rdRune, pos advances by the rune's encoded width and the
// lookahead (hasNextRune, nxtRune) is refreshed.
// When the whole input has already been consumed nothing is read and hasNextRune is cleared.
func (r *rdr) readRune() {
	if r.pos >= len(r.input) {
		r.hasNextRune = false
		return
	}

	// Decode instead of indexing: input[pos] is a byte, and a lead byte such as 0xC3 would otherwise be
	// mistaken for the (uppercase) rune U+00C3.
	cur, width := utf8.DecodeRuneInString(r.input[r.pos:])
	r.rdRune = runeInfo{r: cur}
	r.pos += width

	r.hasNextRune = r.pos < len(r.input)
	if r.hasNextRune {
		next, _ := utf8.DecodeRuneInString(r.input[r.pos:])
		r.nxtRune = runeInfo{r: next}
	}
}

// unreadRune undoes the last readRune: pos moves back by the width of the rune that precedes it, that rune
// becomes the lookahead again and rdRune is restored to the rune before it (the zero runeInfo at the start of
// the input).
// It is a no-op when nothing has been read yet.
func (r *rdr) unreadRune() {
	if r.pos == 0 {
		return
	}

	undone, width := utf8.DecodeLastRuneInString(r.input[:r.pos])
	r.pos -= width
	r.nxtRune = runeInfo{r: undone}
	r.hasNextRune = true // NOTE: An undo operation means that there will always be a next rune.

	r.rdRune = runeInfo{}
	if r.pos > 0 {
		prev, _ := utf8.DecodeLastRuneInString(r.input[:r.pos])
		r.rdRune = runeInfo{r: prev}
	}
}

// isNoSplitWord verifies if reading the next rune would keep r inside a word that should NOT be split.
// The candidate is the text read since byte offset sIdx extended by the lookahead rune; the function returns true
// when ContainsFn reports an entry of noSplit that starts with that candidate, false otherwise.
// Without a lookahead rune there is nothing to extend the word with, so false is returned.
func (r *rdr) isNoSplitWord(sIdx int, noSplit []string) bool {
	if r.pos >= len(r.input) {
		return false
	}

	// Extend the candidate by the full encoded width of the lookahead rune, not by a single byte.
	_, width := utf8.DecodeRuneInString(r.input[r.pos:])
	candidate := r.input[sIdx : r.pos+width]

	return ContainsFn(noSplit, candidate, func(got, want string) bool {
		return strings.HasPrefix(got, want)
	})
}
// readNextPart consumes one rune from r and then lets that rune decide how the rest of the part is read:
// readNumber when it is a digit, readWord otherwise. The part returned starts at the position r had on entry.
// Each word in noSplit (if provided) is treated as a word that shouldn't be split.
func (r *rdr) readNextPart(noSplit []string) string {
	start := r.pos
	r.readRune()

	readRest := r.readWord
	if r.rdRune.isDigit() {
		readRest = r.readNumber
	}
	return readRest(start, noSplit)
}
// readNumber reads and returns a number from r. The first digit has already been consumed by the caller and sits
// at byte offset sIdx.
// Reading continues while the next rune is a digit, or while isNoSplitWord reports that the text read so far plus
// the next rune is still the start of a word in noSplit. The no-split check applies from the very first lookahead,
// so a no-split word made of one digit followed by a non-digit (e.g. "3D") is kept together just like one that
// starts with several digits.
func (r *rdr) readNumber(sIdx int, noSplit []string) string {
	for r.hasNextRune && (r.nxtRune.isDigit() || r.isNoSplitWord(sIdx, noSplit)) {
		r.readRune()
	}
	return r.input[sIdx:r.pos]
}
// readWord reads and returns a word from r. The first rune has already been consumed by the caller and sits at
// byte offset sIdx.
//
// Two shapes are recognised, chosen by the rune that follows the first one:
//   - anything but uppercase (or no rune at all): runes are consumed while they are neither uppercase nor a digit;
//   - uppercase: the run of uppercase runes is consumed; when that run is followed by a rune that is neither
//     uppercase nor a digit, the last rune of the run is handed back because it starts the next word.
//
// In both shapes a rune is also consumed whenever isNoSplitWord reports that doing so stays inside a word of noSplit.
func (r *rdr) readWord(sIdx int, noSplit []string) string {
	if !r.hasNextRune || !r.nxtRune.isUppercase() {
		for r.hasNextRune {
			wordRune := !r.nxtRune.isUppercase() && !r.nxtRune.isDigit()
			if !wordRune && !r.isNoSplitWord(sIdx, noSplit) {
				break
			}
			r.readRune()
		}
		return r.input[sIdx:r.pos]
	}

	for r.hasNextRune {
		if !r.nxtRune.isUppercase() && !r.isNoSplitWord(sIdx, noSplit) {
			break
		}
		r.readRune()
	}

	// The uppercase rune read last belongs to the word that follows, unless the run ends the input or is
	// followed by a digit.
	if r.hasNextRune && !(r.nxtRune.isUppercase() || r.nxtRune.isDigit()) {
		r.unreadRune()
	}
	return r.input[sIdx:r.pos]
}
// Split reads input treating it as a "CamelCase" string and returns the different words.
// input is first cut into chunks by SplitByNonAlphanumeric; every chunk is trimmed of surrounding whitespace,
// dropped when nothing is left, and otherwise broken into words by split.
// If input isn't a valid UTF-8 string, or when input is an empty string, a slice with one element (input) is
// returned. When no chunk survives, the result is an empty, non-nil slice.
// Each word in noSplit (if provided) is treated as a word that shouldn't be split.
func Split(input string, noSplit ...string) []string {
	if input == "" || !utf8.ValidString(input) {
		return []string{input}
	}

	words := []string{}
	for _, chunk := range SplitByNonAlphanumeric(input) {
		if chunk = strings.TrimSpace(chunk); chunk != "" {
			words = append(words, split(chunk, noSplit...)...)
		}
	}
	return words
}
// split breaks a single chunk into its parts by repeatedly calling readNextPart on a fresh rdr until the reader's
// position reaches the end of input.
// If input isn't a valid UTF-8 string, or when input is an empty string, a slice with one element (input) is
// returned.
// Each word in noSplit (if provided) is treated as a word that shouldn't be split.
func split(input string, noSplit ...string) []string {
	if input == "" || !utf8.ValidString(input) {
		return []string{input}
	}

	parts := []string{}
	reader := rdr{input: input}
	for reader.pos < len(input) {
		parts = append(parts, reader.readNextPart(noSplit))
	}
	return parts
}