// RemoveRedundantSpace normalizes the whitespace of str in a single pass.
//
// Each run of whitespace (as classified by unicode.IsSpace) is collapsed into
// one ASCII space, and any whitespace that follows a Han character is dropped
// entirely. Whitespace that precedes a Han character is kept, and leading or
// trailing whitespace is collapsed but not trimmed.
func RemoveRedundantSpace(str string) string {
	var out bytes.Buffer

	// afterHan: the most recent non-space rune was a Han character.
	// afterSpace: the most recent rune written to out was a collapsed space.
	afterHan, afterSpace := false, false

	for _, ch := range str {
		switch {
		case !unicode.IsSpace(ch):
			// Ordinary rune: copy it through and remember whether it is Han so
			// that whitespace following it can be suppressed.
			out.WriteRune(ch)
			afterSpace = false
			afterHan = unicode.Is(unicode.Han, ch)
		case afterHan || afterSpace:
			// Redundant whitespace: skip it and leave both flags untouched, so
			// an entire run after a Han character is swallowed.
		default:
			// First whitespace of a run after a non-Han rune (or at the very
			// start); afterHan is necessarily false on this path.
			out.WriteRune(' ')
			afterSpace = true
		}
	}
	return out.String()
}