From 519f0154986fb71ec5f99695177c68ec844dde8a Mon Sep 17 00:00:00 2001 From: Liang Ding Date: Wed, 18 Jan 2023 11:46:51 +0800 Subject: [PATCH] =?UTF-8?q?:art:=20=E6=94=B9=E8=BF=9B=E5=9B=BE=E7=89=87=20?= =?UTF-8?q?OCR=20=E6=8F=90=E5=8F=96=E6=96=87=E6=9C=AC=E7=BB=93=E6=9E=9C?= =?UTF-8?q?=E4=B8=AD=E7=9A=84=E5=A4=9A=E4=BD=99=E5=AD=97=E7=AC=A6=20Fix=20?= =?UTF-8?q?https://github.com/siyuan-note/siyuan/issues/7109?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- kernel/util/string.go | 29 +++++++++++++++++++++++++++++ kernel/util/tesseract.go | 4 +--- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/kernel/util/string.go b/kernel/util/string.go index d982cfcf6..569a4a0f4 100644 --- a/kernel/util/string.go +++ b/kernel/util/string.go @@ -17,7 +17,9 @@ package util import ( + "bytes" "strings" + "unicode" "github.com/88250/lute/html" ) @@ -39,3 +41,30 @@ func Reverse(s string) string { } return string(runes) } + +func RemoveRedundantSpace(str string) string { + buf := bytes.Buffer{} + lastIsChinese := false + lastIsSpace := false + for _, r := range str { + if unicode.IsSpace(r) { + if lastIsChinese || lastIsSpace { + continue + } + buf.WriteRune(' ') + lastIsChinese = false + lastIsSpace = true + continue + } + + lastIsSpace = false + buf.WriteRune(r) + if unicode.Is(unicode.Han, r) { + lastIsChinese = true + continue + } else { + lastIsChinese = false + } + } + return buf.String() +} diff --git a/kernel/util/tesseract.go b/kernel/util/tesseract.go index bc73ee200..04780bd83 100644 --- a/kernel/util/tesseract.go +++ b/kernel/util/tesseract.go @@ -23,7 +23,6 @@ import ( "os" "os/exec" "path/filepath" - "regexp" "strings" "sync" "time" @@ -90,8 +89,7 @@ func Tesseract(imgAbsPath string) string { ret := string(output) ret = gulu.Str.RemoveInvisible(ret) - reg := regexp.MustCompile("\\s{2,}") - ret = reg.ReplaceAllString(ret, " ") + ret = RemoveRedundantSpace(ret) msg := fmt.Sprintf("OCR [%s] [%s]", info.Name(), ret) PushStatusBar(msg) return ret