🎨 改进图片 OCR 提取文本结果中的多余字符 Fix https://github.com/siyuan-note/siyuan/issues/7109

2025-12-16 14:40:12 +01:00 · 2023-01-18 11:46:51 +08:00 · 2023-01-18 11:46:51 +08:00 · 519f015498
commit 519f015498
parent 2ffec98b71
2 changed files with 30 additions and 3 deletions
--- a/kernel/util/string.go
+++ b/kernel/util/string.go
@ -17,7 +17,9 @@
 package util

 import (
+	"bytes"
 	"strings"
+	"unicode"

 	"github.com/88250/lute/html"
 )
@ -39,3 +41,30 @@ func Reverse(s string) string {
 	}
 	return string(runes)
 }
+
+func RemoveRedundantSpace(str string) string { // RemoveRedundantSpace collapses every whitespace run in str to one ' ' and drops whitespace that directly follows a Han character.
+	buf := bytes.Buffer{}
+	lastIsChinese := false // true while the most recently written rune is a Han character
+	lastIsSpace := false // true while the most recently written rune is the ' ' emitted for the current whitespace run
+	for _, r := range str {
+		if unicode.IsSpace(r) {
+			if lastIsChinese || lastIsSpace { // skip whitespace after Han text and repeats within a run; whitespace *before* a Han rune is kept
+				continue
+			}
+			buf.WriteRune(' ') // any whitespace rune (tab, newline, ...) is normalized to a plain space
+			lastIsChinese = false
+			lastIsSpace = true
+			continue
+		}
+
+		lastIsSpace = false
+		buf.WriteRune(r) // non-whitespace runes are copied through unchanged
+		if unicode.Is(unicode.Han, r) {
+			lastIsChinese = true
+			continue
+		} else {
+			lastIsChinese = false
+		}
+	}
+	return buf.String() // no trimming step: a leading run, or a trailing run after non-Han text, still yields one ' '
+}
--- a/kernel/util/tesseract.go
+++ b/kernel/util/tesseract.go
@ -23,7 +23,6 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
-	"regexp"
 	"strings"
 	"sync"
 	"time"
@ -90,8 +89,7 @@ func Tesseract(imgAbsPath string) string {

 	ret := string(output)
 	ret = gulu.Str.RemoveInvisible(ret)
-	reg := regexp.MustCompile("\\s{2,}")
-	ret = reg.ReplaceAllString(ret, " ")
+	ret = RemoveRedundantSpace(ret)
 	msg := fmt.Sprintf("OCR [%s] [%s]", info.Name(), ret)
 	PushStatusBar(msg)
 	return ret