🎨 改进图片 OCR 提取文本结果中的多余字符 Fix https://github.com/siyuan-note/siyuan/issues/7109

This commit is contained in:
Liang Ding 2023-01-18 11:46:51 +08:00
parent 2ffec98b71
commit 519f015498
No known key found for this signature in database
GPG key ID: 136F30F901A2231D
2 changed files with 30 additions and 3 deletions

View file

@ -17,7 +17,9 @@
package util
import (
"bytes"
"strings"
"unicode"
"github.com/88250/lute/html"
)
@ -39,3 +41,30 @@ func Reverse(s string) string {
}
return string(runes)
}
// RemoveRedundantSpace returns a copy of str with its whitespace normalized.
//
// Every rune reported by unicode.IsSpace is treated as whitespace. A run of
// whitespace is emitted as a single ASCII space, except when the run directly
// follows a Han character, in which case the whole run is dropped. Whitespace
// that precedes a Han character is kept (as one space), and leading or
// trailing whitespace is collapsed but not trimmed.
func RemoveRedundantSpace(str string) string {
	var out bytes.Buffer
	prevHan, prevSpace := false, false
	for _, ch := range str {
		if !unicode.IsSpace(ch) {
			// Ordinary rune: always copied through; remember whether it is Han
			// so that whitespace following it can be suppressed.
			out.WriteRune(ch)
			prevHan = unicode.Is(unicode.Han, ch)
			prevSpace = false
			continue
		}

		// Whitespace after a Han character or after an already emitted space
		// is redundant. The flags are deliberately left untouched so the rest
		// of the whitespace run is skipped as well.
		if prevHan || prevSpace {
			continue
		}

		// First whitespace of a run: normalize it to a single ASCII space.
		// prevHan is necessarily false on this path.
		out.WriteRune(' ')
		prevSpace = true
	}
	return out.String()
}

View file

@ -23,7 +23,6 @@ import (
"os"
"os/exec"
"path/filepath"
"regexp"
"strings"
"sync"
"time"
@ -90,8 +89,7 @@ func Tesseract(imgAbsPath string) string {
ret := string(output)
ret = gulu.Str.RemoveInvisible(ret)
reg := regexp.MustCompile("\\s{2,}")
ret = reg.ReplaceAllString(ret, " ")
ret = RemoveRedundantSpace(ret)
msg := fmt.Sprintf("OCR [%s] [%s]", info.Name(), ret)
PushStatusBar(msg)
return ret