mirror of
https://github.com/siyuan-note/siyuan.git
synced 2025-12-16 14:40:12 +01:00
🎨 改进图片 OCR 提取文本结果中的多余字符 Fix https://github.com/siyuan-note/siyuan/issues/7109
This commit is contained in:
parent
2ffec98b71
commit
519f015498
2 changed files with 30 additions and 3 deletions
|
|
@ -17,7 +17,9 @@
|
|||
package util
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"strings"
|
||||
"unicode"
|
||||
|
||||
"github.com/88250/lute/html"
|
||||
)
|
||||
|
|
@ -39,3 +41,30 @@ func Reverse(s string) string {
|
|||
}
|
||||
return string(runes)
|
||||
}
|
||||
|
||||
// RemoveRedundantSpace normalizes the whitespace in str.
//
// Every run of whitespace runes (as classified by unicode.IsSpace, so tabs
// and newlines count too) is collapsed into a single ASCII space. Whitespace
// that directly follows a Han character is dropped entirely, so "中 文"
// becomes "中文" and "中 a" becomes "中a". Whitespace that precedes a Han
// character but follows a non-Han rune is kept as one space ("a 中" is
// unchanged). Leading and trailing whitespace is collapsed but not trimmed.
func RemoveRedundantSpace(str string) string {
	var buf bytes.Buffer
	prevIsHan := false
	prevIsSpace := false
	for _, r := range str {
		if !unicode.IsSpace(r) {
			buf.WriteRune(r)
			prevIsHan = unicode.Is(unicode.Han, r)
			prevIsSpace = false
			continue
		}

		// Skip whitespace after a Han character or after an already emitted
		// space. The flags are left untouched here so the rest of the
		// whitespace run is skipped as well.
		if prevIsHan || prevIsSpace {
			continue
		}

		// First whitespace rune of a run: emit exactly one plain space.
		// prevIsHan is necessarily false at this point.
		buf.WriteRune(' ')
		prevIsSpace = true
	}
	return buf.String()
}
|
||||
|
|
|
|||
|
|
@ -23,7 +23,6 @@ import (
|
|||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
|
@ -90,8 +89,7 @@ func Tesseract(imgAbsPath string) string {
|
|||
|
||||
ret := string(output)
|
||||
ret = gulu.Str.RemoveInvisible(ret)
|
||||
reg := regexp.MustCompile("\\s{2,}")
|
||||
ret = reg.ReplaceAllString(ret, " ")
|
||||
ret = RemoveRedundantSpace(ret)
|
||||
msg := fmt.Sprintf("OCR [%s] [%s]", info.Name(), ret)
|
||||
PushStatusBar(msg)
|
||||
return ret
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue