mirror of
https://github.com/siyuan-note/siyuan.git
synced 2025-12-17 15:10:12 +01:00
🎨 桌面端支持搜索图片 OCR 文本 https://github.com/siyuan-note/siyuan/issues/3470
This commit is contained in:
parent
6f51de0d12
commit
c4208596e4
13 changed files with 361 additions and 359 deletions
|
|
@ -1060,7 +1060,6 @@
|
||||||
"182": "Sharing document, please wait...",
|
"182": "Sharing document, please wait...",
|
||||||
"183": "Validating index document tree [%d/%d %s]",
|
"183": "Validating index document tree [%d/%d %s]",
|
||||||
"184": "Powered by <a href=\"https://b3log.org/siyuan\" target=\"_blank\">SiYuan</a>",
|
"184": "Powered by <a href=\"https://b3log.org/siyuan\" target=\"_blank\">SiYuan</a>",
|
||||||
"185": "Index verification complete",
|
"185": "Index verification complete"
|
||||||
"186": "Extracted text [%s] from asset [%s]"
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1060,7 +1060,6 @@
|
||||||
"182": "Compartiendo documento, por favor espere...",
|
"182": "Compartiendo documento, por favor espere...",
|
||||||
"183": "Validando el árbol del documento de índice [%d/%d %s]",
|
"183": "Validando el árbol del documento de índice [%d/%d %s]",
|
||||||
"184": "Con la tecnología de <a href=\"https://b3log.org/siyuan\" target=\"_blank\">SiYuan</a>",
|
"184": "Con la tecnología de <a href=\"https://b3log.org/siyuan\" target=\"_blank\">SiYuan</a>",
|
||||||
"185": "Verificación de índice completada",
|
"185": "Verificación de índice completada"
|
||||||
"186": "Texto extraído [%s] del recurso [%s]"
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1060,7 +1060,6 @@
|
||||||
"182": "Partage du document, veuillez patienter...",
|
"182": "Partage du document, veuillez patienter...",
|
||||||
"183": "Validation de l'arborescence du document d'index [%d/%d %s]",
|
"183": "Validation de l'arborescence du document d'index [%d/%d %s]",
|
||||||
"184": "Propulsé par <a href=\"https://b3log.org/siyuan\" target=\"_blank\">SiYuan</a>",
|
"184": "Propulsé par <a href=\"https://b3log.org/siyuan\" target=\"_blank\">SiYuan</a>",
|
||||||
"185": "Vérification de l'index terminée",
|
"185": "Vérification de l'index terminée"
|
||||||
"186": "Texte extrait [%s] de l'actif [%s]"
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -31,8 +31,8 @@
|
||||||
"leftRightLayout": "左右佈局",
|
"leftRightLayout": "左右佈局",
|
||||||
"topBottomLayout": "上下佈局",
|
"topBottomLayout": "上下佈局",
|
||||||
"keyword": "關鍵字",
|
"keyword": "關鍵字",
|
||||||
"searchMethod":"搜索方式",
|
"searchMethod": "搜索方式",
|
||||||
"regex":"正則表達式",
|
"regex": "正則表達式",
|
||||||
"keywordsLimit": "關鍵字數量限制",
|
"keywordsLimit": "關鍵字數量限制",
|
||||||
"exportAsImage": "導出為圖片",
|
"exportAsImage": "導出為圖片",
|
||||||
"exportBySiYuan": "由思源筆記導出",
|
"exportBySiYuan": "由思源筆記導出",
|
||||||
|
|
@ -1060,7 +1060,6 @@
|
||||||
"182": "正在分享文檔,請稍等...",
|
"182": "正在分享文檔,請稍等...",
|
||||||
"183": "正在校驗索引文檔樹 [%d/%d %s]",
|
"183": "正在校驗索引文檔樹 [%d/%d %s]",
|
||||||
"184": "由<a href=\"https://b3log.org/siyuan\" target=\"_blank\">思源筆記</a>強力驅動",
|
"184": "由<a href=\"https://b3log.org/siyuan\" target=\"_blank\">思源筆記</a>強力驅動",
|
||||||
"185": "索引校驗完畢",
|
"185": "索引校驗完畢"
|
||||||
"186": "已提取資源文件 [%s] 圖片中的文本 [%s]"
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1060,7 +1060,6 @@
|
||||||
"182": "正在分享文档,请稍等...",
|
"182": "正在分享文档,请稍等...",
|
||||||
"183": "正在校验索引文档树 [%d/%d %s]",
|
"183": "正在校验索引文档树 [%d/%d %s]",
|
||||||
"184": "由<a href=\"https://b3log.org/siyuan\" target=\"_blank\">思源笔记</a>强力驱动",
|
"184": "由<a href=\"https://b3log.org/siyuan\" target=\"_blank\">思源笔记</a>强力驱动",
|
||||||
"185": "索引校验完毕",
|
"185": "索引校验完毕"
|
||||||
"186": "已识别资源文件 [%s] 图片中的文本 [%s]"
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -40,7 +40,7 @@ func main() {
|
||||||
model.BootSyncData()
|
model.BootSyncData()
|
||||||
model.InitBoxes()
|
model.InitBoxes()
|
||||||
model.InitFlashcards()
|
model.InitFlashcards()
|
||||||
util.LoadAssetsTexts()
|
model.LoadAssetsTexts()
|
||||||
|
|
||||||
go model.AutoGenerateDocHistory()
|
go model.AutoGenerateDocHistory()
|
||||||
go model.AutoSync()
|
go model.AutoSync()
|
||||||
|
|
@ -53,8 +53,8 @@ func main() {
|
||||||
go treenode.AutoFlushBlockTree()
|
go treenode.AutoFlushBlockTree()
|
||||||
go cache.LoadAssets()
|
go cache.LoadAssets()
|
||||||
go model.AutoFixIndex()
|
go model.AutoFixIndex()
|
||||||
go util.AutoOCRAssets()
|
go model.AutoOCRAssets()
|
||||||
go util.AutoFlushAssetsTexts()
|
go model.AutoFlushAssetsTexts()
|
||||||
go model.HookDesktopUIProc()
|
go model.HookDesktopUIProc()
|
||||||
model.WatchAssets()
|
model.WatchAssets()
|
||||||
model.HandleSignal()
|
model.HandleSignal()
|
||||||
|
|
|
||||||
|
|
@ -54,7 +54,7 @@ func StartKernel(container, appDir, workspaceBaseDir, timezoneID, localIPs, lang
|
||||||
model.BootSyncData()
|
model.BootSyncData()
|
||||||
model.InitBoxes()
|
model.InitBoxes()
|
||||||
model.InitFlashcards()
|
model.InitFlashcards()
|
||||||
util.LoadAssetsTexts()
|
model.LoadAssetsTexts()
|
||||||
|
|
||||||
go model.AutoGenerateDocHistory()
|
go model.AutoGenerateDocHistory()
|
||||||
go model.AutoSync()
|
go model.AutoSync()
|
||||||
|
|
@ -67,8 +67,8 @@ func StartKernel(container, appDir, workspaceBaseDir, timezoneID, localIPs, lang
|
||||||
go treenode.AutoFlushBlockTree()
|
go treenode.AutoFlushBlockTree()
|
||||||
go cache.LoadAssets()
|
go cache.LoadAssets()
|
||||||
go model.AutoFixIndex()
|
go model.AutoFixIndex()
|
||||||
go util.AutoOCRAssets()
|
go model.AutoOCRAssets()
|
||||||
go util.AutoFlushAssetsTexts()
|
go model.AutoFlushAssetsTexts()
|
||||||
}()
|
}()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -197,7 +197,6 @@ func NetImg2LocalAssets(rootID string) (err error) {
|
||||||
if err = writeJSONQueue(tree); nil != err {
|
if err = writeJSONQueue(tree); nil != err {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
sql.WaitForWritingDatabase()
|
|
||||||
util.PushUpdateMsg(msgId, fmt.Sprintf(Conf.Language(120), files), 5000)
|
util.PushUpdateMsg(msgId, fmt.Sprintf(Conf.Language(120), files), 5000)
|
||||||
} else {
|
} else {
|
||||||
util.PushUpdateMsg(msgId, Conf.Language(121), 3000)
|
util.PushUpdateMsg(msgId, Conf.Language(121), 3000)
|
||||||
|
|
|
||||||
|
|
@ -428,7 +428,7 @@ func Close(force bool, execInstallPkg int) (exitCode int) {
|
||||||
Conf.Close()
|
Conf.Close()
|
||||||
sql.CloseDatabase()
|
sql.CloseDatabase()
|
||||||
treenode.SaveBlockTree(false)
|
treenode.SaveBlockTree(false)
|
||||||
util.SaveAssetsTexts()
|
SaveAssetsTexts()
|
||||||
clearWorkspaceTemp()
|
clearWorkspaceTemp()
|
||||||
clearPortJSON()
|
clearPortJSON()
|
||||||
util.UnlockWorkspace()
|
util.UnlockWorkspace()
|
||||||
|
|
|
||||||
184
kernel/model/ocr.go
Normal file
184
kernel/model/ocr.go
Normal file
|
|
@ -0,0 +1,184 @@
|
||||||
|
package model
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/dustin/go-humanize"
|
||||||
|
"io"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"runtime"
|
||||||
|
"runtime/debug"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/88250/gulu"
|
||||||
|
"github.com/panjf2000/ants/v2"
|
||||||
|
"github.com/siyuan-note/logging"
|
||||||
|
"github.com/siyuan-note/siyuan/kernel/cache"
|
||||||
|
"github.com/siyuan-note/siyuan/kernel/util"
|
||||||
|
)
|
||||||
|
|
||||||
|
func AutoOCRAssets() {
|
||||||
|
if !util.TesseractEnabled {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
for {
|
||||||
|
autoOCRAssets()
|
||||||
|
time.Sleep(7 * time.Second)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func autoOCRAssets() {
|
||||||
|
defer logging.Recover()
|
||||||
|
|
||||||
|
assetsPath := util.GetDataAssetsAbsPath()
|
||||||
|
assets := getUnOCRAssetsAbsPaths()
|
||||||
|
|
||||||
|
poolSize := runtime.NumCPU()
|
||||||
|
if 4 < poolSize {
|
||||||
|
poolSize = 4
|
||||||
|
}
|
||||||
|
waitGroup := &sync.WaitGroup{}
|
||||||
|
p, _ := ants.NewPoolWithFunc(poolSize, func(arg interface{}) {
|
||||||
|
defer waitGroup.Done()
|
||||||
|
|
||||||
|
assetAbsPath := arg.(string)
|
||||||
|
text := util.Tesseract(assetAbsPath)
|
||||||
|
p := strings.TrimPrefix(assetAbsPath, assetsPath)
|
||||||
|
p = "assets" + filepath.ToSlash(p)
|
||||||
|
util.AssetsTextsLock.Lock()
|
||||||
|
util.AssetsTexts[p] = text
|
||||||
|
util.AssetsTextsLock.Unlock()
|
||||||
|
util.AssetsTextsChanged = true
|
||||||
|
})
|
||||||
|
for _, assetAbsPath := range assets {
|
||||||
|
waitGroup.Add(1)
|
||||||
|
p.Invoke(assetAbsPath)
|
||||||
|
}
|
||||||
|
waitGroup.Wait()
|
||||||
|
p.Release()
|
||||||
|
|
||||||
|
cleanNotFoundAssetsTexts()
|
||||||
|
}
|
||||||
|
|
||||||
|
func cleanNotFoundAssetsTexts() {
|
||||||
|
tmp := util.AssetsTexts
|
||||||
|
|
||||||
|
assetsPath := util.GetDataAssetsAbsPath()
|
||||||
|
var toRemoves []string
|
||||||
|
for asset, _ := range tmp {
|
||||||
|
assetAbsPath := strings.TrimPrefix(asset, "assets")
|
||||||
|
assetAbsPath = filepath.Join(assetsPath, assetAbsPath)
|
||||||
|
if !gulu.File.IsExist(assetAbsPath) {
|
||||||
|
toRemoves = append(toRemoves, asset)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
util.AssetsTextsLock.Lock()
|
||||||
|
for _, asset := range toRemoves {
|
||||||
|
delete(util.AssetsTexts, asset)
|
||||||
|
util.AssetsTextsChanged = true
|
||||||
|
}
|
||||||
|
util.AssetsTextsLock.Unlock()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
func getUnOCRAssetsAbsPaths() (ret []string) {
|
||||||
|
var assetsPaths []string
|
||||||
|
assets := cache.GetAssets()
|
||||||
|
for _, asset := range assets {
|
||||||
|
lowerName := strings.ToLower(asset.Path)
|
||||||
|
if !strings.HasSuffix(lowerName, ".png") && !strings.HasSuffix(lowerName, ".jpg") && !strings.HasSuffix(lowerName, ".jpeg") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
assetsPaths = append(assetsPaths, asset.Path)
|
||||||
|
}
|
||||||
|
|
||||||
|
assetsPath := util.GetDataAssetsAbsPath()
|
||||||
|
assetsTextsTmp := util.AssetsTexts
|
||||||
|
for _, assetPath := range assetsPaths {
|
||||||
|
if _, ok := assetsTextsTmp[assetPath]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
absPath := filepath.Join(assetsPath, strings.TrimPrefix(assetPath, "assets"))
|
||||||
|
ret = append(ret, absPath)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
func AutoFlushAssetsTexts() {
|
||||||
|
for {
|
||||||
|
SaveAssetsTexts()
|
||||||
|
time.Sleep(7 * time.Second)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func LoadAssetsTexts() {
|
||||||
|
assetsPath := util.GetDataAssetsAbsPath()
|
||||||
|
assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
|
||||||
|
if !gulu.File.IsExist(assetsTextsPath) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
|
var err error
|
||||||
|
fh, err := os.OpenFile(assetsTextsPath, os.O_RDWR, 0644)
|
||||||
|
if nil != err {
|
||||||
|
logging.LogErrorf("open assets texts failed: %s", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer fh.Close()
|
||||||
|
|
||||||
|
data, err := io.ReadAll(fh)
|
||||||
|
if nil != err {
|
||||||
|
logging.LogErrorf("read assets texts failed: %s", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
util.AssetsTextsLock.Lock()
|
||||||
|
if err = gulu.JSON.UnmarshalJSON(data, &util.AssetsTexts); nil != err {
|
||||||
|
logging.LogErrorf("unmarshal assets texts failed: %s", err)
|
||||||
|
if err = os.RemoveAll(assetsTextsPath); nil != err {
|
||||||
|
logging.LogErrorf("removed corrupted assets texts failed: %s", err)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
util.AssetsTextsLock.Unlock()
|
||||||
|
debug.FreeOSMemory()
|
||||||
|
|
||||||
|
if elapsed := time.Since(start).Seconds(); 2 < elapsed {
|
||||||
|
logging.LogWarnf("read assets texts [%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), assetsTextsPath, elapsed)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
func SaveAssetsTexts() {
|
||||||
|
if !util.AssetsTextsChanged {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
|
|
||||||
|
util.AssetsTextsLock.Lock()
|
||||||
|
data, err := gulu.JSON.MarshalIndentJSON(util.AssetsTexts, "", " ")
|
||||||
|
if nil != err {
|
||||||
|
logging.LogErrorf("marshal assets texts failed: %s", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
util.AssetsTextsLock.Unlock()
|
||||||
|
|
||||||
|
assetsPath := util.GetDataAssetsAbsPath()
|
||||||
|
assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
|
||||||
|
if err = gulu.File.WriteFileSafer(assetsTextsPath, data, 0644); nil != err {
|
||||||
|
logging.LogErrorf("write assets texts failed: %s", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
debug.FreeOSMemory()
|
||||||
|
|
||||||
|
if elapsed := time.Since(start).Seconds(); 2 < elapsed {
|
||||||
|
logging.LogWarnf("save assets texts [size=%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), assetsTextsPath, elapsed)
|
||||||
|
}
|
||||||
|
|
||||||
|
util.AssetsTextsChanged = false
|
||||||
|
}
|
||||||
|
|
@ -18,6 +18,7 @@ package treenode
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
util2 "github.com/siyuan-note/siyuan/kernel/util"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
|
|
@ -31,7 +32,6 @@ import (
|
||||||
"github.com/88250/lute/render"
|
"github.com/88250/lute/render"
|
||||||
"github.com/88250/lute/util"
|
"github.com/88250/lute/util"
|
||||||
"github.com/siyuan-note/logging"
|
"github.com/siyuan-note/logging"
|
||||||
util2 "github.com/siyuan-note/siyuan/kernel/util"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
func GetBlockRef(n *ast.Node) (blockRefID, blockRefText, blockRefSubtype string) {
|
func GetBlockRef(n *ast.Node) (blockRefID, blockRefText, blockRefSubtype string) {
|
||||||
|
|
|
||||||
|
|
@ -1,338 +0,0 @@
|
||||||
// SiYuan - Build Your Eternal Digital Garden
|
|
||||||
// Copyright (c) 2020-present, b3log.org
|
|
||||||
//
|
|
||||||
// This program is free software: you can redistribute it and/or modify
|
|
||||||
// it under the terms of the GNU Affero General Public License as published by
|
|
||||||
// the Free Software Foundation, either version 3 of the License, or
|
|
||||||
// (at your option) any later version.
|
|
||||||
//
|
|
||||||
// This program is distributed in the hope that it will be useful,
|
|
||||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
// GNU Affero General Public License for more details.
|
|
||||||
//
|
|
||||||
// You should have received a copy of the GNU Affero General Public License
|
|
||||||
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
||||||
|
|
||||||
package util
|
|
||||||
|
|
||||||
import (
|
|
||||||
"bytes"
|
|
||||||
"context"
|
|
||||||
"io"
|
|
||||||
"os"
|
|
||||||
"os/exec"
|
|
||||||
"path/filepath"
|
|
||||||
"regexp"
|
|
||||||
"runtime"
|
|
||||||
"runtime/debug"
|
|
||||||
"strings"
|
|
||||||
"sync"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/88250/gulu"
|
|
||||||
"github.com/dustin/go-humanize"
|
|
||||||
"github.com/panjf2000/ants/v2"
|
|
||||||
"github.com/siyuan-note/logging"
|
|
||||||
)
|
|
||||||
|
|
||||||
var (
|
|
||||||
tesseractEnabled bool
|
|
||||||
tesseractLangs []string
|
|
||||||
assetsTexts = map[string]string{}
|
|
||||||
assetsTextsLock = sync.Mutex{}
|
|
||||||
assetsTextsChanged = false
|
|
||||||
)
|
|
||||||
|
|
||||||
func GetAssetText(asset string) string {
|
|
||||||
assetsTextsLock.Lock()
|
|
||||||
ret, ok := assetsTexts[asset]
|
|
||||||
assetsTextsLock.Unlock()
|
|
||||||
if ok {
|
|
||||||
return ret
|
|
||||||
}
|
|
||||||
|
|
||||||
assetsPath := GetDataAssetsAbsPath()
|
|
||||||
assetAbsPath := strings.TrimPrefix(asset, "assets")
|
|
||||||
assetAbsPath = filepath.Join(assetsPath, assetAbsPath)
|
|
||||||
ret = Tesseract(assetAbsPath)
|
|
||||||
assetsTextsLock.Lock()
|
|
||||||
assetsTexts[asset] = ret
|
|
||||||
assetsTextsLock.Unlock()
|
|
||||||
return ret
|
|
||||||
}
|
|
||||||
|
|
||||||
func Tesseract(imgAbsPath string) string {
|
|
||||||
if ContainerStd != Container || !tesseractEnabled {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
|
|
||||||
info, err := os.Stat(imgAbsPath)
|
|
||||||
if nil != err {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
|
|
||||||
defer logging.Recover()
|
|
||||||
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 7*time.Second)
|
|
||||||
defer cancel()
|
|
||||||
|
|
||||||
now := time.Now()
|
|
||||||
cmd := exec.CommandContext(ctx, "tesseract", "-c", "debug_file=/dev/null", imgAbsPath, "stdout", "-l", strings.Join(tesseractLangs, "+"))
|
|
||||||
gulu.CmdAttr(cmd)
|
|
||||||
output, err := cmd.CombinedOutput()
|
|
||||||
if ctx.Err() == context.DeadlineExceeded {
|
|
||||||
logging.LogWarnf("tesseract [path=%s, size=%d] timeout", imgAbsPath, info.Size())
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
|
|
||||||
if nil != err {
|
|
||||||
logging.LogWarnf("tesseract [path=%s, size=%d] failed: %s", imgAbsPath, info.Size(), err)
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
|
|
||||||
ret := string(output)
|
|
||||||
ret = strings.ReplaceAll(ret, "\r", "")
|
|
||||||
ret = strings.ReplaceAll(ret, "\n", "")
|
|
||||||
ret = strings.ReplaceAll(ret, "\t", " ")
|
|
||||||
reg := regexp.MustCompile("\\s{2,}")
|
|
||||||
ret = reg.ReplaceAllString(ret, " ")
|
|
||||||
logging.LogInfof("tesseract [path=%s, size=%d, text=%s, elapsed=%dms]", imgAbsPath, info.Size(), ret, time.Since(now).Milliseconds())
|
|
||||||
return ret
|
|
||||||
}
|
|
||||||
|
|
||||||
func AutoOCRAssets() {
|
|
||||||
if !tesseractEnabled {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
for {
|
|
||||||
autoOCRAssets()
|
|
||||||
time.Sleep(7 * time.Second)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func autoOCRAssets() {
|
|
||||||
defer logging.Recover()
|
|
||||||
|
|
||||||
assetsPath := GetDataAssetsAbsPath()
|
|
||||||
assets := getUnOCRAssetsAbsPaths()
|
|
||||||
|
|
||||||
poolSize := runtime.NumCPU()
|
|
||||||
if 4 < poolSize {
|
|
||||||
poolSize = 4
|
|
||||||
}
|
|
||||||
waitGroup := &sync.WaitGroup{}
|
|
||||||
p, _ := ants.NewPoolWithFunc(poolSize, func(arg interface{}) {
|
|
||||||
defer waitGroup.Done()
|
|
||||||
|
|
||||||
assetAbsPath := arg.(string)
|
|
||||||
text := Tesseract(assetAbsPath)
|
|
||||||
p := strings.TrimPrefix(assetAbsPath, assetsPath)
|
|
||||||
p = "assets" + filepath.ToSlash(p)
|
|
||||||
assetsTextsLock.Lock()
|
|
||||||
assetsTexts[p] = text
|
|
||||||
assetsTextsLock.Unlock()
|
|
||||||
assetsTextsChanged = true
|
|
||||||
})
|
|
||||||
for _, assetAbsPath := range assets {
|
|
||||||
waitGroup.Add(1)
|
|
||||||
p.Invoke(assetAbsPath)
|
|
||||||
}
|
|
||||||
waitGroup.Wait()
|
|
||||||
p.Release()
|
|
||||||
|
|
||||||
cleanNotFoundAssetsTexts()
|
|
||||||
}
|
|
||||||
|
|
||||||
func cleanNotFoundAssetsTexts() {
|
|
||||||
tmp := assetsTexts
|
|
||||||
|
|
||||||
assetsPath := GetDataAssetsAbsPath()
|
|
||||||
var toRemoves []string
|
|
||||||
for asset, _ := range tmp {
|
|
||||||
assetAbsPath := strings.TrimPrefix(asset, "assets")
|
|
||||||
assetAbsPath = filepath.Join(assetsPath, assetAbsPath)
|
|
||||||
if !gulu.File.IsExist(assetAbsPath) {
|
|
||||||
toRemoves = append(toRemoves, asset)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
assetsTextsLock.Lock()
|
|
||||||
for _, asset := range toRemoves {
|
|
||||||
delete(assetsTexts, asset)
|
|
||||||
assetsTextsChanged = true
|
|
||||||
}
|
|
||||||
assetsTextsLock.Unlock()
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
func getUnOCRAssetsAbsPaths() (ret []string) {
|
|
||||||
assetsPath := GetDataAssetsAbsPath()
|
|
||||||
var assetsPaths []string
|
|
||||||
filepath.Walk(assetsPath, func(path string, info os.FileInfo, err error) error {
|
|
||||||
name := info.Name()
|
|
||||||
if info.IsDir() {
|
|
||||||
if strings.HasPrefix(name, ".") {
|
|
||||||
return filepath.SkipDir
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
lowerName := strings.ToLower(name)
|
|
||||||
if !strings.HasSuffix(lowerName, ".png") && !strings.HasSuffix(lowerName, ".jpg") && !strings.HasSuffix(lowerName, ".jpeg") {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
assetsPaths = append(assetsPaths, path)
|
|
||||||
return nil
|
|
||||||
})
|
|
||||||
|
|
||||||
assetsTextsTmp := assetsTexts
|
|
||||||
for _, absPath := range assetsPaths {
|
|
||||||
p := strings.TrimPrefix(absPath, assetsPath)
|
|
||||||
p = "assets" + filepath.ToSlash(p)
|
|
||||||
if _, ok := assetsTextsTmp[p]; ok {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
ret = append(ret, absPath)
|
|
||||||
}
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
func AutoFlushAssetsTexts() {
|
|
||||||
for {
|
|
||||||
SaveAssetsTexts()
|
|
||||||
time.Sleep(7 * time.Second)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func LoadAssetsTexts() {
|
|
||||||
assetsPath := GetDataAssetsAbsPath()
|
|
||||||
assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
|
|
||||||
if !gulu.File.IsExist(assetsTextsPath) {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
start := time.Now()
|
|
||||||
var err error
|
|
||||||
fh, err := os.OpenFile(assetsTextsPath, os.O_RDWR, 0644)
|
|
||||||
if nil != err {
|
|
||||||
logging.LogErrorf("open assets texts failed: %s", err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
defer fh.Close()
|
|
||||||
|
|
||||||
data, err := io.ReadAll(fh)
|
|
||||||
if nil != err {
|
|
||||||
logging.LogErrorf("read assets texts failed: %s", err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
assetsTextsLock.Lock()
|
|
||||||
if err = gulu.JSON.UnmarshalJSON(data, &assetsTexts); nil != err {
|
|
||||||
logging.LogErrorf("unmarshal assets texts failed: %s", err)
|
|
||||||
if err = os.RemoveAll(assetsTextsPath); nil != err {
|
|
||||||
logging.LogErrorf("removed corrupted assets texts failed: %s", err)
|
|
||||||
}
|
|
||||||
return
|
|
||||||
}
|
|
||||||
assetsTextsLock.Unlock()
|
|
||||||
debug.FreeOSMemory()
|
|
||||||
|
|
||||||
if elapsed := time.Since(start).Seconds(); 2 < elapsed {
|
|
||||||
logging.LogWarnf("read assets texts [%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), assetsTextsPath, elapsed)
|
|
||||||
}
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
func SaveAssetsTexts() {
|
|
||||||
if !assetsTextsChanged {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
start := time.Now()
|
|
||||||
|
|
||||||
assetsTextsLock.Lock()
|
|
||||||
data, err := gulu.JSON.MarshalIndentJSON(assetsTexts, "", " ")
|
|
||||||
if nil != err {
|
|
||||||
logging.LogErrorf("marshal assets texts failed: %s", err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
assetsTextsLock.Unlock()
|
|
||||||
|
|
||||||
assetsPath := GetDataAssetsAbsPath()
|
|
||||||
assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
|
|
||||||
if err = gulu.File.WriteFileSafer(assetsTextsPath, data, 0644); nil != err {
|
|
||||||
logging.LogErrorf("write assets texts failed: %s", err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
debug.FreeOSMemory()
|
|
||||||
|
|
||||||
if elapsed := time.Since(start).Seconds(); 2 < elapsed {
|
|
||||||
logging.LogWarnf("save assets texts [size=%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), assetsTextsPath, elapsed)
|
|
||||||
}
|
|
||||||
|
|
||||||
assetsTextsChanged = false
|
|
||||||
}
|
|
||||||
|
|
||||||
func initTesseract() {
|
|
||||||
ver := getTesseractVer()
|
|
||||||
if "" == ver {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
tesseractLangs = getTesseractLangs()
|
|
||||||
if 1 > len(tesseractLangs) {
|
|
||||||
logging.LogWarnf("no tesseract langs found")
|
|
||||||
tesseractEnabled = false
|
|
||||||
return
|
|
||||||
}
|
|
||||||
logging.LogInfof("tesseract-ocr enabled [ver=%s, langs=%s]", ver, strings.Join(tesseractLangs, "+"))
|
|
||||||
}
|
|
||||||
|
|
||||||
func getTesseractVer() (ret string) {
|
|
||||||
if ContainerStd != Container {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
cmd := exec.Command("tesseract", "--version")
|
|
||||||
gulu.CmdAttr(cmd)
|
|
||||||
data, err := cmd.CombinedOutput()
|
|
||||||
if nil == err && strings.HasPrefix(string(data), "tesseract ") {
|
|
||||||
parts := bytes.Split(data, []byte("\n"))
|
|
||||||
if 0 < len(parts) {
|
|
||||||
ret = strings.TrimPrefix(string(parts[0]), "tesseract ")
|
|
||||||
ret = strings.TrimSpace(ret)
|
|
||||||
tesseractEnabled = true
|
|
||||||
}
|
|
||||||
return
|
|
||||||
}
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
func getTesseractLangs() (ret []string) {
|
|
||||||
if !tesseractEnabled {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
cmd := exec.Command("tesseract", "--list-langs")
|
|
||||||
gulu.CmdAttr(cmd)
|
|
||||||
data, err := cmd.CombinedOutput()
|
|
||||||
if nil != err {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
parts := bytes.Split(data, []byte("\n"))
|
|
||||||
if 0 < len(parts) {
|
|
||||||
parts = parts[1:]
|
|
||||||
}
|
|
||||||
for _, part := range parts {
|
|
||||||
part = bytes.TrimSpace(part)
|
|
||||||
if 0 == len(part) {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
ret = append(ret, string(part))
|
|
||||||
}
|
|
||||||
return
|
|
||||||
}
|
|
||||||
162
kernel/util/tesseract.go
Normal file
162
kernel/util/tesseract.go
Normal file
|
|
@ -0,0 +1,162 @@
|
||||||
|
// SiYuan - Build Your Eternal Digital Garden
|
||||||
|
// Copyright (c) 2020-present, b3log.org
|
||||||
|
//
|
||||||
|
// This program is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU Affero General Public License as published by
|
||||||
|
// the Free Software Foundation, either version 3 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// This program is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU Affero General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU Affero General Public License
|
||||||
|
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
package util
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"regexp"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/88250/gulu"
|
||||||
|
"github.com/siyuan-note/logging"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
	// TesseractEnabled reports whether the tesseract command line tool can be
	// used. getTesseractVer sets it once "tesseract --version" succeeds and
	// initTesseract resets it when no languages are found.
	TesseractEnabled bool
	// AssetsTexts maps an asset path ("assets/...") to the text recognized
	// in that image. Access it only while holding AssetsTextsLock.
	AssetsTexts = make(map[string]string)
	// AssetsTextsLock guards AssetsTexts.
	AssetsTextsLock sync.Mutex
	// AssetsTextsChanged is true when AssetsTexts holds modifications that
	// still have to be written to disk.
	AssetsTextsChanged bool

	// tesseractLangs holds the language names reported by
	// "tesseract --list-langs".
	tesseractLangs []string
)
|
||||||
|
|
||||||
|
func GetAssetText(asset string) string {
|
||||||
|
AssetsTextsLock.Lock()
|
||||||
|
ret, ok := AssetsTexts[asset]
|
||||||
|
AssetsTextsLock.Unlock()
|
||||||
|
if ok {
|
||||||
|
return ret
|
||||||
|
}
|
||||||
|
|
||||||
|
assetsPath := GetDataAssetsAbsPath()
|
||||||
|
assetAbsPath := strings.TrimPrefix(asset, "assets")
|
||||||
|
assetAbsPath = filepath.Join(assetsPath, assetAbsPath)
|
||||||
|
ret = Tesseract(assetAbsPath)
|
||||||
|
AssetsTextsLock.Lock()
|
||||||
|
AssetsTexts[asset] = ret
|
||||||
|
AssetsTextsLock.Unlock()
|
||||||
|
return ret
|
||||||
|
}
|
||||||
|
|
||||||
|
func Tesseract(imgAbsPath string) string {
|
||||||
|
if ContainerStd != Container || !TesseractEnabled {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
info, err := os.Stat(imgAbsPath)
|
||||||
|
if nil != err {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
defer logging.Recover()
|
||||||
|
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 7*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
now := time.Now()
|
||||||
|
cmd := exec.CommandContext(ctx, "tesseract", "-c", "debug_file=/dev/null", imgAbsPath, "stdout", "-l", strings.Join(tesseractLangs, "+"))
|
||||||
|
gulu.CmdAttr(cmd)
|
||||||
|
output, err := cmd.CombinedOutput()
|
||||||
|
if ctx.Err() == context.DeadlineExceeded {
|
||||||
|
logging.LogWarnf("tesseract [path=%s, size=%d] timeout", imgAbsPath, info.Size())
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
if nil != err {
|
||||||
|
logging.LogWarnf("tesseract [path=%s, size=%d] failed: %s", imgAbsPath, info.Size(), err)
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
ret := string(output)
|
||||||
|
ret = strings.ReplaceAll(ret, "\r", "")
|
||||||
|
ret = strings.ReplaceAll(ret, "\n", "")
|
||||||
|
ret = strings.ReplaceAll(ret, "\t", " ")
|
||||||
|
reg := regexp.MustCompile("\\s{2,}")
|
||||||
|
ret = reg.ReplaceAllString(ret, " ")
|
||||||
|
logging.LogInfof("tesseract [path=%s, size=%d, text=%s, elapsed=%dms]", imgAbsPath, info.Size(), ret, time.Since(now).Milliseconds())
|
||||||
|
msg := fmt.Sprintf("OCR [%s] [%s]", info.Name(), ret)
|
||||||
|
PushStatusBar(msg)
|
||||||
|
return ret
|
||||||
|
}
|
||||||
|
|
||||||
|
func initTesseract() {
|
||||||
|
ver := getTesseractVer()
|
||||||
|
if "" == ver {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
tesseractLangs = getTesseractLangs()
|
||||||
|
if 1 > len(tesseractLangs) {
|
||||||
|
logging.LogWarnf("no tesseract langs found")
|
||||||
|
TesseractEnabled = false
|
||||||
|
return
|
||||||
|
}
|
||||||
|
logging.LogInfof("tesseract-ocr enabled [ver=%s, langs=%s]", ver, strings.Join(tesseractLangs, "+"))
|
||||||
|
}
|
||||||
|
|
||||||
|
func getTesseractVer() (ret string) {
|
||||||
|
if ContainerStd != Container {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
cmd := exec.Command("tesseract", "--version")
|
||||||
|
gulu.CmdAttr(cmd)
|
||||||
|
data, err := cmd.CombinedOutput()
|
||||||
|
if nil == err && strings.HasPrefix(string(data), "tesseract ") {
|
||||||
|
parts := bytes.Split(data, []byte("\n"))
|
||||||
|
if 0 < len(parts) {
|
||||||
|
ret = strings.TrimPrefix(string(parts[0]), "tesseract ")
|
||||||
|
ret = strings.TrimSpace(ret)
|
||||||
|
TesseractEnabled = true
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
func getTesseractLangs() (ret []string) {
|
||||||
|
if !TesseractEnabled {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
cmd := exec.Command("tesseract", "--list-langs")
|
||||||
|
gulu.CmdAttr(cmd)
|
||||||
|
data, err := cmd.CombinedOutput()
|
||||||
|
if nil != err {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
parts := bytes.Split(data, []byte("\n"))
|
||||||
|
if 0 < len(parts) {
|
||||||
|
parts = parts[1:]
|
||||||
|
}
|
||||||
|
for _, part := range parts {
|
||||||
|
part = bytes.TrimSpace(part)
|
||||||
|
if 0 == len(part) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
ret = append(ret, string(part))
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue