diff --git a/app/appearance/langs/en_US.json b/app/appearance/langs/en_US.json
index abcf46ab4..3091b6ac9 100644
--- a/app/appearance/langs/en_US.json
+++ b/app/appearance/langs/en_US.json
@@ -1060,7 +1060,6 @@
"182": "Sharing document, please wait...",
"183": "Validating index document tree [%d/%d %s]",
"184": "Powered by SiYuan",
- "185": "Index verification complete",
- "186": "Extracted text [%s] from asset [%s]"
+ "185": "Index verification complete"
}
}
diff --git a/app/appearance/langs/es_ES.json b/app/appearance/langs/es_ES.json
index b91a5e8ba..5903477a6 100644
--- a/app/appearance/langs/es_ES.json
+++ b/app/appearance/langs/es_ES.json
@@ -1060,7 +1060,6 @@
"182": "Compartiendo documento, por favor espere...",
"183": "Validando el árbol del documento de índice [%d/%d %s]",
"184": "Con la tecnología de SiYuan",
- "185": "Verificación de índice completada",
- "186": "Texto extraído [%s] del recurso [%s]"
+ "185": "Verificación de índice completada"
}
}
diff --git a/app/appearance/langs/fr_FR.json b/app/appearance/langs/fr_FR.json
index cacb593c7..8eee2d599 100644
--- a/app/appearance/langs/fr_FR.json
+++ b/app/appearance/langs/fr_FR.json
@@ -1060,7 +1060,6 @@
"182": "Partage du document, veuillez patienter...",
"183": "Validation de l'arborescence du document d'index [%d/%d %s]",
"184": "Propulsé par SiYuan",
- "185": "Vérification de l'index terminée",
- "186": "Texte extrait [%s] de l'actif [%s]"
+ "185": "Vérification de l'index terminée"
}
}
diff --git a/app/appearance/langs/zh_CHT.json b/app/appearance/langs/zh_CHT.json
index 808a410ea..890f68e79 100644
--- a/app/appearance/langs/zh_CHT.json
+++ b/app/appearance/langs/zh_CHT.json
@@ -31,8 +31,8 @@
"leftRightLayout": "左右佈局",
"topBottomLayout": "上下佈局",
"keyword": "關鍵字",
- "searchMethod":"搜索方式",
- "regex":"正則表達式",
+ "searchMethod": "搜索方式",
+ "regex": "正則表達式",
"keywordsLimit": "關鍵字數量限制",
"exportAsImage": "導出為圖片",
"exportBySiYuan": "由思源筆記導出",
@@ -1060,7 +1060,6 @@
"182": "正在分享文檔,請稍等...",
"183": "正在校驗索引文檔樹 [%d/%d %s]",
"184": "由思源筆記強力驅動",
- "185": "索引校驗完畢",
- "186": "已提取資源文件 [%s] 圖片中的文本 [%s]"
+ "185": "索引校驗完畢"
}
}
diff --git a/app/appearance/langs/zh_CN.json b/app/appearance/langs/zh_CN.json
index 3e745d3ea..e1dbecf27 100644
--- a/app/appearance/langs/zh_CN.json
+++ b/app/appearance/langs/zh_CN.json
@@ -1060,7 +1060,6 @@
"182": "正在分享文档,请稍等...",
"183": "正在校验索引文档树 [%d/%d %s]",
"184": "由思源笔记强力驱动",
- "185": "索引校验完毕",
- "186": "已识别资源文件 [%s] 图片中的文本 [%s]"
+ "185": "索引校验完毕"
}
}
diff --git a/kernel/main.go b/kernel/main.go
index cd4df3734..20c1a98f9 100644
--- a/kernel/main.go
+++ b/kernel/main.go
@@ -40,7 +40,7 @@ func main() {
model.BootSyncData()
model.InitBoxes()
model.InitFlashcards()
- util.LoadAssetsTexts()
+ model.LoadAssetsTexts()
go model.AutoGenerateDocHistory()
go model.AutoSync()
@@ -53,8 +53,8 @@ func main() {
go treenode.AutoFlushBlockTree()
go cache.LoadAssets()
go model.AutoFixIndex()
- go util.AutoOCRAssets()
- go util.AutoFlushAssetsTexts()
+ go model.AutoOCRAssets()
+ go model.AutoFlushAssetsTexts()
go model.HookDesktopUIProc()
model.WatchAssets()
model.HandleSignal()
diff --git a/kernel/mobile/kernel.go b/kernel/mobile/kernel.go
index bc41becad..fa3ef2690 100644
--- a/kernel/mobile/kernel.go
+++ b/kernel/mobile/kernel.go
@@ -54,7 +54,7 @@ func StartKernel(container, appDir, workspaceBaseDir, timezoneID, localIPs, lang
model.BootSyncData()
model.InitBoxes()
model.InitFlashcards()
- util.LoadAssetsTexts()
+ model.LoadAssetsTexts()
go model.AutoGenerateDocHistory()
go model.AutoSync()
@@ -67,8 +67,8 @@ func StartKernel(container, appDir, workspaceBaseDir, timezoneID, localIPs, lang
go treenode.AutoFlushBlockTree()
go cache.LoadAssets()
go model.AutoFixIndex()
- go util.AutoOCRAssets()
- go util.AutoFlushAssetsTexts()
+ go model.AutoOCRAssets()
+ go model.AutoFlushAssetsTexts()
}()
}
diff --git a/kernel/model/assets.go b/kernel/model/assets.go
index d3f820477..66c0e177b 100644
--- a/kernel/model/assets.go
+++ b/kernel/model/assets.go
@@ -197,7 +197,6 @@ func NetImg2LocalAssets(rootID string) (err error) {
if err = writeJSONQueue(tree); nil != err {
return
}
- sql.WaitForWritingDatabase()
util.PushUpdateMsg(msgId, fmt.Sprintf(Conf.Language(120), files), 5000)
} else {
util.PushUpdateMsg(msgId, Conf.Language(121), 3000)
diff --git a/kernel/model/conf.go b/kernel/model/conf.go
index 941c585d3..f96e50907 100644
--- a/kernel/model/conf.go
+++ b/kernel/model/conf.go
@@ -428,7 +428,7 @@ func Close(force bool, execInstallPkg int) (exitCode int) {
Conf.Close()
sql.CloseDatabase()
treenode.SaveBlockTree(false)
- util.SaveAssetsTexts()
+ SaveAssetsTexts()
clearWorkspaceTemp()
clearPortJSON()
util.UnlockWorkspace()
diff --git a/kernel/model/ocr.go b/kernel/model/ocr.go
new file mode 100644
index 000000000..19cc77dae
--- /dev/null
+++ b/kernel/model/ocr.go
@@ -0,0 +1,184 @@
+package model
+
+import (
+ "github.com/dustin/go-humanize"
+ "io"
+ "os"
+ "path/filepath"
+ "runtime"
+ "runtime/debug"
+ "strings"
+ "sync"
+ "time"
+
+ "github.com/88250/gulu"
+ "github.com/panjf2000/ants/v2"
+ "github.com/siyuan-note/logging"
+ "github.com/siyuan-note/siyuan/kernel/cache"
+ "github.com/siyuan-note/siyuan/kernel/util"
+)
+
+// AutoOCRAssets repeatedly calls autoOCRAssets, sleeping seven seconds after
+// each pass. It never returns once started, so callers run it in its own
+// goroutine. When util.TesseractEnabled is false it returns immediately
+// without doing anything.
+func AutoOCRAssets() {
+	if util.TesseractEnabled {
+		// The post statement provides the pause between two passes.
+		for ; ; time.Sleep(7 * time.Second) {
+			autoOCRAssets()
+		}
+	}
+}
+
+// autoOCRAssets performs one OCR pass: it recognizes the text of every image
+// asset that has no entry in util.AssetsTexts yet, then drops entries whose
+// asset file no longer exists. Panics are caught by logging.Recover.
+func autoOCRAssets() {
+	defer logging.Recover()
+
+	// Skip building a worker pool when there is nothing to recognize.
+	if assets := getUnOCRAssetsAbsPaths(); 0 < len(assets) {
+		ocrAssetsConcurrently(assets)
+	}
+
+	cleanNotFoundAssetsTexts()
+}
+
+// ocrAssetsConcurrently runs util.Tesseract on the given absolute asset paths
+// using a worker pool of at most four goroutines and stores each result in
+// util.AssetsTexts under its "assets/..." key. It returns once every
+// submitted task has finished.
+func ocrAssetsConcurrently(assets []string) {
+	assetsPath := util.GetDataAssetsAbsPath()
+
+	poolSize := runtime.NumCPU()
+	if 4 < poolSize {
+		poolSize = 4
+	}
+
+	waitGroup := &sync.WaitGroup{}
+	pool, err := ants.NewPoolWithFunc(poolSize, func(arg interface{}) {
+		defer waitGroup.Done()
+
+		assetAbsPath := arg.(string)
+		text := util.Tesseract(assetAbsPath)
+		key := "assets" + filepath.ToSlash(strings.TrimPrefix(assetAbsPath, assetsPath))
+
+		// The dirty flag is shared with the flusher, so it is updated under
+		// the same lock as the map.
+		util.AssetsTextsLock.Lock()
+		util.AssetsTexts[key] = text
+		util.AssetsTextsChanged = true
+		util.AssetsTextsLock.Unlock()
+	})
+	if nil != err {
+		logging.LogErrorf("create OCR worker pool failed: %s", err)
+		return
+	}
+	defer pool.Release()
+
+	for _, assetAbsPath := range assets {
+		waitGroup.Add(1)
+		if err = pool.Invoke(assetAbsPath); nil != err {
+			// The task was never handed to a worker, so nobody else will
+			// call Done for it; without this Wait would block forever.
+			waitGroup.Done()
+			logging.LogWarnf("submit OCR task [%s] failed: %s", assetAbsPath, err)
+		}
+	}
+	waitGroup.Wait()
+}
+
+// cleanNotFoundAssetsTexts removes from util.AssetsTexts every entry whose
+// asset file no longer exists under the data assets directory, and marks the
+// texts as changed when at least one entry was removed.
+func cleanNotFoundAssetsTexts() {
+	// Assigning a map to another variable does not copy it, so the keys are
+	// snapshotted under the lock; iterating the live map while another
+	// goroutine writes to it is a fatal runtime error.
+	util.AssetsTextsLock.Lock()
+	assets := make([]string, 0, len(util.AssetsTexts))
+	for asset := range util.AssetsTexts {
+		assets = append(assets, asset)
+	}
+	util.AssetsTextsLock.Unlock()
+
+	// File system checks are done outside the lock so readers are not blocked
+	// by disk I/O.
+	assetsPath := util.GetDataAssetsAbsPath()
+	var toRemoves []string
+	for _, asset := range assets {
+		assetAbsPath := filepath.Join(assetsPath, strings.TrimPrefix(asset, "assets"))
+		if !gulu.File.IsExist(assetAbsPath) {
+			toRemoves = append(toRemoves, asset)
+		}
+	}
+	if 1 > len(toRemoves) {
+		return
+	}
+
+	util.AssetsTextsLock.Lock()
+	for _, asset := range toRemoves {
+		delete(util.AssetsTexts, asset)
+	}
+	util.AssetsTextsChanged = true
+	util.AssetsTextsLock.Unlock()
+}
+
+// getUnOCRAssetsAbsPaths returns the absolute paths of the cached assets with
+// a .png, .jpg or .jpeg extension (case-insensitive) that have no entry in
+// util.AssetsTexts yet.
+func getUnOCRAssetsAbsPaths() (ret []string) {
+	var assetsPaths []string
+	for _, asset := range cache.GetAssets() {
+		lowerName := strings.ToLower(asset.Path)
+		if !strings.HasSuffix(lowerName, ".png") && !strings.HasSuffix(lowerName, ".jpg") && !strings.HasSuffix(lowerName, ".jpeg") {
+			continue
+		}
+		assetsPaths = append(assetsPaths, asset.Path)
+	}
+
+	assetsPath := util.GetDataAssetsAbsPath()
+
+	// The map is written concurrently by the OCR workers and GetAssetText, so
+	// lookups must hold the lock; a plain assignment would only alias the map.
+	util.AssetsTextsLock.Lock()
+	defer util.AssetsTextsLock.Unlock()
+	for _, assetPath := range assetsPaths {
+		if _, ok := util.AssetsTexts[assetPath]; ok {
+			continue
+		}
+		absPath := filepath.Join(assetsPath, strings.TrimPrefix(assetPath, "assets"))
+		ret = append(ret, absPath)
+	}
+	return ret
+}
+
+// AutoFlushAssetsTexts calls SaveAssetsTexts in an endless loop, sleeping
+// seven seconds after each call. It never returns, so callers run it in its
+// own goroutine.
+func AutoFlushAssetsTexts() {
+	// The post statement provides the pause between two flushes.
+	for ; ; time.Sleep(7 * time.Second) {
+		SaveAssetsTexts()
+	}
+}
+
+// LoadAssetsTexts reads ocr-texts.json from the data assets directory into
+// util.AssetsTexts. A missing file is not an error. A file that cannot be
+// unmarshalled is treated as corrupted and removed. Failures are logged; the
+// function returns nothing.
+func LoadAssetsTexts() {
+	assetsPath := util.GetDataAssetsAbsPath()
+	assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
+	if !gulu.File.IsExist(assetsTextsPath) {
+		return
+	}
+
+	start := time.Now()
+	fh, err := os.OpenFile(assetsTextsPath, os.O_RDWR, 0644)
+	if nil != err {
+		logging.LogErrorf("open assets texts failed: %s", err)
+		return
+	}
+	data, err := io.ReadAll(fh)
+	// Close right away instead of deferring: the file may have to be removed
+	// below, and removing a file that is still open can fail on some platforms.
+	fh.Close()
+	if nil != err {
+		logging.LogErrorf("read assets texts failed: %s", err)
+		return
+	}
+
+	util.AssetsTextsLock.Lock()
+	err = gulu.JSON.UnmarshalJSON(data, &util.AssetsTexts)
+	if nil == util.AssetsTexts {
+		// A JSON null would leave a nil map and later writes would panic.
+		util.AssetsTexts = map[string]string{}
+	}
+	// Release the lock before any early return; returning with it held would
+	// block every later access to the texts.
+	util.AssetsTextsLock.Unlock()
+	if nil != err {
+		logging.LogErrorf("unmarshal assets texts failed: %s", err)
+		if err = os.RemoveAll(assetsTextsPath); nil != err {
+			logging.LogErrorf("removed corrupted assets texts failed: %s", err)
+		}
+		return
+	}
+	debug.FreeOSMemory()
+
+	if elapsed := time.Since(start).Seconds(); 2 < elapsed {
+		logging.LogWarnf("read assets texts [%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), assetsTextsPath, elapsed)
+	}
+}
+
+// SaveAssetsTexts writes util.AssetsTexts to ocr-texts.json in the data
+// assets directory when util.AssetsTextsChanged is set. Failures are logged
+// and leave the texts marked as changed so that a later call retries.
+func SaveAssetsTexts() {
+	start := time.Now()
+
+	util.AssetsTextsLock.Lock()
+	if !util.AssetsTextsChanged {
+		util.AssetsTextsLock.Unlock()
+		return
+	}
+	data, err := gulu.JSON.MarshalIndentJSON(util.AssetsTexts, "", " ")
+	if nil == err {
+		// Clear the flag together with taking the snapshot: a change made
+		// while the file is being written then sets it again and is picked up
+		// by the next call instead of being lost.
+		util.AssetsTextsChanged = false
+	}
+	// Release the lock before any early return; returning with it held would
+	// block every later access to the texts.
+	util.AssetsTextsLock.Unlock()
+	if nil != err {
+		logging.LogErrorf("marshal assets texts failed: %s", err)
+		return
+	}
+
+	assetsPath := util.GetDataAssetsAbsPath()
+	assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
+	if err = gulu.File.WriteFileSafer(assetsTextsPath, data, 0644); nil != err {
+		logging.LogErrorf("write assets texts failed: %s", err)
+		// The snapshot never reached the disk, so keep the texts dirty.
+		util.AssetsTextsLock.Lock()
+		util.AssetsTextsChanged = true
+		util.AssetsTextsLock.Unlock()
+		return
+	}
+	debug.FreeOSMemory()
+
+	if elapsed := time.Since(start).Seconds(); 2 < elapsed {
+		logging.LogWarnf("save assets texts [size=%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), assetsTextsPath, elapsed)
+	}
+}
diff --git a/kernel/treenode/node.go b/kernel/treenode/node.go
index 1a01071f3..c662ca3a2 100644
--- a/kernel/treenode/node.go
+++ b/kernel/treenode/node.go
@@ -18,6 +18,7 @@ package treenode
import (
"bytes"
+ util2 "github.com/siyuan-note/siyuan/kernel/util"
"strings"
"sync"
@@ -31,7 +32,6 @@ import (
"github.com/88250/lute/render"
"github.com/88250/lute/util"
"github.com/siyuan-note/logging"
- util2 "github.com/siyuan-note/siyuan/kernel/util"
)
func GetBlockRef(n *ast.Node) (blockRefID, blockRefText, blockRefSubtype string) {
diff --git a/kernel/util/ocr.go b/kernel/util/ocr.go
deleted file mode 100644
index 722b39def..000000000
--- a/kernel/util/ocr.go
+++ /dev/null
@@ -1,338 +0,0 @@
-// SiYuan - Build Your Eternal Digital Garden
-// Copyright (c) 2020-present, b3log.org
-//
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see .
-
-package util
-
-import (
- "bytes"
- "context"
- "io"
- "os"
- "os/exec"
- "path/filepath"
- "regexp"
- "runtime"
- "runtime/debug"
- "strings"
- "sync"
- "time"
-
- "github.com/88250/gulu"
- "github.com/dustin/go-humanize"
- "github.com/panjf2000/ants/v2"
- "github.com/siyuan-note/logging"
-)
-
-var (
- tesseractEnabled bool
- tesseractLangs []string
- assetsTexts = map[string]string{}
- assetsTextsLock = sync.Mutex{}
- assetsTextsChanged = false
-)
-
-func GetAssetText(asset string) string {
- assetsTextsLock.Lock()
- ret, ok := assetsTexts[asset]
- assetsTextsLock.Unlock()
- if ok {
- return ret
- }
-
- assetsPath := GetDataAssetsAbsPath()
- assetAbsPath := strings.TrimPrefix(asset, "assets")
- assetAbsPath = filepath.Join(assetsPath, assetAbsPath)
- ret = Tesseract(assetAbsPath)
- assetsTextsLock.Lock()
- assetsTexts[asset] = ret
- assetsTextsLock.Unlock()
- return ret
-}
-
-func Tesseract(imgAbsPath string) string {
- if ContainerStd != Container || !tesseractEnabled {
- return ""
- }
-
- info, err := os.Stat(imgAbsPath)
- if nil != err {
- return ""
- }
-
- defer logging.Recover()
-
- ctx, cancel := context.WithTimeout(context.Background(), 7*time.Second)
- defer cancel()
-
- now := time.Now()
- cmd := exec.CommandContext(ctx, "tesseract", "-c", "debug_file=/dev/null", imgAbsPath, "stdout", "-l", strings.Join(tesseractLangs, "+"))
- gulu.CmdAttr(cmd)
- output, err := cmd.CombinedOutput()
- if ctx.Err() == context.DeadlineExceeded {
- logging.LogWarnf("tesseract [path=%s, size=%d] timeout", imgAbsPath, info.Size())
- return ""
- }
-
- if nil != err {
- logging.LogWarnf("tesseract [path=%s, size=%d] failed: %s", imgAbsPath, info.Size(), err)
- return ""
- }
-
- ret := string(output)
- ret = strings.ReplaceAll(ret, "\r", "")
- ret = strings.ReplaceAll(ret, "\n", "")
- ret = strings.ReplaceAll(ret, "\t", " ")
- reg := regexp.MustCompile("\\s{2,}")
- ret = reg.ReplaceAllString(ret, " ")
- logging.LogInfof("tesseract [path=%s, size=%d, text=%s, elapsed=%dms]", imgAbsPath, info.Size(), ret, time.Since(now).Milliseconds())
- return ret
-}
-
-func AutoOCRAssets() {
- if !tesseractEnabled {
- return
- }
-
- for {
- autoOCRAssets()
- time.Sleep(7 * time.Second)
- }
-}
-
-func autoOCRAssets() {
- defer logging.Recover()
-
- assetsPath := GetDataAssetsAbsPath()
- assets := getUnOCRAssetsAbsPaths()
-
- poolSize := runtime.NumCPU()
- if 4 < poolSize {
- poolSize = 4
- }
- waitGroup := &sync.WaitGroup{}
- p, _ := ants.NewPoolWithFunc(poolSize, func(arg interface{}) {
- defer waitGroup.Done()
-
- assetAbsPath := arg.(string)
- text := Tesseract(assetAbsPath)
- p := strings.TrimPrefix(assetAbsPath, assetsPath)
- p = "assets" + filepath.ToSlash(p)
- assetsTextsLock.Lock()
- assetsTexts[p] = text
- assetsTextsLock.Unlock()
- assetsTextsChanged = true
- })
- for _, assetAbsPath := range assets {
- waitGroup.Add(1)
- p.Invoke(assetAbsPath)
- }
- waitGroup.Wait()
- p.Release()
-
- cleanNotFoundAssetsTexts()
-}
-
-func cleanNotFoundAssetsTexts() {
- tmp := assetsTexts
-
- assetsPath := GetDataAssetsAbsPath()
- var toRemoves []string
- for asset, _ := range tmp {
- assetAbsPath := strings.TrimPrefix(asset, "assets")
- assetAbsPath = filepath.Join(assetsPath, assetAbsPath)
- if !gulu.File.IsExist(assetAbsPath) {
- toRemoves = append(toRemoves, asset)
- }
- }
-
- assetsTextsLock.Lock()
- for _, asset := range toRemoves {
- delete(assetsTexts, asset)
- assetsTextsChanged = true
- }
- assetsTextsLock.Unlock()
- return
-}
-
-func getUnOCRAssetsAbsPaths() (ret []string) {
- assetsPath := GetDataAssetsAbsPath()
- var assetsPaths []string
- filepath.Walk(assetsPath, func(path string, info os.FileInfo, err error) error {
- name := info.Name()
- if info.IsDir() {
- if strings.HasPrefix(name, ".") {
- return filepath.SkipDir
- }
- return nil
- }
-
- lowerName := strings.ToLower(name)
- if !strings.HasSuffix(lowerName, ".png") && !strings.HasSuffix(lowerName, ".jpg") && !strings.HasSuffix(lowerName, ".jpeg") {
- return nil
- }
-
- assetsPaths = append(assetsPaths, path)
- return nil
- })
-
- assetsTextsTmp := assetsTexts
- for _, absPath := range assetsPaths {
- p := strings.TrimPrefix(absPath, assetsPath)
- p = "assets" + filepath.ToSlash(p)
- if _, ok := assetsTextsTmp[p]; ok {
- continue
- }
- ret = append(ret, absPath)
- }
- return
-}
-
-func AutoFlushAssetsTexts() {
- for {
- SaveAssetsTexts()
- time.Sleep(7 * time.Second)
- }
-}
-
-func LoadAssetsTexts() {
- assetsPath := GetDataAssetsAbsPath()
- assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
- if !gulu.File.IsExist(assetsTextsPath) {
- return
- }
-
- start := time.Now()
- var err error
- fh, err := os.OpenFile(assetsTextsPath, os.O_RDWR, 0644)
- if nil != err {
- logging.LogErrorf("open assets texts failed: %s", err)
- return
- }
- defer fh.Close()
-
- data, err := io.ReadAll(fh)
- if nil != err {
- logging.LogErrorf("read assets texts failed: %s", err)
- return
- }
-
- assetsTextsLock.Lock()
- if err = gulu.JSON.UnmarshalJSON(data, &assetsTexts); nil != err {
- logging.LogErrorf("unmarshal assets texts failed: %s", err)
- if err = os.RemoveAll(assetsTextsPath); nil != err {
- logging.LogErrorf("removed corrupted assets texts failed: %s", err)
- }
- return
- }
- assetsTextsLock.Unlock()
- debug.FreeOSMemory()
-
- if elapsed := time.Since(start).Seconds(); 2 < elapsed {
- logging.LogWarnf("read assets texts [%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), assetsTextsPath, elapsed)
- }
- return
-}
-
-func SaveAssetsTexts() {
- if !assetsTextsChanged {
- return
- }
-
- start := time.Now()
-
- assetsTextsLock.Lock()
- data, err := gulu.JSON.MarshalIndentJSON(assetsTexts, "", " ")
- if nil != err {
- logging.LogErrorf("marshal assets texts failed: %s", err)
- return
- }
- assetsTextsLock.Unlock()
-
- assetsPath := GetDataAssetsAbsPath()
- assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
- if err = gulu.File.WriteFileSafer(assetsTextsPath, data, 0644); nil != err {
- logging.LogErrorf("write assets texts failed: %s", err)
- return
- }
- debug.FreeOSMemory()
-
- if elapsed := time.Since(start).Seconds(); 2 < elapsed {
- logging.LogWarnf("save assets texts [size=%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), assetsTextsPath, elapsed)
- }
-
- assetsTextsChanged = false
-}
-
-func initTesseract() {
- ver := getTesseractVer()
- if "" == ver {
- return
- }
-
- tesseractLangs = getTesseractLangs()
- if 1 > len(tesseractLangs) {
- logging.LogWarnf("no tesseract langs found")
- tesseractEnabled = false
- return
- }
- logging.LogInfof("tesseract-ocr enabled [ver=%s, langs=%s]", ver, strings.Join(tesseractLangs, "+"))
-}
-
-func getTesseractVer() (ret string) {
- if ContainerStd != Container {
- return
- }
-
- cmd := exec.Command("tesseract", "--version")
- gulu.CmdAttr(cmd)
- data, err := cmd.CombinedOutput()
- if nil == err && strings.HasPrefix(string(data), "tesseract ") {
- parts := bytes.Split(data, []byte("\n"))
- if 0 < len(parts) {
- ret = strings.TrimPrefix(string(parts[0]), "tesseract ")
- ret = strings.TrimSpace(ret)
- tesseractEnabled = true
- }
- return
- }
- return
-}
-
-func getTesseractLangs() (ret []string) {
- if !tesseractEnabled {
- return nil
- }
-
- cmd := exec.Command("tesseract", "--list-langs")
- gulu.CmdAttr(cmd)
- data, err := cmd.CombinedOutput()
- if nil != err {
- return nil
- }
-
- parts := bytes.Split(data, []byte("\n"))
- if 0 < len(parts) {
- parts = parts[1:]
- }
- for _, part := range parts {
- part = bytes.TrimSpace(part)
- if 0 == len(part) {
- continue
- }
- ret = append(ret, string(part))
- }
- return
-}
diff --git a/kernel/util/tesseract.go b/kernel/util/tesseract.go
new file mode 100644
index 000000000..c61625969
--- /dev/null
+++ b/kernel/util/tesseract.go
@@ -0,0 +1,162 @@
+// SiYuan - Build Your Eternal Digital Garden
+// Copyright (c) 2020-present, b3log.org
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see .
+
+package util
+
+import (
+ "bytes"
+ "context"
+ "fmt"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "regexp"
+ "strings"
+ "sync"
+ "time"
+
+ "github.com/88250/gulu"
+ "github.com/siyuan-note/logging"
+)
+
+var (
+ // TesseractEnabled reports whether OCR may be used. getTesseractVer sets it
+ // to true when "tesseract --version" succeeds, and initTesseract resets it
+ // to false when no languages are found.
+ TesseractEnabled bool
+ // AssetsTexts maps an asset path (keys start with "assets") to the text
+ // recognized in that asset. Access it only while holding AssetsTextsLock.
+ AssetsTexts = map[string]string{}
+ // AssetsTextsLock guards AssetsTexts.
+ AssetsTextsLock = sync.Mutex{}
+ // AssetsTextsChanged marks AssetsTexts as modified since it was last saved.
+ // It is exported for the code that persists the texts; nothing in this file
+ // sets it.
+ AssetsTextsChanged = false
+
+ // tesseractLangs holds the languages reported by "tesseract --list-langs";
+ // they are joined with "+" and passed to tesseract via -l.
+ tesseractLangs []string
+)
+
+// GetAssetText returns the text stored in AssetsTexts for the given asset
+// path. When there is no entry yet, the asset is resolved against the data
+// assets directory, passed to Tesseract, and the result is stored in
+// AssetsTexts before being returned.
+func GetAssetText(asset string) string {
+	AssetsTextsLock.Lock()
+	cached, found := AssetsTexts[asset]
+	AssetsTextsLock.Unlock()
+	if found {
+		return cached
+	}
+
+	// OCR runs without the lock held so other lookups are not blocked by it.
+	dataAssetsPath := GetDataAssetsAbsPath()
+	text := Tesseract(filepath.Join(dataAssetsPath, strings.TrimPrefix(asset, "assets")))
+
+	AssetsTextsLock.Lock()
+	AssetsTexts[asset] = text
+	AssetsTextsLock.Unlock()
+	return text
+}
+
+// tesseractSpacesRegexp matches runs of two or more whitespace characters. It
+// is compiled once here instead of on every Tesseract call.
+var tesseractSpacesRegexp = regexp.MustCompile("\\s{2,}")
+
+// Tesseract runs the tesseract command on the image at imgAbsPath and returns
+// the recognized text as a single line: carriage returns and line feeds are
+// removed, tabs become spaces and whitespace runs are collapsed to one space.
+//
+// It returns an empty string when the container is not ContainerStd, when
+// TesseractEnabled is false, when the file cannot be stat'ed, when the
+// command fails, or when it does not finish within seven seconds. On success
+// the result is also logged and pushed to the status bar.
+func Tesseract(imgAbsPath string) string {
+	if ContainerStd != Container || !TesseractEnabled {
+		return ""
+	}
+
+	info, err := os.Stat(imgAbsPath)
+	if nil != err {
+		return ""
+	}
+
+	defer logging.Recover()
+
+	// Bound the run time of the external process.
+	ctx, cancel := context.WithTimeout(context.Background(), 7*time.Second)
+	defer cancel()
+
+	now := time.Now()
+	cmd := exec.CommandContext(ctx, "tesseract", "-c", "debug_file=/dev/null", imgAbsPath, "stdout", "-l", strings.Join(tesseractLangs, "+"))
+	gulu.CmdAttr(cmd)
+	output, err := cmd.CombinedOutput()
+	// Check the deadline first so a killed process is reported as a timeout
+	// rather than as a generic failure.
+	if ctx.Err() == context.DeadlineExceeded {
+		logging.LogWarnf("tesseract [path=%s, size=%d] timeout", imgAbsPath, info.Size())
+		return ""
+	}
+
+	if nil != err {
+		logging.LogWarnf("tesseract [path=%s, size=%d] failed: %s", imgAbsPath, info.Size(), err)
+		return ""
+	}
+
+	ret := string(output)
+	ret = strings.ReplaceAll(ret, "\r", "")
+	ret = strings.ReplaceAll(ret, "\n", "")
+	ret = strings.ReplaceAll(ret, "\t", " ")
+	ret = tesseractSpacesRegexp.ReplaceAllString(ret, " ")
+	logging.LogInfof("tesseract [path=%s, size=%d, text=%s, elapsed=%dms]", imgAbsPath, info.Size(), ret, time.Since(now).Milliseconds())
+	msg := fmt.Sprintf("OCR [%s] [%s]", info.Name(), ret)
+	PushStatusBar(msg)
+	return ret
+}
+
+// initTesseract detects the tesseract installation. It stops when no version
+// can be determined. Otherwise it loads the available languages into
+// tesseractLangs; when none are found it logs a warning and sets
+// TesseractEnabled to false, else it logs the detected version and languages.
+func initTesseract() {
+	version := getTesseractVer()
+	if "" == version {
+		return
+	}
+
+	if tesseractLangs = getTesseractLangs(); 0 == len(tesseractLangs) {
+		logging.LogWarnf("no tesseract langs found")
+		TesseractEnabled = false
+		return
+	}
+
+	logging.LogInfof("tesseract-ocr enabled [ver=%s, langs=%s]", version, strings.Join(tesseractLangs, "+"))
+}
+
+// getTesseractVer runs "tesseract --version" and returns the first output
+// line with its "tesseract " prefix and surrounding whitespace removed. On
+// success it also sets TesseractEnabled to true. It returns an empty string
+// when the container is not ContainerStd, when the command fails, or when
+// the output does not start with "tesseract ".
+func getTesseractVer() (ret string) {
+	if ContainerStd != Container {
+		return ""
+	}
+
+	cmd := exec.Command("tesseract", "--version")
+	gulu.CmdAttr(cmd)
+	output, err := cmd.CombinedOutput()
+	if nil != err || !strings.HasPrefix(string(output), "tesseract ") {
+		return ""
+	}
+
+	// bytes.Split always yields at least one element for a non-empty
+	// separator, so taking the first line is safe.
+	firstLine := bytes.Split(output, []byte("\n"))[0]
+	ret = strings.TrimSpace(strings.TrimPrefix(string(firstLine), "tesseract "))
+	TesseractEnabled = true
+	return ret
+}
+
+// getTesseractLangs runs "tesseract --list-langs" and returns every non-blank
+// output line after the first one, trimmed of surrounding whitespace. It
+// returns nil when TesseractEnabled is false or the command fails.
+func getTesseractLangs() (ret []string) {
+	if !TesseractEnabled {
+		return nil
+	}
+
+	cmd := exec.Command("tesseract", "--list-langs")
+	gulu.CmdAttr(cmd)
+	output, err := cmd.CombinedOutput()
+	if nil != err {
+		return nil
+	}
+
+	// The first line is skipped - presumably the "List of available
+	// languages" header; bytes.Split always yields at least one element, so
+	// slicing from index 1 is safe.
+	lines := bytes.Split(output, []byte("\n"))
+	for _, line := range lines[1:] {
+		if lang := bytes.TrimSpace(line); 0 < len(lang) {
+			ret = append(ret, string(lang))
+		}
+	}
+	return ret
+}