From c4208596e4d0044f073a7197c058e8abf5011189 Mon Sep 17 00:00:00 2001 From: Liang Ding Date: Mon, 16 Jan 2023 22:26:38 +0800 Subject: [PATCH] =?UTF-8?q?:art:=20=E6=A1=8C=E9=9D=A2=E7=AB=AF=E6=94=AF?= =?UTF-8?q?=E6=8C=81=E6=90=9C=E7=B4=A2=E5=9B=BE=E7=89=87=20OCR=20=E6=96=87?= =?UTF-8?q?=E6=9C=AC=20https://github.com/siyuan-note/siyuan/issues/3470?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/appearance/langs/en_US.json | 3 +- app/appearance/langs/es_ES.json | 3 +- app/appearance/langs/fr_FR.json | 3 +- app/appearance/langs/zh_CHT.json | 7 +- app/appearance/langs/zh_CN.json | 3 +- kernel/main.go | 6 +- kernel/mobile/kernel.go | 6 +- kernel/model/assets.go | 1 - kernel/model/conf.go | 2 +- kernel/model/ocr.go | 184 +++++++++++++++++ kernel/treenode/node.go | 2 +- kernel/util/ocr.go | 338 ------------------------------- kernel/util/tesseract.go | 162 +++++++++++++++ 13 files changed, 361 insertions(+), 359 deletions(-) create mode 100644 kernel/model/ocr.go delete mode 100644 kernel/util/ocr.go create mode 100644 kernel/util/tesseract.go diff --git a/app/appearance/langs/en_US.json b/app/appearance/langs/en_US.json index abcf46ab4..3091b6ac9 100644 --- a/app/appearance/langs/en_US.json +++ b/app/appearance/langs/en_US.json @@ -1060,7 +1060,6 @@ "182": "Sharing document, please wait...", "183": "Validating index document tree [%d/%d %s]", "184": "Powered by SiYuan", - "185": "Index verification complete", - "186": "Extracted text [%s] from asset [%s]" + "185": "Index verification complete" } } diff --git a/app/appearance/langs/es_ES.json b/app/appearance/langs/es_ES.json index b91a5e8ba..5903477a6 100644 --- a/app/appearance/langs/es_ES.json +++ b/app/appearance/langs/es_ES.json @@ -1060,7 +1060,6 @@ "182": "Compartiendo documento, por favor espere...", "183": "Validando el árbol del documento de índice [%d/%d %s]", "184": "Con la tecnología de SiYuan", - "185": "Verificación de índice completada", - "186": "Texto extraído [%s] del recurso [%s]" + "185": "Verificación de índice completada" } } diff --git a/app/appearance/langs/fr_FR.json b/app/appearance/langs/fr_FR.json index cacb593c7..8eee2d599 100644 --- a/app/appearance/langs/fr_FR.json +++ b/app/appearance/langs/fr_FR.json @@ -1060,7 +1060,6 @@ "182": "Partage du document, veuillez patienter...", "183": "Validation de l'arborescence du document d'index [%d/%d %s]", "184": "Propulsé par SiYuan", - "185": "Vérification de l'index terminée", - "186": "Texte extrait [%s] de l'actif [%s]" + "185": "Vérification de l'index terminée" } } diff --git a/app/appearance/langs/zh_CHT.json b/app/appearance/langs/zh_CHT.json index 808a410ea..890f68e79 100644 --- a/app/appearance/langs/zh_CHT.json +++ b/app/appearance/langs/zh_CHT.json @@ -31,8 +31,8 @@ "leftRightLayout": "左右佈局", "topBottomLayout": "上下佈局", "keyword": "關鍵字", - "searchMethod":"搜索方式", - "regex":"正則表達式", + "searchMethod": "搜索方式", + "regex": "正則表達式", "keywordsLimit": "關鍵字數量限制", "exportAsImage": "導出為圖片", "exportBySiYuan": "由思源筆記導出", @@ -1060,7 +1060,6 @@ "182": "正在分享文檔,請稍等...", "183": "正在校驗索引文檔樹 [%d/%d %s]", "184": "由思源筆記強力驅動", - "185": "索引校驗完畢", - "186": "已提取資源文件 [%s] 圖片中的文本 [%s]" + "185": "索引校驗完畢" } } diff --git a/app/appearance/langs/zh_CN.json b/app/appearance/langs/zh_CN.json index 3e745d3ea..e1dbecf27 100644 --- a/app/appearance/langs/zh_CN.json +++ b/app/appearance/langs/zh_CN.json @@ -1060,7 +1060,6 @@ "182": "正在分享文档,请稍等...", "183": "正在校验索引文档树 [%d/%d %s]", "184": "由思源笔记强力驱动", - "185": "索引校验完毕", - "186": "已识别资源文件 [%s] 图片中的文本 [%s]" + "185": "索引校验完毕" } } diff --git a/kernel/main.go b/kernel/main.go index cd4df3734..20c1a98f9 100644 --- a/kernel/main.go +++ b/kernel/main.go @@ -40,7 +40,7 @@ func main() { model.BootSyncData() model.InitBoxes() model.InitFlashcards() - util.LoadAssetsTexts() + model.LoadAssetsTexts() go model.AutoGenerateDocHistory() go model.AutoSync() @@ -53,8 +53,8 @@ func main() { go treenode.AutoFlushBlockTree() go cache.LoadAssets() go model.AutoFixIndex() - go util.AutoOCRAssets() - go util.AutoFlushAssetsTexts() + go model.AutoOCRAssets() + go model.AutoFlushAssetsTexts() go model.HookDesktopUIProc() model.WatchAssets() model.HandleSignal() diff --git a/kernel/mobile/kernel.go b/kernel/mobile/kernel.go index bc41becad..fa3ef2690 100644 --- a/kernel/mobile/kernel.go +++ b/kernel/mobile/kernel.go @@ -54,7 +54,7 @@ func StartKernel(container, appDir, workspaceBaseDir, timezoneID, localIPs, lang model.BootSyncData() model.InitBoxes() model.InitFlashcards() - util.LoadAssetsTexts() + model.LoadAssetsTexts() go model.AutoGenerateDocHistory() go model.AutoSync() @@ -67,8 +67,8 @@ func StartKernel(container, appDir, workspaceBaseDir, timezoneID, localIPs, lang go treenode.AutoFlushBlockTree() go cache.LoadAssets() go model.AutoFixIndex() - go util.AutoOCRAssets() - go util.AutoFlushAssetsTexts() + go model.AutoOCRAssets() + go model.AutoFlushAssetsTexts() }() } diff --git a/kernel/model/assets.go b/kernel/model/assets.go index d3f820477..66c0e177b 100644 --- a/kernel/model/assets.go +++ b/kernel/model/assets.go @@ -197,7 +197,6 @@ func NetImg2LocalAssets(rootID string) (err error) { if err = writeJSONQueue(tree); nil != err { return } - sql.WaitForWritingDatabase() util.PushUpdateMsg(msgId, fmt.Sprintf(Conf.Language(120), files), 5000) } else { util.PushUpdateMsg(msgId, Conf.Language(121), 3000) diff --git a/kernel/model/conf.go b/kernel/model/conf.go index 941c585d3..f96e50907 100644 --- a/kernel/model/conf.go +++ b/kernel/model/conf.go @@ -428,7 +428,7 @@ func Close(force bool, execInstallPkg int) (exitCode int) { Conf.Close() sql.CloseDatabase() treenode.SaveBlockTree(false) - util.SaveAssetsTexts() + SaveAssetsTexts() clearWorkspaceTemp() clearPortJSON() util.UnlockWorkspace() diff --git a/kernel/model/ocr.go b/kernel/model/ocr.go new file mode 100644 index 000000000..19cc77dae --- /dev/null +++ b/kernel/model/ocr.go @@ -0,0 +1,184 @@ +package model + +import ( + "github.com/dustin/go-humanize" + "io" + "os" + "path/filepath" + "runtime" + "runtime/debug" + "strings" + "sync" + "time" + + "github.com/88250/gulu" + "github.com/panjf2000/ants/v2" + "github.com/siyuan-note/logging" + "github.com/siyuan-note/siyuan/kernel/cache" + "github.com/siyuan-note/siyuan/kernel/util" +) + +func AutoOCRAssets() { + if !util.TesseractEnabled { + return + } + + for { + autoOCRAssets() + time.Sleep(7 * time.Second) + } +} + +func autoOCRAssets() { + defer logging.Recover() + + assetsPath := util.GetDataAssetsAbsPath() + assets := getUnOCRAssetsAbsPaths() + + poolSize := runtime.NumCPU() + if 4 < poolSize { + poolSize = 4 + } + waitGroup := &sync.WaitGroup{} + p, _ := ants.NewPoolWithFunc(poolSize, func(arg interface{}) { + defer waitGroup.Done() + + assetAbsPath := arg.(string) + text := util.Tesseract(assetAbsPath) + p := strings.TrimPrefix(assetAbsPath, assetsPath) + p = "assets" + filepath.ToSlash(p) + util.AssetsTextsLock.Lock() + util.AssetsTexts[p] = text + util.AssetsTextsLock.Unlock() + util.AssetsTextsChanged = true + }) + for _, assetAbsPath := range assets { + waitGroup.Add(1) + p.Invoke(assetAbsPath) + } + waitGroup.Wait() + p.Release() + + cleanNotFoundAssetsTexts() +} + +func cleanNotFoundAssetsTexts() { + tmp := util.AssetsTexts + + assetsPath := util.GetDataAssetsAbsPath() + var toRemoves []string + for asset, _ := range tmp { + assetAbsPath := strings.TrimPrefix(asset, "assets") + assetAbsPath = filepath.Join(assetsPath, assetAbsPath) + if !gulu.File.IsExist(assetAbsPath) { + toRemoves = append(toRemoves, asset) + } + } + + util.AssetsTextsLock.Lock() + for _, asset := range toRemoves { + delete(util.AssetsTexts, asset) + util.AssetsTextsChanged = true + } + util.AssetsTextsLock.Unlock() + return +} + +func getUnOCRAssetsAbsPaths() (ret []string) { + var assetsPaths []string + assets := cache.GetAssets() + for _, asset := range assets { + lowerName := strings.ToLower(asset.Path) + if !strings.HasSuffix(lowerName, ".png") && !strings.HasSuffix(lowerName, ".jpg") && !strings.HasSuffix(lowerName, ".jpeg") { + continue + } + assetsPaths = append(assetsPaths, asset.Path) + } + + assetsPath := util.GetDataAssetsAbsPath() + assetsTextsTmp := util.AssetsTexts + for _, assetPath := range assetsPaths { + if _, ok := assetsTextsTmp[assetPath]; ok { + continue + } + absPath := filepath.Join(assetsPath, strings.TrimPrefix(assetPath, "assets")) + ret = append(ret, absPath) + } + return +} + +func AutoFlushAssetsTexts() { + for { + SaveAssetsTexts() + time.Sleep(7 * time.Second) + } +} + +func LoadAssetsTexts() { + assetsPath := util.GetDataAssetsAbsPath() + assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json") + if !gulu.File.IsExist(assetsTextsPath) { + return + } + + start := time.Now() + var err error + fh, err := os.OpenFile(assetsTextsPath, os.O_RDWR, 0644) + if nil != err { + logging.LogErrorf("open assets texts failed: %s", err) + return + } + defer fh.Close() + + data, err := io.ReadAll(fh) + if nil != err { + logging.LogErrorf("read assets texts failed: %s", err) + return + } + + util.AssetsTextsLock.Lock() + if err = gulu.JSON.UnmarshalJSON(data, &util.AssetsTexts); nil != err { + logging.LogErrorf("unmarshal assets texts failed: %s", err) + if err = os.RemoveAll(assetsTextsPath); nil != err { + logging.LogErrorf("removed corrupted assets texts failed: %s", err) + } + return + } + util.AssetsTextsLock.Unlock() + debug.FreeOSMemory() + + if elapsed := time.Since(start).Seconds(); 2 < elapsed { + logging.LogWarnf("read assets texts [%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), assetsTextsPath, elapsed) + } + return +} + +func SaveAssetsTexts() { + if !util.AssetsTextsChanged { + return + } + + start := time.Now() + + util.AssetsTextsLock.Lock() + data, err := gulu.JSON.MarshalIndentJSON(util.AssetsTexts, "", " ") + if nil != err { + logging.LogErrorf("marshal assets texts failed: %s", err) + return + } + util.AssetsTextsLock.Unlock() + + assetsPath := util.GetDataAssetsAbsPath() + assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json") + if err = gulu.File.WriteFileSafer(assetsTextsPath, data, 0644); nil != err { + logging.LogErrorf("write assets texts failed: %s", err) + return + } + debug.FreeOSMemory() + + if elapsed := time.Since(start).Seconds(); 2 < elapsed { + logging.LogWarnf("save assets texts [size=%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), assetsTextsPath, elapsed) + } + + util.AssetsTextsChanged = false +} diff --git a/kernel/treenode/node.go b/kernel/treenode/node.go index 1a01071f3..c662ca3a2 100644 --- a/kernel/treenode/node.go +++ b/kernel/treenode/node.go @@ -18,6 +18,7 @@ package treenode import ( "bytes" + util2 "github.com/siyuan-note/siyuan/kernel/util" "strings" "sync" @@ -31,7 +32,6 @@ import ( "github.com/88250/lute/render" "github.com/88250/lute/util" "github.com/siyuan-note/logging" - util2 "github.com/siyuan-note/siyuan/kernel/util" ) func GetBlockRef(n *ast.Node) (blockRefID, blockRefText, blockRefSubtype string) { diff --git a/kernel/util/ocr.go b/kernel/util/ocr.go deleted file mode 100644 index 722b39def..000000000 --- a/kernel/util/ocr.go +++ /dev/null @@ -1,338 +0,0 @@ -// SiYuan - Build Your Eternal Digital Garden -// Copyright (c) 2020-present, b3log.org -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU Affero General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Affero General Public License for more details. -// -// You should have received a copy of the GNU Affero General Public License -// along with this program. If not, see . - -package util - -import ( - "bytes" - "context" - "io" - "os" - "os/exec" - "path/filepath" - "regexp" - "runtime" - "runtime/debug" - "strings" - "sync" - "time" - - "github.com/88250/gulu" - "github.com/dustin/go-humanize" - "github.com/panjf2000/ants/v2" - "github.com/siyuan-note/logging" -) - -var ( - tesseractEnabled bool - tesseractLangs []string - assetsTexts = map[string]string{} - assetsTextsLock = sync.Mutex{} - assetsTextsChanged = false -) - -func GetAssetText(asset string) string { - assetsTextsLock.Lock() - ret, ok := assetsTexts[asset] - assetsTextsLock.Unlock() - if ok { - return ret - } - - assetsPath := GetDataAssetsAbsPath() - assetAbsPath := strings.TrimPrefix(asset, "assets") - assetAbsPath = filepath.Join(assetsPath, assetAbsPath) - ret = Tesseract(assetAbsPath) - assetsTextsLock.Lock() - assetsTexts[asset] = ret - assetsTextsLock.Unlock() - return ret -} - -func Tesseract(imgAbsPath string) string { - if ContainerStd != Container || !tesseractEnabled { - return "" - } - - info, err := os.Stat(imgAbsPath) - if nil != err { - return "" - } - - defer logging.Recover() - - ctx, cancel := context.WithTimeout(context.Background(), 7*time.Second) - defer cancel() - - now := time.Now() - cmd := exec.CommandContext(ctx, "tesseract", "-c", "debug_file=/dev/null", imgAbsPath, "stdout", "-l", strings.Join(tesseractLangs, "+")) - gulu.CmdAttr(cmd) - output, err := cmd.CombinedOutput() - if ctx.Err() == context.DeadlineExceeded { - logging.LogWarnf("tesseract [path=%s, size=%d] timeout", imgAbsPath, info.Size()) - return "" - } - - if nil != err { - logging.LogWarnf("tesseract [path=%s, size=%d] failed: %s", imgAbsPath, info.Size(), err) - return "" - } - - ret := string(output) - ret = strings.ReplaceAll(ret, "\r", "") - ret = strings.ReplaceAll(ret, "\n", "") - ret = strings.ReplaceAll(ret, "\t", " ") - reg := regexp.MustCompile("\\s{2,}") - ret = reg.ReplaceAllString(ret, " ") - logging.LogInfof("tesseract [path=%s, size=%d, text=%s, elapsed=%dms]", imgAbsPath, info.Size(), ret, time.Since(now).Milliseconds()) - return ret -} - -func AutoOCRAssets() { - if !tesseractEnabled { - return - } - - for { - autoOCRAssets() - time.Sleep(7 * time.Second) - } -} - -func autoOCRAssets() { - defer logging.Recover() - - assetsPath := GetDataAssetsAbsPath() - assets := getUnOCRAssetsAbsPaths() - - poolSize := runtime.NumCPU() - if 4 < poolSize { - poolSize = 4 - } - waitGroup := &sync.WaitGroup{} - p, _ := ants.NewPoolWithFunc(poolSize, func(arg interface{}) { - defer waitGroup.Done() - - assetAbsPath := arg.(string) - text := Tesseract(assetAbsPath) - p := strings.TrimPrefix(assetAbsPath, assetsPath) - p = "assets" + filepath.ToSlash(p) - assetsTextsLock.Lock() - assetsTexts[p] = text - assetsTextsLock.Unlock() - assetsTextsChanged = true - }) - for _, assetAbsPath := range assets { - waitGroup.Add(1) - p.Invoke(assetAbsPath) - } - waitGroup.Wait() - p.Release() - - cleanNotFoundAssetsTexts() -} - -func cleanNotFoundAssetsTexts() { - tmp := assetsTexts - - assetsPath := GetDataAssetsAbsPath() - var toRemoves []string - for asset, _ := range tmp { - assetAbsPath := strings.TrimPrefix(asset, "assets") - assetAbsPath = filepath.Join(assetsPath, assetAbsPath) - if !gulu.File.IsExist(assetAbsPath) { - toRemoves = append(toRemoves, asset) - } - } - - assetsTextsLock.Lock() - for _, asset := range toRemoves { - delete(assetsTexts, asset) - assetsTextsChanged = true - } - assetsTextsLock.Unlock() - return -} - -func getUnOCRAssetsAbsPaths() (ret []string) { - assetsPath := GetDataAssetsAbsPath() - var assetsPaths []string - filepath.Walk(assetsPath, func(path string, info os.FileInfo, err error) error { - name := info.Name() - if info.IsDir() { - if strings.HasPrefix(name, ".") { - return filepath.SkipDir - } - return nil - } - - lowerName := strings.ToLower(name) - if !strings.HasSuffix(lowerName, ".png") && !strings.HasSuffix(lowerName, ".jpg") && !strings.HasSuffix(lowerName, ".jpeg") { - return nil - } - - assetsPaths = append(assetsPaths, path) - return nil - }) - - assetsTextsTmp := assetsTexts - for _, absPath := range assetsPaths { - p := strings.TrimPrefix(absPath, assetsPath) - p = "assets" + filepath.ToSlash(p) - if _, ok := assetsTextsTmp[p]; ok { - continue - } - ret = append(ret, absPath) - } - return -} - -func AutoFlushAssetsTexts() { - for { - SaveAssetsTexts() - time.Sleep(7 * time.Second) - } -} - -func LoadAssetsTexts() { - assetsPath := GetDataAssetsAbsPath() - assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json") - if !gulu.File.IsExist(assetsTextsPath) { - return - } - - start := time.Now() - var err error - fh, err := os.OpenFile(assetsTextsPath, os.O_RDWR, 0644) - if nil != err { - logging.LogErrorf("open assets texts failed: %s", err) - return - } - defer fh.Close() - - data, err := io.ReadAll(fh) - if nil != err { - logging.LogErrorf("read assets texts failed: %s", err) - return - } - - assetsTextsLock.Lock() - if err = gulu.JSON.UnmarshalJSON(data, &assetsTexts); nil != err { - logging.LogErrorf("unmarshal assets texts failed: %s", err) - if err = os.RemoveAll(assetsTextsPath); nil != err { - logging.LogErrorf("removed corrupted assets texts failed: %s", err) - } - return - } - assetsTextsLock.Unlock() - debug.FreeOSMemory() - - if elapsed := time.Since(start).Seconds(); 2 < elapsed { - logging.LogWarnf("read assets texts [%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), assetsTextsPath, elapsed) - } - return -} - -func SaveAssetsTexts() { - if !assetsTextsChanged { - return - } - - start := time.Now() - - assetsTextsLock.Lock() - data, err := gulu.JSON.MarshalIndentJSON(assetsTexts, "", " ") - if nil != err { - logging.LogErrorf("marshal assets texts failed: %s", err) - return - } - assetsTextsLock.Unlock() - - assetsPath := GetDataAssetsAbsPath() - assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json") - if err = gulu.File.WriteFileSafer(assetsTextsPath, data, 0644); nil != err { - logging.LogErrorf("write assets texts failed: %s", err) - return - } - debug.FreeOSMemory() - - if elapsed := time.Since(start).Seconds(); 2 < elapsed { - logging.LogWarnf("save assets texts [size=%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), assetsTextsPath, elapsed) - } - - assetsTextsChanged = false -} - -func initTesseract() { - ver := getTesseractVer() - if "" == ver { - return - } - - tesseractLangs = getTesseractLangs() - if 1 > len(tesseractLangs) { - logging.LogWarnf("no tesseract langs found") - tesseractEnabled = false - return - } - logging.LogInfof("tesseract-ocr enabled [ver=%s, langs=%s]", ver, strings.Join(tesseractLangs, "+")) -} - -func getTesseractVer() (ret string) { - if ContainerStd != Container { - return - } - - cmd := exec.Command("tesseract", "--version") - gulu.CmdAttr(cmd) - data, err := cmd.CombinedOutput() - if nil == err && strings.HasPrefix(string(data), "tesseract ") { - parts := bytes.Split(data, []byte("\n")) - if 0 < len(parts) { - ret = strings.TrimPrefix(string(parts[0]), "tesseract ") - ret = strings.TrimSpace(ret) - tesseractEnabled = true - } - return - } - return -} - -func getTesseractLangs() (ret []string) { - if !tesseractEnabled { - return nil - } - - cmd := exec.Command("tesseract", "--list-langs") - gulu.CmdAttr(cmd) - data, err := cmd.CombinedOutput() - if nil != err { - return nil - } - - parts := bytes.Split(data, []byte("\n")) - if 0 < len(parts) { - parts = parts[1:] - } - for _, part := range parts { - part = bytes.TrimSpace(part) - if 0 == len(part) { - continue - } - ret = append(ret, string(part)) - } - return -} diff --git a/kernel/util/tesseract.go b/kernel/util/tesseract.go new file mode 100644 index 000000000..c61625969 --- /dev/null +++ b/kernel/util/tesseract.go @@ -0,0 +1,162 @@ +// SiYuan - Build Your Eternal Digital Garden +// Copyright (c) 2020-present, b3log.org +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +package util + +import ( + "bytes" + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + "regexp" + "strings" + "sync" + "time" + + "github.com/88250/gulu" + "github.com/siyuan-note/logging" +) + +var ( + TesseractEnabled bool + AssetsTexts = map[string]string{} + AssetsTextsLock = sync.Mutex{} + AssetsTextsChanged = false + + tesseractLangs []string +) + +func GetAssetText(asset string) string { + AssetsTextsLock.Lock() + ret, ok := AssetsTexts[asset] + AssetsTextsLock.Unlock() + if ok { + return ret + } + + assetsPath := GetDataAssetsAbsPath() + assetAbsPath := strings.TrimPrefix(asset, "assets") + assetAbsPath = filepath.Join(assetsPath, assetAbsPath) + ret = Tesseract(assetAbsPath) + AssetsTextsLock.Lock() + AssetsTexts[asset] = ret + AssetsTextsLock.Unlock() + return ret +} + +func Tesseract(imgAbsPath string) string { + if ContainerStd != Container || !TesseractEnabled { + return "" + } + + info, err := os.Stat(imgAbsPath) + if nil != err { + return "" + } + + defer logging.Recover() + + ctx, cancel := context.WithTimeout(context.Background(), 7*time.Second) + defer cancel() + + now := time.Now() + cmd := exec.CommandContext(ctx, "tesseract", "-c", "debug_file=/dev/null", imgAbsPath, "stdout", "-l", strings.Join(tesseractLangs, "+")) + gulu.CmdAttr(cmd) + output, err := cmd.CombinedOutput() + if ctx.Err() == context.DeadlineExceeded { + logging.LogWarnf("tesseract [path=%s, size=%d] timeout", imgAbsPath, info.Size()) + return "" + } + + if nil != err { + logging.LogWarnf("tesseract [path=%s, size=%d] failed: %s", imgAbsPath, info.Size(), err) + return "" + } + + ret := string(output) + ret = strings.ReplaceAll(ret, "\r", "") + ret = strings.ReplaceAll(ret, "\n", "") + ret = strings.ReplaceAll(ret, "\t", " ") + reg := regexp.MustCompile("\\s{2,}") + ret = reg.ReplaceAllString(ret, " ") + logging.LogInfof("tesseract [path=%s, size=%d, text=%s, elapsed=%dms]", imgAbsPath, info.Size(), ret, time.Since(now).Milliseconds()) + msg := fmt.Sprintf("OCR [%s] [%s]", info.Name(), ret) + PushStatusBar(msg) + return ret +} + +func initTesseract() { + ver := getTesseractVer() + if "" == ver { + return + } + + tesseractLangs = getTesseractLangs() + if 1 > len(tesseractLangs) { + logging.LogWarnf("no tesseract langs found") + TesseractEnabled = false + return + } + logging.LogInfof("tesseract-ocr enabled [ver=%s, langs=%s]", ver, strings.Join(tesseractLangs, "+")) +} + +func getTesseractVer() (ret string) { + if ContainerStd != Container { + return + } + + cmd := exec.Command("tesseract", "--version") + gulu.CmdAttr(cmd) + data, err := cmd.CombinedOutput() + if nil == err && strings.HasPrefix(string(data), "tesseract ") { + parts := bytes.Split(data, []byte("\n")) + if 0 < len(parts) { + ret = strings.TrimPrefix(string(parts[0]), "tesseract ") + ret = strings.TrimSpace(ret) + TesseractEnabled = true + } + return + } + return +} + +func getTesseractLangs() (ret []string) { + if !TesseractEnabled { + return nil + } + + cmd := exec.Command("tesseract", "--list-langs") + gulu.CmdAttr(cmd) + data, err := cmd.CombinedOutput() + if nil != err { + return nil + } + + parts := bytes.Split(data, []byte("\n")) + if 0 < len(parts) { + parts = parts[1:] + } + for _, part := range parts { + part = bytes.TrimSpace(part) + if 0 == len(part) { + continue + } + ret = append(ret, string(part)) + } + return +}