diff --git a/kernel/main.go b/kernel/main.go
index 7f018c110..b162d8882 100644
--- a/kernel/main.go
+++ b/kernel/main.go
@@ -42,7 +42,7 @@ func main() {
 	model.BootSyncData()
 	model.InitBoxes()
 	model.LoadFlashcards()
-	model.LoadAssetsTexts()
+	util.LoadAssetsTexts()
 
 	util.SetBooted()
 	util.PushClearAllMsg()
diff --git a/kernel/model/conf.go b/kernel/model/conf.go
index 104863088..5a78f7850 100644
--- a/kernel/model/conf.go
+++ b/kernel/model/conf.go
@@ -621,7 +621,7 @@ func Close(force, setCurrentWorkspace bool, execInstallPkg int) (exitCode int) {
 	Conf.Close()
 	sql.CloseDatabase()
 	treenode.SaveBlockTree(false)
-	SaveAssetsTexts()
+	util.SaveAssetsTexts()
 	clearWorkspaceTemp()
 	clearCorruptedNotebooks()
 	clearPortJSON()
diff --git a/kernel/model/ocr.go b/kernel/model/ocr.go
index c08236959..61e9c92ed 100644
--- a/kernel/model/ocr.go
+++ b/kernel/model/ocr.go
@@ -2,13 +2,9 @@ package model
 
 import (
 	"path/filepath"
-	"runtime/debug"
 	"strings"
 	"time"
 
-	"github.com/88250/go-humanize"
-	"github.com/88250/gulu"
-	"github.com/siyuan-note/filelock"
 	"github.com/siyuan-note/logging"
 	"github.com/siyuan-note/siyuan/kernel/cache"
 	"github.com/siyuan-note/siyuan/kernel/sql"
@@ -40,19 +36,14 @@ func autoOCRAssets() {
 			text := util.Tesseract(assetAbsPath)
 			p := strings.TrimPrefix(assetAbsPath, assetsPath)
 			p = "assets" + filepath.ToSlash(p)
-			util.AssetsTextsLock.Lock()
-			util.AssetsTexts[p] = text
-			util.AssetsTextsLock.Unlock()
-			if "" != text {
-				util.AssetsTextsChanged.Store(true)
-			}
+			util.SetAssetText(p, text)
 			if 7 <= i { // 一次任务中最多处理 7 张图片,防止长时间占用系统资源
 				break
 			}
 		}
 	}
 
-	cleanNotExistAssetsTexts()
+	util.CleanNotExistAssetsTexts()
 
 	// 刷新 OCR 结果到数据库
 	util.NodeOCRQueueLock.Lock()
@@ -63,27 +54,6 @@ func autoOCRAssets() {
 	util.NodeOCRQueue = nil
 }
 
-func cleanNotExistAssetsTexts() {
-	util.AssetsTextsLock.Lock()
-	defer util.AssetsTextsLock.Unlock()
-
-	assetsPath := util.GetDataAssetsAbsPath()
-	var toRemoves []string
-	for asset, _ := range util.AssetsTexts {
-		assetAbsPath := strings.TrimPrefix(asset, "assets")
-		assetAbsPath = filepath.Join(assetsPath, assetAbsPath)
-		if !filelock.IsExist(assetAbsPath) {
-			toRemoves = append(toRemoves, asset)
-		}
-	}
-
-	for _, asset := range toRemoves {
-		delete(util.AssetsTexts, asset)
-		util.AssetsTextsChanged.Store(true)
-	}
-	return
-}
-
 func getUnOCRAssetsAbsPaths() (ret []string) {
 	var assetsPaths []string
 	assets := cache.GetAssets()
@@ -95,9 +65,8 @@ func getUnOCRAssetsAbsPaths() (ret []string) {
 	}
 
 	assetsPath := util.GetDataAssetsAbsPath()
-	assetsTextsTmp := util.AssetsTexts
 	for _, assetPath := range assetsPaths {
-		if _, ok := assetsTextsTmp[assetPath]; ok {
+		if util.ExistsAssetText(assetPath) {
 			continue
 		}
 		absPath := filepath.Join(assetsPath, strings.TrimPrefix(assetPath, "assets"))
@@ -107,66 +76,5 @@ func getUnOCRAssetsAbsPaths() (ret []string) {
 }
 
 func FlushAssetsTextsJob() {
-	SaveAssetsTexts()
-}
-
-func LoadAssetsTexts() {
-	assetsPath := util.GetDataAssetsAbsPath()
-	assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
-	if !filelock.IsExist(assetsTextsPath) {
-		return
-	}
-
-	start := time.Now()
-	data, err := filelock.ReadFile(assetsTextsPath)
-	if nil != err {
-		logging.LogErrorf("read assets texts failed: %s", err)
-		return
-	}
-
-	util.AssetsTextsLock.Lock()
-	if err = gulu.JSON.UnmarshalJSON(data, &util.AssetsTexts); nil != err {
-		logging.LogErrorf("unmarshal assets texts failed: %s", err)
-		if err = filelock.Remove(assetsTextsPath); nil != err {
-			logging.LogErrorf("removed corrupted assets texts failed: %s", err)
-		}
-		return
-	}
-	util.AssetsTextsLock.Unlock()
-	debug.FreeOSMemory()
-
-	if elapsed := time.Since(start).Seconds(); 2 < elapsed {
-		logging.LogWarnf("read assets texts [%s] to [%s], elapsed [%.2fs]", humanize.BytesCustomCeil(uint64(len(data)), 2), assetsTextsPath, elapsed)
-	}
-	return
-}
-
-func SaveAssetsTexts() {
-	if !util.AssetsTextsChanged.Load() {
-		return
-	}
-
-	start := time.Now()
-
-	util.AssetsTextsLock.Lock()
-	data, err := gulu.JSON.MarshalIndentJSON(util.AssetsTexts, "", " ")
-	if nil != err {
-		logging.LogErrorf("marshal assets texts failed: %s", err)
-		return
-	}
-	util.AssetsTextsLock.Unlock()
-
-	assetsPath := util.GetDataAssetsAbsPath()
-	assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
-	if err = filelock.WriteFile(assetsTextsPath, data); nil != err {
-		logging.LogErrorf("write assets texts failed: %s", err)
-		return
-	}
-	debug.FreeOSMemory()
-
-	if elapsed := time.Since(start).Seconds(); 2 < elapsed {
-		logging.LogWarnf("save assets texts [size=%s] to [%s], elapsed [%.2fs]", humanize.BytesCustomCeil(uint64(len(data)), 2), assetsTextsPath, elapsed)
-	}
-
-	util.AssetsTextsChanged.Store(false)
+	util.SaveAssetsTexts()
 }
diff --git a/kernel/model/repository.go b/kernel/model/repository.go
index 53530a50e..f7cf064e3 100644
--- a/kernel/model/repository.go
+++ b/kernel/model/repository.go
@@ -1451,7 +1451,7 @@ func processSyncMergeResult(exit, byHand bool, mergeResult *dejavu.MergeResult,
 	}
 
 	if needReloadOcrTexts {
-		LoadAssetsTexts()
+		util.LoadAssetsTexts()
 	}
 
 	if needReloadPlugin {
diff --git a/kernel/util/tesseract.go b/kernel/util/ocr.go
similarity index 69%
rename from kernel/util/tesseract.go
rename to kernel/util/ocr.go
index 256be5fd3..b9598a7bc 100644
--- a/kernel/util/tesseract.go
+++ b/kernel/util/ocr.go
@@ -23,6 +23,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"runtime/debug"
 	"strconv"
 	"strings"
 	"sync"
@@ -33,39 +34,140 @@ import (
 	"github.com/88250/gulu"
 	"github.com/88250/lute/ast"
 	"github.com/88250/lute/html"
+	"github.com/siyuan-note/filelock"
 	"github.com/siyuan-note/logging"
 )
 
 var (
-	TesseractBin       = "tesseract"
-	TesseractEnabled   bool
-	TesseractMaxSize   = 2 * 1000 * uint64(1000)
-	AssetsTexts        = map[string]string{}
-	AssetsTextsLock    = sync.Mutex{}
-	AssetsTextsChanged = atomic.Bool{}
+	TesseractBin     = "tesseract"
+	TesseractEnabled bool
+	TesseractMaxSize = 2 * 1000 * uint64(1000)
+	TesseractLangs   []string
 
-	TesseractLangs []string
+	// assetsTexts caches OCR text keyed by asset path (prefixed with "assets").
+	// It is guarded by assetsTextsLock; assetsTextsChanged is the dirty flag
+	// that SaveAssetsTexts checks before writing the cache to disk.
+	assetsTexts        = map[string]string{}
+	assetsTextsLock    = sync.Mutex{}
+	assetsTextsChanged = atomic.Bool{}
 )
 
+// CleanNotExistAssetsTexts removes cached OCR texts whose asset file no
+// longer exists under the data assets directory, marking the cache as
+// changed for every entry it drops.
+func CleanNotExistAssetsTexts() {
+	assetsTextsLock.Lock()
+	defer assetsTextsLock.Unlock()
+
+	assetsPath := GetDataAssetsAbsPath()
+	for asset := range assetsTexts {
+		assetAbsPath := filepath.Join(assetsPath, strings.TrimPrefix(asset, "assets"))
+		if filelock.IsExist(assetAbsPath) {
+			continue
+		}
+
+		// Deleting the current key while ranging over a map is safe in Go.
+		delete(assetsTexts, asset)
+		assetsTextsChanged.Store(true)
+	}
+}
+
+// LoadAssetsTexts reads ocr-texts.json from the data assets directory into
+// the in-memory OCR text cache. A missing file is a no-op; a file that fails
+// to unmarshal is logged and removed.
+func LoadAssetsTexts() {
+	assetsPath := GetDataAssetsAbsPath()
+	assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
+	if !filelock.IsExist(assetsTextsPath) {
+		return
+	}
+
+	start := time.Now()
+	data, err := filelock.ReadFile(assetsTextsPath)
+	if nil != err {
+		logging.LogErrorf("read assets texts failed: %s", err)
+		return
+	}
+
+	// Hold the lock only around the unmarshal so that the error path below
+	// cannot return with assetsTextsLock still locked.
+	assetsTextsLock.Lock()
+	err = gulu.JSON.UnmarshalJSON(data, &assetsTexts)
+	assetsTextsLock.Unlock()
+	if nil != err {
+		logging.LogErrorf("unmarshal assets texts failed: %s", err)
+		if err = filelock.Remove(assetsTextsPath); nil != err {
+			logging.LogErrorf("removed corrupted assets texts failed: %s", err)
+		}
+		return
+	}
+	debug.FreeOSMemory()
+
+	if elapsed := time.Since(start).Seconds(); 2 < elapsed {
+		logging.LogWarnf("read assets texts [%s] to [%s], elapsed [%.2fs]", humanize.BytesCustomCeil(uint64(len(data)), 2), assetsTextsPath, elapsed)
+	}
+}
+
+// SaveAssetsTexts writes the OCR text cache to ocr-texts.json in the data
+// assets directory. It does nothing when the cache is unchanged or Tesseract
+// is disabled.
+func SaveAssetsTexts() {
+	if !assetsTextsChanged.Load() || !TesseractEnabled {
+		return
+	}
+
+	start := time.Now()
+
+	// Clear the dirty flag while the snapshot is taken under the lock: a
+	// concurrent write that lands after the snapshot sets the flag again and
+	// is picked up by the next save instead of being silently dropped.
+	assetsTextsLock.Lock()
+	assetsTextsChanged.Store(false)
+	data, err := gulu.JSON.MarshalIndentJSON(assetsTexts, "", " ")
+	assetsTextsLock.Unlock()
+	if nil != err {
+		assetsTextsChanged.Store(true) // stay dirty so the next flush retries
+		logging.LogErrorf("marshal assets texts failed: %s", err)
+		return
+	}
+
+	assetsPath := GetDataAssetsAbsPath()
+	assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
+	if err = filelock.WriteFile(assetsTextsPath, data); nil != err {
+		assetsTextsChanged.Store(true) // stay dirty so the next flush retries
+		logging.LogErrorf("write assets texts failed: %s", err)
+		return
+	}
+	debug.FreeOSMemory()
+
+	if elapsed := time.Since(start).Seconds(); 2 < elapsed {
+		logging.LogWarnf("save assets texts [size=%s] to [%s], elapsed [%.2fs]", humanize.BytesCustomCeil(uint64(len(data)), 2), assetsTextsPath, elapsed)
+	}
+}
+
+// SetAssetText caches text as the OCR result of asset. The cache is only
+// marked as changed when text is non-empty.
 func SetAssetText(asset, text string) {
-	AssetsTextsLock.Lock()
-	AssetsTexts[asset] = text
-	AssetsTextsLock.Unlock()
-	AssetsTextsChanged.Store(true)
+	assetsTextsLock.Lock()
+	assetsTexts[asset] = text
+	assetsTextsLock.Unlock()
+	if "" != text {
+		assetsTextsChanged.Store(true)
+	}
 }
 
 func ExistsAssetText(asset string) (ret bool) {
-	AssetsTextsLock.Lock()
-	_, ret = AssetsTexts[asset]
-	AssetsTextsLock.Unlock()
+	assetsTextsLock.Lock()
+	_, ret = assetsTexts[asset]
+	assetsTextsLock.Unlock()
 	return
 }
 
 func GetAssetText(asset string, force bool) (ret string) {
 	if !force {
-		AssetsTextsLock.Lock()
-		ret = AssetsTexts[asset]
-		AssetsTextsLock.Unlock()
+		assetsTextsLock.Lock()
+		ret = assetsTexts[asset]
+		assetsTextsLock.Unlock()
 		return
 	}
 
@@ -73,11 +175,11 @@ func GetAssetText(asset string, force bool) (ret string) {
 	assetAbsPath := strings.TrimPrefix(asset, "assets")
 	assetAbsPath = filepath.Join(assetsPath, assetAbsPath)
 	ret = Tesseract(assetAbsPath)
-	AssetsTextsLock.Lock()
-	AssetsTexts[asset] = ret
-	AssetsTextsLock.Unlock()
+	assetsTextsLock.Lock()
+	assetsTexts[asset] = ret
+	assetsTextsLock.Unlock()
 	if "" != ret {
-		AssetsTextsChanged.Store(true)
+		assetsTextsChanged.Store(true)
 	}
 	return
 }