From 9404330f2319e6307693827d41efb7fdaa96fd29 Mon Sep 17 00:00:00 2001 From: Liang Ding Date: Mon, 6 Feb 2023 17:39:49 +0800 Subject: [PATCH] =?UTF-8?q?:art:=20Tesseract=20OCR=20=E5=8A=A0=E9=94=81?= =?UTF-8?q?=E4=B8=B2=E8=A1=8C=E6=89=A7=E8=A1=8C=E6=8F=90=E5=8D=87=E7=A8=B3?= =?UTF-8?q?=E5=AE=9A=E6=80=A7=20Fix=20https://github.com/siyuan-note/siyua?= =?UTF-8?q?n/issues/7265?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- kernel/model/ocr.go | 25 +++---------------------- kernel/util/tesseract.go | 5 +++++ 2 files changed, 8 insertions(+), 22 deletions(-) diff --git a/kernel/model/ocr.go b/kernel/model/ocr.go index ef18d8f34..8ebeb4f61 100644 --- a/kernel/model/ocr.go +++ b/kernel/model/ocr.go @@ -1,21 +1,18 @@ package model import ( - "github.com/siyuan-note/siyuan/kernel/task" "io" "os" "path/filepath" - "runtime" "runtime/debug" "strings" - "sync" "time" "github.com/88250/gulu" "github.com/dustin/go-humanize" - "github.com/panjf2000/ants/v2" "github.com/siyuan-note/logging" "github.com/siyuan-note/siyuan/kernel/cache" + "github.com/siyuan-note/siyuan/kernel/task" "github.com/siyuan-note/siyuan/kernel/util" ) @@ -33,15 +30,7 @@ func autoOCRAssets() { assetsPath := util.GetDataAssetsAbsPath() assets := getUnOCRAssetsAbsPaths() if 0 < len(assets) { - poolSize := runtime.NumCPU() - if 2 < poolSize { - poolSize = 2 - } - waitGroup := &sync.WaitGroup{} - p, _ := ants.NewPoolWithFunc(poolSize, func(arg interface{}) { - defer waitGroup.Done() - - assetAbsPath := arg.(string) + for i, assetAbsPath := range assets { text := util.Tesseract(assetAbsPath) p := strings.TrimPrefix(assetAbsPath, assetsPath) p = "assets" + filepath.ToSlash(p) @@ -49,19 +38,11 @@ func autoOCRAssets() { util.AssetsTexts[p] = text util.AssetsTextsLock.Unlock() util.AssetsTextsChanged = true - }) - for i, assetAbsPath := range assets { - waitGroup.Add(1) - p.Invoke(assetAbsPath) - - if 63 <= i { // 一次任务中最多处理 64 张图片,防止卡顿 + if 15 <= i { // Process at most 16 images per task run (i is zero-based) to avoid stalls break } } - - waitGroup.Wait() - p.Release() } cleanNotExistAssetsTexts() diff --git a/kernel/util/tesseract.go b/kernel/util/tesseract.go index be6a1132b..05969fb3b 100644 --- a/kernel/util/tesseract.go +++ b/kernel/util/tesseract.go @@ -64,12 +64,17 @@ func IsTesseractExtractable(p string) bool { return strings.HasSuffix(lowerName, ".png") || strings.HasSuffix(lowerName, ".jpg") || strings.HasSuffix(lowerName, ".jpeg") } +// tesseractOCRLock serializes Tesseract OCR runs to improve stability, see https://github.com/siyuan-note/siyuan/issues/7265 +var tesseractOCRLock = sync.Mutex{} + func Tesseract(imgAbsPath string) string { if ContainerStd != Container || !TesseractEnabled { return "" } defer logging.Recover() + tesseractOCRLock.Lock() + defer tesseractOCRLock.Unlock() if !IsTesseractExtractable(imgAbsPath) { return ""