From daa30de3c98d4a17a44783b03dd740e5906bf302 Mon Sep 17 00:00:00 2001 From: Liang Ding Date: Mon, 16 Jan 2023 15:02:00 +0800 Subject: [PATCH 1/3] =?UTF-8?q?:art:=20=E6=A1=8C=E9=9D=A2=E7=AB=AF?= =?UTF-8?q?=E6=94=AF=E6=8C=81=E6=90=9C=E7=B4=A2=E5=9B=BE=E7=89=87=20OCR=20?= =?UTF-8?q?=E6=96=87=E6=9C=AC=20https://github.com/siyuan-note/siyuan/issu?= =?UTF-8?q?es/3470?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- kernel/util/ocr.go | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/kernel/util/ocr.go b/kernel/util/ocr.go index c0feba419..34791045b 100644 --- a/kernel/util/ocr.go +++ b/kernel/util/ocr.go @@ -31,6 +31,7 @@ import ( "github.com/88250/gulu" "github.com/dustin/go-humanize" + "github.com/panjf2000/ants/v2" "github.com/siyuan-note/logging" ) @@ -68,15 +69,11 @@ func Tesseract(imgAbsPath string) string { output, err := cmd.CombinedOutput() if ctx.Err() == context.DeadlineExceeded { logging.LogWarnf("tesseract [path=%s, size=%d] timeout", imgAbsPath, info.Size()) - assetsTexts[imgAbsPath] = "" - assetsTextsChanged = true return "" } if nil != err { logging.LogWarnf("tesseract [path=%s, size=%d] failed: %s", imgAbsPath, info.Size(), err) - assetsTexts[imgAbsPath] = "" - assetsTextsChanged = true return "" } @@ -87,8 +84,6 @@ func Tesseract(imgAbsPath string) string { reg := regexp.MustCompile("\\s{2,}") ret = reg.ReplaceAllString(ret, " ") logging.LogInfof("tesseract [path=%s, size=%d, text=%s, elapsed=%dms]", imgAbsPath, info.Size(), ret, time.Since(now).Milliseconds()) - assetsTexts[imgAbsPath] = ret - assetsTextsChanged = true return ret } @@ -98,10 +93,30 @@ func AutoOCRAssets() { } for { + assetsPath := GetDataAssetsAbsPath() assets := getUnOCRAssetsAbsPaths() - for _, p := range assets { - Tesseract(p) + + waitGroup := &sync.WaitGroup{} + lock := &sync.Mutex{} + p, _ := ants.NewPoolWithFunc(4, func(arg interface{}) { + defer waitGroup.Done() + + assetAbsPath := arg.(string) + text := Tesseract(assetAbsPath) + p := strings.TrimPrefix(assetAbsPath, assetsPath) + p = "assets" + filepath.ToSlash(p) + lock.Lock() + assetsTexts[p] = text + lock.Unlock() + assetsTextsChanged = true + }) + for _, assetAbsPath := range assets { + waitGroup.Add(1) + p.Invoke(assetAbsPath) } + waitGroup.Wait() + p.Release() + time.Sleep(7 * time.Second) } } From d65bce4a17eb872e7e3f19783bc31139cca9f90b Mon Sep 17 00:00:00 2001 From: Liang Ding Date: Mon, 16 Jan 2023 15:20:26 +0800 Subject: [PATCH 2/3] =?UTF-8?q?:art:=20=E6=A1=8C=E9=9D=A2=E7=AB=AF?= =?UTF-8?q?=E6=94=AF=E6=8C=81=E6=90=9C=E7=B4=A2=E5=9B=BE=E7=89=87=20OCR=20?= =?UTF-8?q?=E6=96=87=E6=9C=AC=20https://github.com/siyuan-note/siyuan/issu?= =?UTF-8?q?es/3470?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- kernel/treenode/node.go | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/kernel/treenode/node.go b/kernel/treenode/node.go index 1078326be..1a01071f3 100644 --- a/kernel/treenode/node.go +++ b/kernel/treenode/node.go @@ -106,20 +106,34 @@ func NodeStaticContent(node *ast.Node, excludeTypes []string) string { } switch n.Type { - case ast.NodeLinkText: - buf.Write(n.Tokens) - - if nil != n.Parent && ast.NodeImage == n.Parent.Type { - destNode := n.Parent.ChildByType(ast.NodeLinkDest) - if nil != destNode { - // 桌面端支持搜索图片 OCR 文本 https://github.com/siyuan-note/siyuan/issues/3470 - if text := util2.GetAssetText(destNode.TokensStr()); "" != text { - buf.WriteByte(' ') - buf.WriteString(text) - } - } + case ast.NodeImage: + linkDest := n.ChildByType(ast.NodeLinkDest) + var linkDestStr, ocrText string + if nil != linkDest { + linkDestStr = linkDest.TokensStr() + ocrText = util2.GetAssetText(linkDestStr) } + linkText := n.ChildByType(ast.NodeLinkText) + if nil != linkText { + buf.Write(linkText.Tokens) + buf.WriteByte(' ') + } + if "" != ocrText { + buf.WriteString(ocrText) + buf.WriteByte(' ') + } + if nil != linkDest { + buf.Write(n.Tokens) + buf.WriteByte(' ') + + } + if linkTitle := n.ChildByType(ast.NodeLinkTitle); nil != linkTitle { + buf.Write(linkTitle.Tokens) + } + return ast.WalkSkipChildren + case ast.NodeLinkText: + buf.Write(n.Tokens) buf.WriteByte(' ') case ast.NodeLinkDest: buf.Write(n.Tokens) From 5593542f73b65749e7c5a250a9ed6aec4d02cfc2 Mon Sep 17 00:00:00 2001 From: Liang Ding Date: Mon, 16 Jan 2023 15:39:31 +0800 Subject: [PATCH 3/3] =?UTF-8?q?:art:=20=E6=A1=8C=E9=9D=A2=E7=AB=AF?= =?UTF-8?q?=E6=94=AF=E6=8C=81=E6=90=9C=E7=B4=A2=E5=9B=BE=E7=89=87=20OCR=20?= =?UTF-8?q?=E6=96=87=E6=9C=AC=20https://github.com/siyuan-note/siyuan/issu?= =?UTF-8?q?es/3470?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- kernel/util/ocr.go | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/kernel/util/ocr.go b/kernel/util/ocr.go index 34791045b..b5d9f353f 100644 --- a/kernel/util/ocr.go +++ b/kernel/util/ocr.go @@ -42,10 +42,22 @@ var ( assetsTextsChanged = false ) -func GetAssetText(assets string) string { +func GetAssetText(asset string) string { assetsTextsLock.Lock() - defer assetsTextsLock.Unlock() - return assetsTexts[assets] + ret, ok := assetsTexts[asset] + assetsTextsLock.Unlock() + if ok { + return ret + } + + assetsPath := GetDataAssetsAbsPath() + assetAbsPath := strings.TrimPrefix(asset, "assets") + assetAbsPath = filepath.Join(assetsPath, assetAbsPath) + ret = Tesseract(assetAbsPath) + assetsTextsLock.Lock() + assetsTexts[asset] = ret + assetsTextsLock.Unlock() + return ret } func Tesseract(imgAbsPath string) string {