From c0bd645048a03f188dbb9853741f3c2b9a5358f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B4=AE=E7=94=9F?= <admin@shenzilong.cn>
Date: Sun, 16 Jun 2024 22:55:22 +0800
Subject: [PATCH] Kernel API OCR returns text coordinate information (#11738)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 优化setImageOCRText接口调用

* 扩展 ocr 接口，添加 ocrJSON 返回字段

* 过滤不可见字符

* 返回的ocr文本添加空格
---
 app/src/menus/protyle.ts | 16 +++++----
 kernel/api/asset.go      |  8 ++---
 kernel/model/ocr.go      |  2 +-
 kernel/sql/block.go      |  2 +-
 kernel/util/ocr.go       | 71 ++++++++++++++++++++++++++++------------
 5 files changed, 65 insertions(+), 34 deletions(-)

diff --git a/app/src/menus/protyle.ts b/app/src/menus/protyle.ts
index 1e59bb119..cc174de01 100644
--- a/app/src/menus/protyle.ts
+++ b/app/src/menus/protyle.ts
@@ -1019,7 +1019,9 @@ export const imgMenu = (protyle: IProtyle, range: Range, assetElement: HTMLEleme
                     fetchPost("/api/asset/getImageOCRText", {
                         path: imgElement.getAttribute("src")
                     }, (response) => {
-                        element.querySelector("textarea").value = response.data.text;
+                        const textarea = element.querySelector("textarea");
+                        textarea.value = response.data.text;
+                        textarea.dataset.ocrText = response.data.text; // snapshot of the fetched text; removeCB compares dataset.ocrText with the current value
                     });
                 }
             }, {
@@ -1031,11 +1033,6 @@ export const imgMenu = (protyle: IProtyle, range: Range, assetElement: HTMLEleme
                     fetchPost("/api/asset/ocr", {
                         path: imgElement.getAttribute("src"),
                         force: true
-                    }, (response) => {
-                        fetchPost("/api/asset/setImageOCRText", {
-                            path: imgElement.getAttribute("src"),
-                            text: response.data.text
-                        });
                     });
                 }
             }],
@@ -1119,6 +1116,13 @@ export const imgMenu = (protyle: IProtyle, range: Range, assetElement: HTMLEleme
         const textElements = window.siyuan.menus.menu.element.querySelectorAll("textarea");
         textElements[0].focus();
         window.siyuan.menus.menu.removeCB = () => {
+            const ocrElement = window.siyuan.menus.menu.element.querySelector('[data-type="ocr"]') as HTMLTextAreaElement;
+            if (ocrElement && ocrElement.dataset.ocrText !== ocrElement.value) {
+                fetchPost("/api/asset/setImageOCRText", {
+                    path: imgElement.getAttribute("src"),
+                    text: ocrElement.value
+                });
+            }
             imgElement.setAttribute("alt", textElements[2].value.replace(/\n|\r\n|\r|\u2028|\u2029/g, ""));
             nodeElement.setAttribute("updated", dayjs().format("YYYYMMDDHHmmss"));
             updateTransaction(protyle, id, nodeElement.outerHTML, html);
diff --git a/kernel/api/asset.go b/kernel/api/asset.go
index 2b3ad7223..08b11f5e1 100644
--- a/kernel/api/asset.go
+++ b/kernel/api/asset.go
@@ -137,13 +137,11 @@ func ocr(c *gin.Context) {
 	}
 
 	path := arg["path"].(string)
-	force := false
-	if forceArg := arg["force"]; nil != forceArg {
-		force = forceArg.(bool)
-	}
 
+	ocrJSON := util.OcrAsset(path)
 	ret.Data = map[string]interface{}{
-		"text": util.OcrAsset(path, force),
+		"text":    util.GetOcrJsonText(ocrJSON),
+		"ocrJSON": ocrJSON,
 	}
 }
 
diff --git a/kernel/model/ocr.go b/kernel/model/ocr.go
index 61e9c92ed..638c5ca6b 100644
--- a/kernel/model/ocr.go
+++ b/kernel/model/ocr.go
@@ -33,7 +33,7 @@ func autoOCRAssets() {
 	assets := getUnOCRAssetsAbsPaths()
 	if 0 < len(assets) {
 		for i, assetAbsPath := range assets {
-			text := util.Tesseract(assetAbsPath)
+			text := util.GetOcrJsonText(util.Tesseract(assetAbsPath))
 			p := strings.TrimPrefix(assetAbsPath, assetsPath)
 			p = "assets" + filepath.ToSlash(p)
 			util.SetAssetText(p, text)
diff --git a/kernel/sql/block.go b/kernel/sql/block.go
index f6efa0152..9266d82f0 100644
--- a/kernel/sql/block.go
+++ b/kernel/sql/block.go
@@ -198,7 +198,7 @@ func nodeStaticContent(node *ast.Node, excludeTypes []string, includeTextMarkATi
 			var linkDestStr, ocrText string
 			if nil != linkDest {
 				linkDestStr = linkDest.TokensStr()
-				ocrText = util.OcrAsset(linkDestStr, false)
+				ocrText = util.GetAssetText(linkDestStr)
 			}
 
 			linkText := n.ChildByType(ast.NodeLinkText)
diff --git a/kernel/util/ocr.go b/kernel/util/ocr.go
index 3f2e8c3b6..2edb9bb3f 100644
--- a/kernel/util/ocr.go
+++ b/kernel/util/ocr.go
@@ -149,22 +149,16 @@ func ExistsAssetText(asset string) (ret bool) {
 	return
 }
 
-func OcrAsset(asset string, force bool) (ret string) {
-	if !force {
-		assetsTextsLock.Lock()
-		ret = assetsTexts[asset]
-		assetsTextsLock.Unlock()
-		return
-	}
-
+func OcrAsset(asset string) (ret []map[string]interface{}) {
 	assetsPath := GetDataAssetsAbsPath()
 	assetAbsPath := strings.TrimPrefix(asset, "assets")
 	assetAbsPath = filepath.Join(assetsPath, assetAbsPath)
 	ret = Tesseract(assetAbsPath)
 	assetsTextsLock.Lock()
-	assetsTexts[asset] = ret
+	ocrText := GetOcrJsonText(ret)
+	assetsTexts[asset] = ocrText
 	assetsTextsLock.Unlock()
-	if "" != ret {
+	if "" != ocrText {
 		assetsTextsChanged.Store(true)
 	}
 	return
@@ -184,9 +178,9 @@ func IsTesseractExtractable(p string) bool {
 // tesseractOCRLock 用于 Tesseract OCR 加锁串行执行提升稳定性 https://github.com/siyuan-note/siyuan/issues/7265
 var tesseractOCRLock = sync.Mutex{}
 
-func Tesseract(imgAbsPath string) string {
+func Tesseract(imgAbsPath string) (ret []map[string]interface{}) {
 	if ContainerStd != Container || !TesseractEnabled {
-		return ""
+		return
 	}
 
 	defer logging.Recover()
@@ -194,16 +188,16 @@ func Tesseract(imgAbsPath string) string {
 	defer tesseractOCRLock.Unlock()
 
 	if !IsTesseractExtractable(imgAbsPath) {
-		return ""
+		return
 	}
 
 	info, err := os.Stat(imgAbsPath)
 	if nil != err {
-		return ""
+		return
 	}
 
 	if TesseractMaxSize < uint64(info.Size()) {
-		return ""
+		return
 	}
 
 	defer logging.Recover()
@@ -211,24 +205,59 @@ func Tesseract(imgAbsPath string) string {
 	ctx, cancel := context.WithTimeout(context.Background(), 7*time.Second)
 	defer cancel()
 
-	cmd := exec.CommandContext(ctx, TesseractBin, "-c", "debug_file=/dev/null", imgAbsPath, "stdout", "-l", strings.Join(TesseractLangs, "+"))
+	cmd := exec.CommandContext(ctx, TesseractBin, "-c", "debug_file=/dev/null", imgAbsPath, "stdout", "-l", strings.Join(TesseractLangs, "+"), "tsv")
 	gulu.CmdAttr(cmd)
 	output, err := cmd.CombinedOutput()
 	if ctx.Err() == context.DeadlineExceeded {
 		logging.LogWarnf("tesseract [path=%s, size=%d] timeout", imgAbsPath, info.Size())
-		return ""
+		return
 	}
 
 	if nil != err {
 		logging.LogWarnf("tesseract [path=%s, size=%d] failed: %s", imgAbsPath, info.Size(), err)
-		return ""
+		return
 	}
 
-	ret := string(output)
+	// Line endings differ by platform ("\r\n" vs "\n"); normalize so the split below works for both.
+	tsv := strings.ReplaceAll(string(output), "\r\n", "\n")
+	lines := strings.Split(tsv, "\n")
+
+	// The first TSV row holds the column names; every following non-empty row becomes one map.
+	headers := strings.Split(lines[0], "\t")
+	for _, line := range lines[1:] {
+		if line == "" {
+			continue // skip blank lines
+		}
+		fields := strings.Split(line, "\t")
+		// Map each column name to the value found in this row.
+		dataMap := make(map[string]interface{}, len(headers))
+		for i, header := range headers {
+			if i >= len(fields) {
+				break // short row: leave the remaining columns unset instead of indexing out of range
+			}
+			dataMap[header] = fields[i]
+		}
+		ret = append(ret, dataMap)
+	}
+
+	msg := fmt.Sprintf("OCR [%s] [%s]", html.EscapeString(info.Name()), html.EscapeString(GetOcrJsonText(ret)))
+	PushStatusBar(msg)
+	return
+}
+
+// GetOcrJsonText joins the string "text" field of each entry with spaces, then passes the result through gulu.Str.RemoveInvisible and RemoveRedundantSpace.
+func GetOcrJsonText(jsonData []map[string]interface{}) (ret string) {
+	for _, dataMap := range jsonData {
+		// Skip entries that have no "text" key.
+		if text, ok := dataMap["text"]; ok {
+			// Only string values are appended; any other type is ignored.
+			if textStr, ok := text.(string); ok {
+				ret += " " + textStr
+			}
+		}
+	}
 	ret = gulu.Str.RemoveInvisible(ret)
 	ret = RemoveRedundantSpace(ret)
-	msg := fmt.Sprintf("OCR [%s] [%s]", html.EscapeString(info.Name()), html.EscapeString(ret))
-	PushStatusBar(msg)
 	return ret
 }